diff --git a/scripts/01_download_images.sh b/scripts/01_download_images.sh
index 196d1b599a2be20e36f1b0e364915dd0cfbdd9cb..b3a802dae89dba9547ef59c7b6faff5ce74e6e15 100755
--- a/scripts/01_download_images.sh
+++ b/scripts/01_download_images.sh
@@ -1,26 +1,55 @@
 #!/usr/bin/env bash
 
+set -o errexit
+set -o nounset
+set -o pipefail
+
 command -v convert >/dev/null 2>&1 || { echo >&2 "I require convert (imagemagick) but it's not installed. Aborting."; exit 1; }
 command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq (json parser) but it's not installed. Aborting."; exit 1; }
 
 CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
 BASE_DIR="$(dirname "${CURRENT_DIR}")"
 
-IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
-INPUT_WORDS_LIST="${BASE_DIR}/assets/files/words.json"
+# CSV source file for words
+SOURCE_CSV_FILE="${CURRENT_DIR}/words.csv"
+MAX_RANDOM_WORDS_TO_DOWNLOAD=100
+MAX_IMAGES_PER_WORD=5
 
 # Images variants
 KEYWORD_VARIANTS=",image,picture,drawing,black and white,painting,icon"
-COUNT_BY_WEYWORD_VARIANT=3
+
+IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
+mkdir -p "${IMAGES_CACHE_FOLDER}"
+
 KEYWORD_VARIANTS_LIST="$(echo "${KEYWORD_VARIANTS}" | tr "," "\n")"
 
-# QWANT parameters
-LANG="fr_FR"
-TYPE="images"
-OFFSET=0
-BASE_URL="https://api.qwant.com/api/search/${TYPE}"
+#######################################
+# Print thumbnail URLs found by a DuckDuckGo image search, one per line.
+# Globals:   MAX_IMAGES_PER_WORD (read) - maximum number of URLs printed
+# Arguments: $1 - search terms (may contain spaces)
+# Outputs:   thumbnail URLs on stdout; nothing when none are found
+#######################################
+get_images_from_ddg() {
+  local -r QUERY_STRING="$1"
+  local -r BASE_URL="https://duckduckgo.com"
+
+  # Percent-encode the terms: variants such as "black and white" contain spaces.
+  local -r ENCODED_QUERY="$(printf '%s' "${QUERY_STRING}" | jq --slurp --raw-input --raw-output '@uri')"
+
+  # The results path is scraped from the page's "initialize" call, with d.js swapped for i.js.
+  local -r SEARCH_URL="${BASE_URL}/?t=ffab&q=${ENCODED_QUERY}&atb=v301-1&iax=images&ia=images"
+  local -r RESULT="$(curl --silent "${SEARCH_URL}")"
+  local -r JSON_URL="${BASE_URL}$(echo "${RESULT}" | sed 's/ /\n/g' | grep "initialize" | cut -d';' -f2 | cut -d "'" -f2 | sed 's/\/d\.js/\/i.js/g')"
+
+  # "local" masks pipeline failures here, so no match just prints an empty list for the caller.
+  local -r RESULTS="$(curl --silent "${JSON_URL}")"
+  local -r THUMBNAILS_URLS="$(echo "${RESULTS}" | sed 's/ /\n/g' | sed 's/,/\n/g' | grep '"thumbnail":' | cut -d '"' -f4 | head -n "${MAX_IMAGES_PER_WORD}")"
+  echo "${THUMBNAILS_URLS}"
+}
 
-WORDS_LIST="$(cat "${INPUT_WORDS_LIST}" | grep -E '\"fr\": \"[A-Z\-]+\",$' | cut -d'"' -f4 | sort | uniq | sort -R)"
+# awk (not head) truncates the list: head exits early, which can kill sort with
+# SIGPIPE and, under pipefail + errexit, abort the whole script.
+WORDS_LIST="$(cut -d';' -f1 "${SOURCE_CSV_FILE}" | sort | uniq | sort -R | awk -v max="${MAX_RANDOM_WORDS_TO_DOWNLOAD}" 'NR <= max')"
 while read -r KEYWORD; do
   if [[ -n "${KEYWORD}" ]]; then
     echo "KEYWORD: ${KEYWORD}"
@@ -33,17 +62,7 @@ while read -r KEYWORD; do
         QUERY_STRING="${QUERY_STRING} ${VARIANT}"
       fi
 
-      # Get QWANT API query from keyword
-      QUERY="$(echo "${QUERY_STRING}" | tr "A-Z" "a-z" | sed 's| |%20|g')"
-      QUERY_URL="${BASE_URL}?count=${COUNT_BY_WEYWORD_VARIANT}&q=${QUERY}&t=${TYPE}&safesearch=1&offset=${OFFSET}&locale=${LANG}&uiv=4"
-      # echo "QUERY_URL: ${QUERY_URL}"
-
-      # Get QWANT thumbnails urls from keyword
-      URL_LIST="$(curl "${QUERY_URL}" \
-        --silent \
-        -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0' \
-        --compressed \
-        | jq | grep "media_preview" | cut -d'"' -f4 | sed 's|^//||g')"
+      URL_LIST="$(get_images_from_ddg "${QUERY_STRING}")"
 
       if [[ -z "${URL_LIST}" ]]; then
         echo " No image found..."