Skip to content
Snippets Groups Projects
Commit b6e40e09 authored by Benoît Harrault's avatar Benoît Harrault
Browse files

Improve get images from internet script (via DuckDuckGo)

parent 67d01904
No related branches found
No related tags found
1 merge request: !68 Resolve "Improve 'get images from internet' process"
Pipeline #4855 passed
#!/usr/bin/env bash
# Setup section: strict mode, required tools, paths and tunables used by the
# image download logic further down.
# Strict mode: abort on an unhandled failing command, on use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail
# Required external tools; the script stops with a message on stderr if one is missing.
command -v convert >/dev/null 2>&1 || { echo >&2 "I require convert (imagemagick) but it's not installed. Aborting."; exit 1; }
command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq (json parser) but it's not installed. Aborting."; exit 1; }
# Absolute path of the directory containing this script, and its parent directory.
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
BASE_DIR="$(dirname "${CURRENT_DIR}")"
# Folder receiving downloaded images (created below with mkdir -p).
IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
# JSON words file located under the parent directory's assets.
INPUT_WORDS_LIST="${BASE_DIR}/assets/files/words.json"
# CSV source file for words, stored next to this script.
SOURCE_CSV_FILE="${CURRENT_DIR}/words.csv"
# Upper bound on how many words are kept from the shuffled CSV list.
MAX_RANDOM_WORDS_TO_DOWNLOAD=100
# Upper bound on thumbnail URLs returned per search by get_images_from_ddg.
MAX_IMAGES_PER_WORD=5
# Comma-separated search variants. The leading comma yields an empty first
# entry once split (presumably meaning "the bare keyword" -- confirm in the loop).
KEYWORD_VARIANTS=",image,picture,drawing,black and white,painting,icon"
# Result count passed as "count" in the Qwant query URL.
# NOTE(review): "WEYWORD" looks like a typo for "KEYWORD"; renaming requires
# updating every reference to this variable.
COUNT_BY_WEYWORD_VARIANT=3
# NOTE(review): same value as the IMAGES_CACHE_FOLDER assignment above; one of
# the two is redundant.
IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
mkdir -p "${IMAGES_CACHE_FOLDER}"
# Same variants, one per line, so they can be consumed line by line.
KEYWORD_VARIANTS_LIST="$(echo "${KEYWORD_VARIANTS}" | tr "," "\n")"
# QWANT parameters (locale, search type, result offset and API endpoint).
# NOTE(review): LANG is also the standard locale variable; if it is exported in
# the calling environment, this assignment changes the locale seen by every
# child process -- confirm this is intended or rename the variable.
LANG="fr_FR"
TYPE="images"
OFFSET=0
BASE_URL="https://api.qwant.com/api/search/${TYPE}"
#######################################
# Look up image thumbnails for a search phrase on DuckDuckGo.
#
# Two requests are made: the HTML search page, then the document whose path
# is found next to "initialize" in that page (with d.js swapped for i.js).
# Thumbnail URLs are read from the "thumbnail" entries of that document.
#
# Globals:   MAX_IMAGES_PER_WORD (read) - maximum number of URLs printed
# Arguments: $1 - search phrase, unencoded (may contain spaces)
# Outputs:   thumbnail URLs on stdout, one per line; an empty line when
#            nothing was found
# Returns:   always 0, so a failed lookup never aborts a caller running
#            under errexit; callers detect failure through empty output
#######################################
get_images_from_ddg() {
  local -r QUERY_STRING="$1"
  local -r BASE_URL="https://duckduckgo.com"
  # Declared apart from the assignments: `local x="$(cmd)"` reports the status
  # of `local` itself and hides the status of the command substitution.
  local RESULT JSON_PATH RESULTS THUMBNAILS_URLS

  # --get with --data-urlencode makes curl percent-encode the phrase; the
  # caller joins keyword and variant with spaces, which are invalid in a raw URL.
  RESULT="$(curl --silent --get \
    --data 't=ffab' \
    --data-urlencode "q=${QUERY_STRING}" \
    --data 'atb=v301-1' \
    --data 'iax=images' \
    --data 'ia=images' \
    "${BASE_URL}/")" || RESULT=""

  # One token per line, keep the one mentioning "initialize", then take the
  # first single-quoted string of its second ';'-separated statement.
  JSON_PATH="$(printf '%s\n' "${RESULT}" \
    | tr ' ' '\n' \
    | grep "initialize" \
    | cut -d';' -f2 \
    | cut -d "'" -f2 \
    | sed 's/\/d\.js/\/i.js/g')" || true

  # Nothing usable in the page: skip the second request, report "no result".
  if [[ -z "${JSON_PATH}" ]]; then
    printf '\n'
    return 0
  fi

  RESULTS="$(curl --silent "${BASE_URL}${JSON_PATH}")" || RESULTS=""

  # `|| true`: under pipefail a grep without match (or head closing the pipe
  # early) yields a non-zero status that must not leak to the caller.
  THUMBNAILS_URLS="$(printf '%s\n' "${RESULTS}" \
    | tr ' ,' '\n\n' \
    | grep '"thumbnail":' \
    | cut -d '"' -f4 \
    | head -n "${MAX_IMAGES_PER_WORD}")" || true

  printf '%s\n' "${THUMBNAILS_URLS}"
  return 0
}
# Build the shuffled list of words to look up.
# NOTE(review): WORDS_LIST is assigned twice in a row, so the JSON-derived list
# below is discarded immediately. Its pipeline still runs: with errexit and
# pipefail active, a grep without match or a missing file here would stop the
# script. Presumably this first line is the superseded version -- confirm and drop it.
WORDS_LIST="$(cat "${INPUT_WORDS_LIST}" | grep -E '\"fr\": \"[A-Z\-]+\",$' | cut -d'"' -f4 | sort | uniq | sort -R)"
# Effective list: first ';'-separated column of the CSV, de-duplicated,
# shuffled (sort -R) and capped at MAX_RANDOM_WORDS_TO_DOWNLOAD entries.
WORDS_LIST="$(cat "${SOURCE_CSV_FILE}" | cut -d';' -f1 | sort | uniq | sort -R | head -n ${MAX_RANDOM_WORDS_TO_DOWNLOAD})"
while read -r KEYWORD; do
if [[ -n "${KEYWORD}" ]]; then
echo "KEYWORD: ${KEYWORD}"
......@@ -33,17 +51,7 @@ while read -r KEYWORD; do
QUERY_STRING="${QUERY_STRING} ${VARIANT}"
fi
# Get QWANT API query from keyword
QUERY="$(echo "${QUERY_STRING}" | tr "A-Z" "a-z" | sed 's| |%20|g')"
QUERY_URL="${BASE_URL}?count=${COUNT_BY_WEYWORD_VARIANT}&q=${QUERY}&t=${TYPE}&safesearch=1&offset=${OFFSET}&locale=${LANG}&uiv=4"
# echo "QUERY_URL: ${QUERY_URL}"
# Get QWANT thumbnails urls from keyword
URL_LIST="$(curl "${QUERY_URL}" \
--silent \
-H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0' \
--compressed \
| jq | grep "media_preview" | cut -d'"' -f4 | sed 's|^//||g')"
URL_LIST="$(get_images_from_ddg "${QUERY_STRING}")"
if [[ -z "${URL_LIST}" ]]; then
echo " No image found..."
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment