Skip to content
Snippets Groups Projects
Commit 998167aa authored by Benoît Harrault's avatar Benoît Harrault
Browse files

Merge branch '65-improve-get-images-from-internet-process' into 'master'

Resolve "Improve "get images from internet" process"

Closes #65

See merge request !68
parents 67d01904 b6e40e09
No related branches found
No related tags found
1 merge request: !68 Resolve "Improve "get images from internet" process"
Pipeline #4858 passed
#!/usr/bin/env bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail
# Required external tools: abort early, with a message on stderr, if one of
# them is missing from PATH.
command -v convert >/dev/null 2>&1 || { echo >&2 "I require convert (imagemagick) but it's not installed. Aborting."; exit 1; }
command -v jq >/dev/null 2>&1 || { echo >&2 "I require jq (json parser) but it's not installed. Aborting."; exit 1; }
# Absolute path of the directory containing this script, and its parent.
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
BASE_DIR="$(dirname "${CURRENT_DIR}")"
# Folder created below with `mkdir -p`; presumably where downloaded images are
# cached (confirm against the rest of the script).
IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
# JSON words file located under the parent directory's assets.
INPUT_WORDS_LIST="${BASE_DIR}/assets/files/words.json"
# CSV source file for words
SOURCE_CSV_FILE="${CURRENT_DIR}/words.csv"
# Caps the number of words kept from the CSV file (used with `head -n`).
MAX_RANDOM_WORDS_TO_DOWNLOAD=100
# Caps the number of thumbnail URLs returned by get_images_from_ddg.
MAX_IMAGES_PER_WORD=5
# Images variants
# Comma-separated list; the leading comma yields an empty first entry
# (presumably "keyword alone, without variant" - confirm in the main loop).
KEYWORD_VARIANTS=",image,picture,drawing,black and white,painting,icon"
# Sent as the `count` parameter of the QWANT query URL.
# NOTE(review): "WEYWORD" looks like a typo for "KEYWORD"; the name is used
# elsewhere in the script, so renaming must be done everywhere at once.
COUNT_BY_WEYWORD_VARIANT=3
# NOTE(review): same assignment as the one a few lines above; redundant.
IMAGES_CACHE_FOLDER="${CURRENT_DIR}/cache/download"
mkdir -p "${IMAGES_CACHE_FOLDER}"
# Same variants, one per line.
KEYWORD_VARIANTS_LIST="$(echo "${KEYWORD_VARIANTS}" | tr "," "\n")"
# QWANT parameters
# NOTE(review): LANG is also the standard locale environment variable; if it
# is exported in the caller's environment, this assignment changes the locale
# seen by child processes (sort, tr, ...) - confirm this is intended.
LANG="fr_FR"
TYPE="images"
OFFSET=0
BASE_URL="https://api.qwant.com/api/search/${TYPE}"
#######################################
# Search DuckDuckGo images for a query and print the thumbnail URLs found.
#
# The search page is fetched first; the path of its result script ("d.js") is
# extracted from the "initialize" call, switched to the image endpoint
# ("i.js"), and that endpoint's response is scanned for "thumbnail" entries.
#
# Best effort: any network or parsing failure yields empty output and a zero
# status, so callers can simply test the captured output for emptiness.
#
# Globals:   MAX_IMAGES_PER_WORD (read) - maximum number of URLs printed
# Arguments: $1 - search query (may contain spaces)
# Outputs:   thumbnail URLs on stdout, one per line; diagnostics on stderr
# Returns:   0
#######################################
get_images_from_ddg() {
  local -r query_string="$1"
  local -r ddg_url="https://duckduckgo.com"
  # Declared separately from the assignments so that command statuses are not
  # masked by `local`.
  local search_page json_path results thumbnails

  # Let curl percent-encode the query: callers pass keywords followed by
  # variants such as "black and white", and raw spaces are not valid in a URL.
  search_page="$(curl --silent --get \
    --data "t=ffab" \
    --data-urlencode "q=${query_string}" \
    --data "atb=v301-1" \
    --data "iax=images" \
    --data "ia=images" \
    "${ddg_url}/")" || {
    echo >&2 "get_images_from_ddg: search request failed for '${query_string}'"
    return 0
  }

  # Only the first "initialize" match is kept, so the path is a single line.
  # `|| true`: grep finding nothing (or an early-closed pipe under pipefail)
  # is a normal "no result" outcome, not an error.
  json_path="$(printf '%s\n' "${search_page}" \
    | tr ' ' '\n' \
    | grep -m 1 "initialize" \
    | cut -d';' -f2 \
    | cut -d"'" -f2 \
    | sed 's|/d\.js|/i.js|g')" || true

  # Nothing to query: avoid a pointless request to the bare site URL.
  if [[ -z "${json_path}" ]]; then
    return 0
  fi

  results="$(curl --silent "${ddg_url}${json_path}")" || {
    echo >&2 "get_images_from_ddg: results request failed for '${query_string}'"
    return 0
  }

  # One token per line, then keep the 4th '"'-delimited field of each
  # '"thumbnail":' line, i.e. the URL value.
  thumbnails="$(printf '%s\n' "${results}" \
    | tr ' ,' '\n\n' \
    | grep '"thumbnail":' \
    | cut -d '"' -f4 \
    | head -n "${MAX_IMAGES_PER_WORD}")" || true

  printf '%s\n' "${thumbnails}"
}
# Word list taken from the JSON file: values of lines ending with
# `"fr": "<WORD>",` where <WORD> is made of A-Z and "-", de-duplicated and
# shuffled.
# NOTE(review): this value is overwritten by the next statement before it is
# ever read; the line looks like a leftover - confirm and remove. Under
# errexit + pipefail it can also abort the script when grep matches nothing.
WORDS_LIST="$(cat "${INPUT_WORDS_LIST}" | grep -E '\"fr\": \"[A-Z\-]+\",$' | cut -d'"' -f4 | sort | uniq | sort -R)"
# Word list actually used below: first ';'-separated field of each CSV line,
# de-duplicated, shuffled, and capped at MAX_RANDOM_WORDS_TO_DOWNLOAD entries.
WORDS_LIST="$(cat "${SOURCE_CSV_FILE}" | cut -d';' -f1 | sort | uniq | sort -R | head -n ${MAX_RANDOM_WORDS_TO_DOWNLOAD})"
while read -r KEYWORD; do
if [[ -n "${KEYWORD}" ]]; then
echo "KEYWORD: ${KEYWORD}"
......@@ -33,17 +51,7 @@ while read -r KEYWORD; do
QUERY_STRING="${QUERY_STRING} ${VARIANT}"
fi
# Get QWANT API query from keyword
QUERY="$(echo "${QUERY_STRING}" | tr "A-Z" "a-z" | sed 's| |%20|g')"
QUERY_URL="${BASE_URL}?count=${COUNT_BY_WEYWORD_VARIANT}&q=${QUERY}&t=${TYPE}&safesearch=1&offset=${OFFSET}&locale=${LANG}&uiv=4"
# echo "QUERY_URL: ${QUERY_URL}"
# Get QWANT thumbnails urls from keyword
URL_LIST="$(curl "${QUERY_URL}" \
--silent \
-H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0' \
--compressed \
| jq | grep "media_preview" | cut -d'"' -f4 | sed 's|^//||g')"
URL_LIST="$(get_images_from_ddg "${QUERY_STRING}")"
if [[ -z "${URL_LIST}" ]]; then
echo " No image found..."
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment