Update dialect downloader

This commit is contained in:
2021-05-25 23:23:46 +01:00
parent 15d2215197
commit 5b1a616f74

View File

@@ -1,35 +1,70 @@
#! /bin/bash
#
# Download every dialect recording (MP3) linked from dialectsarchive.com.
#
# The site is walked in three levels:
#   continent page -> country pages -> individual dialect pages
# and each recording is saved relative to the current directory as
#   <continent>/<country>/<dialect-id>: <speaker info>.mp3
#
# Requires: wget, sed and GNU grep (for -P).
# Failures to fetch a page or a recording are reported on stderr and skipped
# so that one broken page does not abort the whole run.

set -u

url="https://www.dialectsarchive.com"

# Continent slugs exactly as they appear in the site's URLs ($url/<slug>).
continents="africa
asia
australia-oceania
caribbean
central-america
europe
middle-east
north-america
south-america"

# Print a warning on stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Write the body of the page at URL $1 to stdout.
# Returns non-zero (after warning) when the page cannot be fetched.
fetch_page() {
  wget -q -O - -- "$1" || {
    warn "could not fetch $1"
    return 1
  }
}

# The lists below are newline-separated, so they are consumed with
# `while IFS= read -r` instead of changing the global IFS mid-script.
while IFS= read -r continent; do
  [[ -n "$continent" ]] || continue

  countries_html="$(fetch_page "$url/$continent")" || continue

  # Country links live between the "Please select a ..." prompt and the
  # following "clear" marker on the continent page.
  country_urls="$(
    printf '%s\n' "$countries_html" \
      | sed -n -e '/Please select a/,/clear/{p;}' \
      | grep -io --color=never "$url/[^\"]*"
  )"

  while IFS= read -r country_url; do
    [[ -n "$country_url" ]] || continue

    country_html="$(fetch_page "$country_url")" || continue
    country="$(basename -- "$country_url")"

    # One line per dialect sample: a link to "<country>-<number>" followed by
    # free text that mentions the speaker's sex ("male"/"female").
    dialect_lines="$(
      printf '%s\n' "$country_html" \
        | grep -iP --color=never "$url/$country-\d+\">[^>]*>.*?(fe)?male"
    )"

    directory="$continent/$country"
    mkdir -p -- "$directory" || continue

    while IFS= read -r dialect_line; do
      [[ -n "$dialect_line" ]] || continue

      # Take exactly the dialect-page link that the selector above matched,
      # not any other URL that happens to share the line.
      dialect_url="$(
        printf '%s\n' "$dialect_line" \
          | grep -ioE --color=never "$url/$country-[0-9]+" \
          | head -n 1
      )"
      [[ -n "$dialect_url" ]] || continue

      # Text after the link, trimmed of surrounding non-word characters; any
      # "/" is replaced so the text is safe to use inside a file name.
      dialect_info="$(
        printf '%s\n' "$dialect_line" \
          | grep -Po --color=never "(?<=</a>)[^<]+(?=<)" \
          | head -n 1 \
          | sed -e 's/\W*$//' -e 's/^\W*//' -e 's/\//_/g'
      )"

      dialect_page_html="$(fetch_page "$dialect_url")" || continue

      # First MP3 link on the dialect page is the recording itself.
      dialect_download="$(
        printf '%s\n' "$dialect_page_html" \
          | grep -io 'https[^"]*\.mp3' \
          | head -n 1
      )"
      if [[ -z "$dialect_download" ]]; then
        warn "no MP3 link found on $dialect_url"
        continue
      fi

      filename="$(basename -- "$dialect_url"): $dialect_info.mp3"
      echo "Downloading $filename to $directory"

      # Fetch the MP3 itself (previously the HTML page URL was downloaded and
      # saved under an .mp3 name). wget -O creates the target even when the
      # transfer fails, so remove the stub in that case.
      if ! wget -q -O "$directory/$filename" -- "$dialect_download"; then
        warn "could not download $dialect_download"
        rm -f -- "$directory/$filename"
      fi
    done <<< "$dialect_lines"
  done <<< "$country_urls"
done <<< "$continents"