Update dialect downloader
This commit is contained in:
@@ -1,35 +1,70 @@
|
||||
#! /bin/bash
|
||||
|
||||
regions="$(wget -q -O - https://dialectsarchive.com/dialects-accents)"
|
||||
url="https://www.dialectsarchive.com"
|
||||
|
||||
regions="$(echo "$regions" | sed -n -e '/Select a continent/,/clear/{ /Select a continent/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
|
||||
continents="africa
|
||||
asia
|
||||
australia-oceania
|
||||
caribbean
|
||||
central-america
|
||||
europe
|
||||
middle-east
|
||||
north-america
|
||||
south-america"
|
||||
|
||||
for region in $regions
|
||||
for continent in $continents
|
||||
do
|
||||
countries="$(wget -q -O - $region)"
|
||||
countries_html="$(wget -q -O - $url/$continent)"
|
||||
|
||||
countries="$(echo "$countries" | sed -n -e '/Please select a/,/clear/{ /Please select a/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
|
||||
country_urls="$( \
|
||||
echo "$countries_html" | \
|
||||
sed -n -e '/Please select a/,/clear/{p;}' | \
|
||||
grep -io --color=never "$url/[^\"]*" \
|
||||
)"
|
||||
|
||||
for country in $countries
|
||||
for country_url in $country_urls
|
||||
do
|
||||
dialects="$(wget -q -O - $country)"
|
||||
dialect_html="$(wget -q -O - $country_url)"
|
||||
|
||||
dialects="$(echo "$dialects" | sed -n -e '/Please select a/,/clear/{ /Please select a/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
|
||||
country="$(basename $country_url)"
|
||||
|
||||
dir="$(basename $country)"
|
||||
dialect_lines="$( \
|
||||
echo "$dialect_html" | \
|
||||
grep -iP --color=never "$url/$country-\d+\">[^>]*>.*?(fe)?male" \
|
||||
)"
|
||||
|
||||
mkdir -p "$dir"
|
||||
directory="$continent/$country"
|
||||
|
||||
for dialect in $dialects
|
||||
mkdir -p "$directory"
|
||||
|
||||
IFS=$'\n'
|
||||
for dialect_line in $dialect_lines
|
||||
do
|
||||
dialect="$(wget -q -O - $dialect)"
|
||||
dialect_url="$( \
|
||||
echo "$dialect_line" | \
|
||||
grep -io --color=never 'https[^"]*' \
|
||||
)"
|
||||
|
||||
dialect="$(echo $dialect | grep -io 'https[^"]*\.mp3' | head -1)"
|
||||
dialect_info="$( \
|
||||
echo "$dialect_line" | \
|
||||
grep -Po --color=never "(?<=</a>)[^<]+(?=<)" | \
|
||||
sed 's/\W*$//' | \
|
||||
sed 's/^\W*//' | \
|
||||
sed 's/\//_/g' \
|
||||
)"
|
||||
|
||||
file="$(basename $dialect)"
|
||||
dialect_html="$(wget -q -O - $dialect_url)"
|
||||
|
||||
echo "Downloading $file to $dir"
|
||||
wget -q -O "$dir/$file" $dialect
|
||||
dialect_download="$( \
|
||||
echo $dialect_html | \
|
||||
grep -io 'https[^"]*\.mp3' | \
|
||||
head -1 \
|
||||
)"
|
||||
|
||||
filename="$(basename $dialect_url): $dialect_info.mp3"
|
||||
|
||||
echo "Downloading $filename to $directory"
|
||||
# Download the MP3 itself. $dialect_url is the dialect's HTML page; the audio
# link scraped from that page is held in $dialect_download. Skip entries where
# no .mp3 link was found so that wget -O does not leave an empty file behind.
if [ -n "$dialect_download" ]; then
  wget -q -O "$directory/$filename" "$dialect_download"
else
  echo "No MP3 link found on $dialect_url; skipping" >&2
fi
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user