#!/bin/bash
#
# Mirror the MP3 samples linked from https://www.dialectsarchive.com.
#
# Usage: <this-script> DESTINATION
#
# The site is walked continent -> country -> dialect page and every sample is
# saved as
#   DESTINATION/<continent>/<country>/<dialect-page>: <description>.mp3
#
# Requires wget, GNU grep (for -P) and GNU sed (for \W).
#
# `set -e` is deliberately NOT enabled: this is a best-effort crawl, and a
# single page that fails to download or parse must not abort the whole run.
# Failures are reported on stderr and the crawl moves on.

set -u

readonly url="https://www.dialectsarchive.com"

# Continent index pages, appended to $url.
readonly -a continents=(
  africa
  asia
  australia-oceania
  caribbean
  central-america
  europe
  middle-east
  north-america
  south-america
)

#######################################
# Print a warning on stderr.
# Arguments: message words
#######################################
warn() {
  printf 'warning: %s\n' "$*" >&2
}

#######################################
# Fetch a page and write its body to stdout.
# Arguments: $1 - URL to fetch
# Returns:   wget's exit status
#######################################
fetch_page() {
  wget -q -O - "$1"
}

#######################################
# Extract the country page URLs from a continent page.
# Only the part of the page between the "Please select a" line and the next
# line containing "clear" is considered. Duplicates are removed while keeping
# first-seen order (plain `uniq` would only drop adjacent repeats).
# Globals:   url (read)
# Inputs:    continent page HTML on stdin
# Outputs:   one URL per line on stdout
#######################################
extract_country_urls() {
  sed -n -e '/Please select a/,/clear/p' \
    | grep -io --color=never "$url/[^\"]*" \
    | awk '!seen[$0]++'
}

#######################################
# Select the lines of a country page that describe one dialect sample, i.e.
# that link to "<url>/<country>-<number>" and mention "male" or "female".
# Globals:   url (read)
# Arguments: $1 - country slug (last path component of the country URL)
# Inputs:    country page HTML on stdin
# Outputs:   matching HTML lines on stdout
#######################################
extract_dialect_lines() {
  local country=$1
  grep -iP --color=never "$url/$country-\d+\">[^>]*>.*?(fe)?male"
}

#######################################
# Extract the dialect page URL from one dialect line.
# Matching the "<url>/<country>-<number>" shape (rather than any https link)
# and keeping only the first hit guarantees a single URL even when the line
# carries other links.
# Globals:   url (read)
# Arguments: $1 - country slug
# Inputs:    one dialect line on stdin
# Outputs:   the URL on stdout (empty if none was found)
#######################################
extract_dialect_url() {
  local country=$1
  grep -ioE --color=never "$url/$country-[0-9]+" | head -n 1
}

#######################################
# Extract the free-text description that follows the link in a dialect line
# and make it usable inside a file name: leading/trailing non-word characters
# are stripped and "/" becomes "_". Only the first text fragment is kept so
# the result is always a single line.
# Inputs:    one dialect line on stdin
# Outputs:   cleaned description on stdout
#######################################
extract_dialect_info() {
  grep -Po --color=never "(?<=</a>)[^<]+(?=<)" \
    | head -n 1 \
    | sed -e 's/\W*$//' -e 's/^\W*//' -e 's/\//_/g'
}

#######################################
# Find the first MP3 link in a dialect page.
# Inputs:    dialect page HTML on stdin
# Outputs:   the MP3 URL on stdout (empty if none was found)
#######################################
extract_mp3_url() {
  grep -io 'https[^"]*\.mp3' | head -n 1
}

#######################################
# Download the sample described by one dialect line.
# Arguments: $1 - dialect line (HTML)
#            $2 - country slug
#            $3 - directory to store the MP3 in (must exist)
# Outputs:   progress message on stdout, warnings on stderr
# Returns:   0 on success, 1 if anything needed for the download is missing
#######################################
download_dialect() {
  local dialect_line=$1 country=$2 directory=$3
  local dialect_url dialect_info dialect_html mp3_url filename target

  dialect_url=$(extract_dialect_url "$country" <<<"$dialect_line")
  if [[ -z "$dialect_url" ]]; then
    warn "no dialect link found in: $dialect_line"
    return 1
  fi
  dialect_info=$(extract_dialect_info <<<"$dialect_line")

  if ! dialect_html=$(fetch_page "$dialect_url"); then
    warn "could not fetch $dialect_url"
    return 1
  fi

  mp3_url=$(extract_mp3_url <<<"$dialect_html")
  if [[ -z "$mp3_url" ]]; then
    # Without this guard wget would be invoked with no URL at all.
    warn "no MP3 link found on $dialect_url"
    return 1
  fi

  filename="$(basename -- "$dialect_url"): $dialect_info.mp3"
  target="$directory/$filename"
  printf '%s\n' "Downloading $filename to $directory"
  if ! wget -q -O "$target" "$mp3_url"; then
    warn "download failed: $mp3_url"
    # wget -O creates the output file up front; do not leave a stub behind.
    rm -f -- "$target"
    return 1
  fi
}

#######################################
# Download every sample listed on one country page.
# Arguments: $1 - country page URL
#            $2 - directory of the enclosing continent
#######################################
crawl_country() {
  local country_url=$1 continent_dir=$2
  local country country_html directory dialect_line

  # basename (not ${var##*/}) so a trailing "/" on the URL is tolerated.
  country=$(basename -- "$country_url")
  if ! country_html=$(fetch_page "$country_url"); then
    warn "could not fetch $country_url"
    return 1
  fi

  directory="$continent_dir/$country"
  if ! mkdir -p -- "$directory"; then
    warn "could not create $directory"
    return 1
  fi

  # Read on fd 3 so commands in the loop body can never eat the line list.
  while IFS= read -r -u 3 dialect_line; do
    download_dialect "$dialect_line" "$country" "$directory"
  done 3< <(extract_dialect_lines "$country" <<<"$country_html")
}

#######################################
# Download every sample of every country listed on one continent page.
# Globals:   url (read)
# Arguments: $1 - continent slug
#            $2 - destination root directory
#######################################
crawl_continent() {
  local continent=$1 destination=$2
  local countries_html country_url

  if ! countries_html=$(fetch_page "$url/$continent"); then
    warn "could not fetch $url/$continent"
    return 1
  fi

  while IFS= read -r -u 3 country_url; do
    crawl_country "$country_url" "$destination/$continent"
  done 3< <(extract_country_urls <<<"$countries_html")
}

#######################################
# Entry point.
# Globals:   continents (read)
# Arguments: $1 - destination root directory (required, non-empty)
# Returns:   2 on a usage error, otherwise the status of the last continent
#######################################
main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    # An empty destination would make the script write under "/".
    printf 'Usage: %s DESTINATION\n' "${0##*/}" >&2
    return 2
  fi

  local destination=$1
  local continent
  for continent in "${continents[@]}"; do
    crawl_continent "$continent" "$destination"
  done
}

main "$@"