Files
scripts/dialectdownloader.sh

74 lines
1.7 KiB
Bash
Raw Normal View History

#!/bin/bash
#
# Download every dialect recording (MP3) linked from dialectsarchive.com.
#
# Usage: dialectdownloader.sh DESTINATION
#
# The site is crawled continent -> country -> dialect page. Each recording is
# saved as:
#   DESTINATION/<continent>/<country>/<dialect-id>: <speaker info>.mp3
#
# Requires wget, GNU grep (for -P) and GNU sed (for \W).
# Exit status: 0 if everything was downloaded, 1 if any page or recording
# failed, 2 on a usage error.

set -u

readonly url="https://www.dialectsarchive.com"

readonly continents=(
  africa
  asia
  australia-oceania
  caribbean
  central-america
  europe
  middle-east
  north-america
  south-america
)

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Write the body of the page at $1 to stdout; non-zero status on failure.
fetch() {
  wget -q -O - -- "$1"
}

#######################################
# List the country page URLs linked from a continent page, one per line.
# Only links between the "Please select a" marker and the following "clear"
# marker are considered. Duplicates are dropped while keeping first-seen
# order (plain `uniq` would only drop adjacent duplicates).
# Arguments: $1 - continent slug, e.g. "europe"
# Outputs:   country URLs on stdout
#######################################
list_country_urls() {
  local continent=$1
  local continent_html

  if ! continent_html="$(fetch "$url/$continent")"; then
    warn "Could not fetch $url/$continent"
    return 1
  fi

  sed -n -e '/Please select a/,/clear/p' <<<"$continent_html" \
    | grep -io --color=never -- "$url/[^\"]*" \
    | awk '!seen[$0]++'
}

#######################################
# Download the recording described by one HTML line of a country page.
# Arguments: $1 - target directory (must exist)
#            $2 - country slug, e.g. "england"
#            $3 - HTML line containing the dialect link and speaker info
# Outputs:   progress message on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if the recording could not be downloaded
#######################################
download_dialect() {
  local directory=$1
  local country=$2
  local dialect_line=$3
  local dialect_url dialect_info dialect_html dialect_download filename

  # Extract exactly the link the line was selected for, so that any other
  # https link on the same line cannot end up in the URL.
  dialect_url="$(
    grep -ioP --color=never -- "$url/$country-\d+(?=\">)" <<<"$dialect_line" \
      | head -n 1
  )"
  if [[ -z "$dialect_url" ]]; then
    warn "No dialect link found in: $dialect_line"
    return 1
  fi

  # Speaker description: the text following </a>, trimmed of surrounding
  # non-word characters, with "/" replaced so it is safe in a file name.
  dialect_info="$(
    grep -Po --color=never '(?<=</a>)[^<]+(?=<)' <<<"$dialect_line" \
      | sed -e 's/\W*$//' -e 's/^\W*//' -e 's/\//_/g'
  )"

  if ! dialect_html="$(fetch "$dialect_url")"; then
    warn "Could not fetch $dialect_url"
    return 1
  fi

  # The first MP3 link on the dialect page is the recording.
  dialect_download="$(
    grep -io --color=never 'https[^"]*\.mp3' <<<"$dialect_html" \
      | head -n 1
  )"
  if [[ -z "$dialect_download" ]]; then
    warn "No MP3 link found on $dialect_url"
    return 1
  fi

  filename="$(basename -- "$dialect_url"): $dialect_info.mp3"

  echo "Downloading $filename to $directory"
  if ! wget -q -O "$directory/$filename" -- "$dialect_download"; then
    warn "Failed to download $dialect_download"
    # wget -O creates the file before transferring; do not leave a stub.
    rm -f -- "$directory/$filename"
    return 1
  fi
}

#######################################
# Download all recordings listed on one country page.
# Arguments: $1 - destination root
#            $2 - continent slug
#            $3 - country page URL
# Returns:   0 if every recording was downloaded, 1 otherwise
#######################################
download_country() {
  local destination=$1
  local continent=$2
  local country_url=$3
  local country directory country_html dialect_line
  local status=0

  country="$(basename -- "$country_url")"
  directory="$destination/$continent/$country"

  if ! country_html="$(fetch "$country_url")"; then
    warn "Could not fetch $country_url"
    return 1
  fi

  mkdir -p -- "$directory" || return 1

  # Read line by line instead of changing the global IFS: this keeps word
  # splitting elsewhere intact and avoids glob expansion of HTML text.
  while IFS= read -r dialect_line; do
    [[ -n "$dialect_line" ]] || continue
    download_dialect "$directory" "$country" "$dialect_line" || status=1
  done < <(
    grep -iP --color=never -- \
      "$url/$country-\d+\">[^>]*>.*?(fe)?male" <<<"$country_html"
  )

  return "$status"
}

main() {
  local destination=${1:-}
  local continent country_url
  local status=0

  # An empty destination would make the script write to /<continent>/...
  if [[ -z "$destination" ]]; then
    warn "Usage: ${0##*/} DESTINATION"
    return 2
  fi

  for continent in "${continents[@]}"; do
    while IFS= read -r country_url; do
      [[ -n "$country_url" ]] || continue
      download_country "$destination" "$continent" "$country_url" || status=1
    done < <(list_country_urls "$continent")
  done

  return "$status"
}

main "$@"