dialectdownloader.sh: walk a fixed continent list, save under continent/country

Replaces the scrape of the dialects-accents index page with a hard-coded list
of continent slugs appended to $url. For each continent page the country links
are taken from the "Please select a" .. "clear" section; for each country page
the lines linking to "$url/$country-<number>" that mention male/female are
collected, and every such dialect page is fetched to find its first .mp3 link.
Files are written to "$continent/$country/" and named
"<dialect page basename>: <dialect info>.mp3".

The final wget fetches "$dialect_download" (the .mp3 link extracted from the
dialect page). Fetching $dialect_url there would store the HTML page itself
under a .mp3 name and leave dialect_download unused.

URL expansions on added lines that are passed to wget/basename are quoted so
they are not subject to word splitting or globbing.

NOTE(review): the lookbehind in the dialect_info grep is empty, "(?<=)".
Presumably it originally named an HTML tag -- confirm against the page markup.
NOTE(review): leading whitespace in this patch was reconstructed (tabs
assumed); apply with `git apply --ignore-whitespace` or `patch -l` if the
context lines do not match.

diff --git a/dialectdownloader.sh b/dialectdownloader.sh
index 722c712..d2e0b7c 100755
--- a/dialectdownloader.sh
+++ b/dialectdownloader.sh
@@ -1,35 +1,70 @@
 #! /bin/bash
 
-regions="$(wget -q -O - https://dialectsarchive.com/dialects-accents)"
+url="https://www.dialectsarchive.com"
 
-regions="$(echo "$regions" | sed -n -e '/Select a continent/,/clear/{ /Select a continent/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
+continents="africa
+asia
+australia-oceania
+caribbean
+central-america
+europe
+middle-east
+north-america
+south-america"
 
-for region in $regions
+for continent in $continents
 do
-	countries="$(wget -q -O - $region)"
+	countries_html="$(wget -q -O - "$url/$continent")"
 
-	countries="$(echo "$countries" | sed -n -e '/Please select a/,/clear/{ /Please select a/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
+	country_urls="$( \
+		echo "$countries_html" | \
+		sed -n -e '/Please select a/,/clear/{p;}' | \
+		grep -io --color=never "$url/[^\"]*" \
+	)"
 
-	for country in $countries
+	for country_url in $country_urls
 	do
-		dialects="$(wget -q -O - $country)"
+		dialect_html="$(wget -q -O - "$country_url")"
 
-		dialects="$(echo "$dialects" | sed -n -e '/Please select a/,/clear/{ /Please select a/d; /clear/d; p; }' | grep -io --color=never 'https[^"]*')"
+		country="$(basename "$country_url")"
 
-		dir="$(basename $country)"
+		dialect_lines="$( \
+			echo "$dialect_html" | \
+			grep -iP --color=never "$url/$country-\d+\">[^>]*>.*?(fe)?male" \
+		)"
 
-		mkdir -p "$dir"
+		directory="$continent/$country"
 
-		for dialect in $dialects
+		mkdir -p "$directory"
+
+		IFS=$'\n'
+		for dialect_line in $dialect_lines
 		do
-			dialect="$(wget -q -O - $dialect)"
+			dialect_url="$( \
+				echo "$dialect_line" | \
+				grep -io --color=never 'https[^"]*' \
+			)"
 
-			dialect="$(echo $dialect | grep -io 'https[^"]*\.mp3' | head -1)"
+			dialect_info="$( \
+				echo "$dialect_line" | \
+				grep -Po --color=never "(?<=)[^<]+(?=<)" | \
+				sed 's/\W*$//' | \
+				sed 's/^\W*//' | \
+				sed 's/\//_/g' \
+			)"
 
-			file="$(basename $dialect)"
+			dialect_html="$(wget -q -O - "$dialect_url")"
 
-			echo "Downloading $file to $dir"
-			wget -q -O "$dir/$file" $dialect
+			dialect_download="$( \
+				echo $dialect_html | \
+				grep -io 'https[^"]*\.mp3' | \
+				head -1 \
+			)"
+
+			filename="$(basename "$dialect_url"): $dialect_info.mp3"
+
+			echo "Downloading $filename to $directory"
+			wget -q -O "$directory/$filename" "$dialect_download"
 		done
 	done
 done