Batch download code snippets
Batch download snippets from http://codesnippets.joyent.com and convert them to text files using textutil (see "man textutil"; available on Mac OS X 10.4 or later).
Note: Old snippet versions will be automatically replaced by the downloaded snippets without a backup!
Author: jv
License: The MIT License, Copyright (c) 2008 jv
Usage:
Note: Old snippet versions will be automatically replaced by the downloaded snippets without a backup!
Author: jv
License: The MIT License, Copyright (c) 2008 jv
Usage:
# usage: bds [-p num] [-t tag] [-u user] tag
bds vim
bds -p 1280
bds -u jvs
bds -t plistbuddy
bds -t tar
bds -t ipfw -u jvs
#!/opt/local/bin/bash
#
# bds - "batch download snippets" from http://codesnippets.joyent.com and
# convert them to text files using man textutil (available on Mac OS X 10.4 or later).
#
# Note: Old snippet versions will be automatically replaced by the downloaded
#       snippets without a backup!
# An alternative to man textutil is html2text, http://www.mbayer.de/html2text/
# (which is available via MacPorts).
#
# Author: jv
# License: The MIT License, http://www.opensource.org/licenses/mit-license.php
# Copyright (c) 2008 jv
#
# cat /usr/local/bin/bds
#
# usage: bds [-p num] [-t tag] [-u user] tag
#
#   bds tag            download all posts tagged "tag"
#   bds -t tag         same as above
#   bds -u user        download all posts of "user"
#   bds -t tag -u user download all posts of "user" tagged "tag"
#   bds -p num         download the single post number "num"

declare BaseURL='http://codesnippets.joyent.com'
declare download_dir="${HOME}/Desktop/Snippets"

# make sure there is no trailing slash
BaseURL="${BaseURL%/}"
download_dir="${download_dir%/}"

declare BasePostURL="${BaseURL}/posts/show"
declare BaseTagURL="${BaseURL}/tag"
declare BaseUserURL="${BaseURL}/user"

# make sure there is no trailing slash
BasePostURL="${BasePostURL%/}"
BaseTagURL="${BaseTagURL%/}"
BaseUserURL="${BaseUserURL%/}"

# man textutil
declare InputEncoding='utf-8'
declare OutputEncoding='utf-8'

# Path of the HTML page currently being downloaded/converted. It is global so
# that the EXIT trap can remove a half-processed page on any exit path.
declare page_file=''

export IFS=$' \t\n'

#######################################
# Print an error message followed by the usage line to stderr and exit 1.
# Arguments: $1 - error message
#######################################
usage() {
  printf "%s\n%s\n" "${1}" "Usage: ${0##*/} [-p num] [-t tag] [-u user] tag" 1>&2
  exit 1
}

#######################################
# Remove the temporary HTML page, if any. Installed as the EXIT trap.
# It deliberately does NOT call exit, so the script's exit status is preserved.
# Globals: page_file (read)
#######################################
remove_page() {
  if [[ -n "${page_file}" ]]; then
    /bin/rm -f "${page_file}"
  fi
}

#######################################
# Print the sanitized <title> of a downloaded post page: "[xxx]" tag elements
# and the "CodeSnippets:" prefix are removed, "/" becomes ":", spaces become
# "_", control characters are deleted and trailing underscores are stripped.
# Arguments: $1 - path of the downloaded HTML page
# Outputs:   the sanitized title (possibly empty) on stdout, no trailing newline
#######################################
page_title() {
  local title
  # alternative title extraction with sed only:
  #title="$(/usr/bin/sed -E -n -e '/<[tT][iI][tT][lL][eE]>/{s/^.*<[tT][iI][tT][lL][eE]>(.*)<\/[tT][iI][tT][lL][eE]>.*$/\1/p;q;}' "${1}" | \
  #         /usr/bin/sed -E -e 's/\[[^][:space:]]*\]//g')"
  title="$(/usr/bin/egrep -m 1 -io '<title>.*</title>' "${1}" \
    | /usr/bin/sed -E -e 's/^<title>[[:space:]]*|[[:space:]]*<\/title>$//g' \
      -e 's/\[[^][:space:]]*\]//g')" # second -e deletes [xxx] tag elements of title
  title="${title//CodeSnippets:/}"
  title="${title//\//:}"
  title="${title// /_}"
  title="${title//[[:cntrl:]]/}"
  title="${title%"${title##*[!_]}"}" # remove trailing underscores
  printf '%s' "${title}"
}

#######################################
# Tell whether a sanitized title belongs to a real post. An empty title or the
# site's generic title means the post page could not be accessed.
# Arguments: $1 - sanitized title as printed by page_title
# Returns:   0 if the title is usable, 1 otherwise
#######################################
title_is_usable() {
  [[ -n "${1}" ]] \
    && [[ "${1}" != '_CodeDrive_Snippets_courtesy_of_Peter_Coopers_handy_little_app' ]]
}

#######################################
# Convert a downloaded HTML page to a text file with man textutil, delete the
# HTML page and tidy up the text file in-place with man ed.
# Globals:   InputEncoding, OutputEncoding (read)
# Arguments: $1 - path of the downloaded HTML page (removed by this function)
#            $2 - path of the text file to write (silently replaced if present)
#            $3 - URL of the post, inserted into the text file as "URL: ..."
# Returns:   0 on success, 1 if textutil failed, otherwise the status of ed
#######################################
convert_page() {
  local html="${1}" txt="${2}" url="${3}"
  # backslash + newline: how a line break is written in an ed replacement
  local NL=$'\\\n'

  if ! /usr/bin/textutil -output "${txt}" -convert txt \
    -inputencoding "${InputEncoding}" -encoding "${OutputEncoding}" "${html}"; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "couldn't convert" "${url}"
    /bin/rm -f "${html}"
    return 1
  fi
  /bin/rm "${html}"

  # escape backslashes
  # man bash 2>/dev/null | less -p 'Each command in a pipeline'
  #txt="$(printf "%q" "${txt}")"   # cf. help printf
  txt="${txt//\\/\\\\}"

  # edit $txt in-place with man ed
  # first delete lines at the beginning & end,
  # then remove the string 'See related posts' and add some newlines with $NL,
  # then convert the line 'to...by...on' to line 'Author:...', line 'Date:...',
  # line 'URL:...' and line 'Tags:...'
  # and finally the last two ed commands insert two further newlines with $NL
  #
  # additional ed commands
  # delete line numbers
  # ,g|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}|s|^[[:space:]]*[[:digit:]]\{1,\}[[:space:]]\{1,3\}\(.*\)$|\1|
  # delete range of lines
  # 4,11d
  /bin/ed -s "${txt}" <<EOF
H
,g/Snippets is a public source code repository/1,/Snippets is a public source code repository/d
,g/You need to create an account or log in to post comments to this site//You need to create an account or log in to post comments to this site/,\$d
,g|(See related posts)$|s|.See related posts.|${NL}${NL}|
,g|^to.* by.* on .*[[:digit:]]$|s|^to\(.*\) by\(.*\) on \(.*[[:digit:]]\)$|${NL}${NL}Author:\2${NL}Date: \3${NL}URL: ${url}${NL}Tags:\1${NL}|
,g|^Comments on this post$|s|\(Comments on this post\)|${NL}\1:|
,g| posts on .* at |s|\(.* posts on .* at .*\)|${NL}\1:|
w
EOF
}

#######################################
# Download a single post specified by a post number: bds -p num
# cf. snippet, http://codesnippets.joyent.com/posts/show/1282
# The text file is saved as <download_dir>/single-downloads/<num>_<title>.txt
# Globals:   BasePostURL, download_dir (read), page_file (written)
# Arguments: $1 - post number (digits only)
# Returns:   0 on success, 1 on argument, download or conversion errors
#######################################
snippet() {
  local postnum title url outputfile dest_dir
  local max_time=25

  if [[ -z "${1}" ]] || [[ "${1//[[:digit:]]/}" != "" ]]; then
    printf '%s\n' "Argument error. No positive integer: ${1}" 1>&2
    return 1
  fi

  postnum="${1}"
  url="${BasePostURL}/${postnum}"
  dest_dir="${download_dir}/single-downloads"
  /bin/mkdir -p "${dest_dir}" || return 1

  # Register the page for cleanup BEFORE downloading, so a partial download
  # is removed too. Writing to an absolute path makes a 'cd' unnecessary.
  page_file="${dest_dir}/${postnum}"
  if ! /usr/bin/curl -L -s --max-time "${max_time}" -o "${page_file}" "${url}"; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "curl_max_time ${max_time}" "${url}"
    /bin/rm -f "${page_file}"
    return 1
  fi

  title="$(page_title "${page_file}")"
  if ! title_is_usable "${title}"; then
    printf "\e[0K\e[31m%s\e[0m: %s\n" "couldn't access" "${url}"
    /bin/rm -f "${page_file}"
    return 1
  fi

  outputfile="${dest_dir}/${postnum}_${title}.txt"
  #outputfile="${dest_dir}/${title}.txt"     # without post number prefix
  #outputfile="${outputfile//__/_}"          # uniq underscores

  printf "\n\e[0K\e[1;30m%s\e[0m: %s\n\n" "saved as" "${outputfile}"
  convert_page "${page_file}" "${outputfile}" "${url}"
}
#----------------------------------------- end of function snippet

#######################################
# Download every post linked from the numbered listing pages <listing>/1,
# <listing>/2, ... until a page contains '>No posts<'.
# Listing pages have the form:
#   http://somewebsite.com/tag/bash/1,
#   http://somewebsite.com/user/name/1 or
#   http://somewebsite.com/user/name/tag/bash/1
# Globals:   BaseURL, download_dir (read), page_file (written)
# Arguments: $1 - listing URL without page number
#            $2 - name of the sub-directory of download_dir to save into
# Returns:   0 when all listing pages were processed; exits 1 if the download
#            directory cannot be created or a listing page cannot be fetched
#######################################
download_listing() {
  local tagsite="${1%/}" dir_name="${2}"
  local count=1 cnt=0 curl_max_time=20
  local dest_dir website url postnum title outputfile
  local -a urls

  dest_dir="${download_dir}/${dir_name//\//:}"
  dest_dir="${dest_dir%/}"
  /bin/mkdir -p "${dest_dir}" || exit 1

  # print download directory
  printf "\n\e[0K\e[1;30m%s\e[0m: %s\n\n" "download directory" "${dest_dir}"

  while :; do
    if ! website="$(/usr/bin/curl -L -s --max-time "${curl_max_time}" "${tagsite}/${count}")"; then
      printf "\e[0K\e[31m%s\e[0m: %s\n" "curl_max_time ${curl_max_time}" "${tagsite}/${count}"
      exit 1
    fi

    # the page after the last listing page contains '>No posts<'
    # (with Bash 3.0 or later one could also use: [[ "${website}" =~ '>No posts<' ]])
    if [[ "${website}" == *'>No posts<'* ]]; then
      break
    fi

    # extract relevant post URLs
    urls=()
    while IFS= read -r url; do
      urls[${#urls[@]}]="${url}"
    done < <(printf "%s\n" "${website}" \
      | /usr/bin/egrep -o 'href="/posts/show/[[:digit:]]+"' \
      | /usr/bin/sed -E -n -e "s|href=\"(/posts/show/[[:digit:]]+)\"|${BaseURL}\1|p")

    # A page with neither post links nor the '>No posts<' marker (e.g. an error
    # page) would otherwise make this loop request page after page forever.
    if [[ ${#urls[@]} -eq 0 ]]; then
      printf "\e[0K\e[31m%s\e[0m: %s\n" "no post links found" "${tagsite}/${count}"
      break
    fi

    for url in "${urls[@]}"; do
      postnum="${url##*/}"
      page_file="${dest_dir}/${postnum}"

      if ! /usr/bin/curl -L -s --max-time "${curl_max_time}" -o "${page_file}" "${url}"; then
        printf "\e[0K\e[31m%s\e[0m: %s\n" "curl_max_time ${curl_max_time}" "${url}"
        /bin/rm -f "${page_file}" # do not leave a partial download behind
        continue
      fi

      title="$(page_title "${page_file}")"
      if ! title_is_usable "${title}"; then
        printf "\e[0K\e[31m%s\e[0m: %s\n" "couldn't access" "${url}"
        /bin/rm -f "${page_file}"
        continue
      fi

      outputfile="${dest_dir}/${postnum}_${title}.txt"
      #outputfile="${dest_dir}/${title}.txt"     # without post number prefix
      #outputfile="${outputfile//__/_}"          # uniq underscores

      cnt=$((cnt + 1))
      printf "\e[0K\e[1;32m%-6s\e[0m %s\n" "${cnt}" "${outputfile##*/}"
      convert_page "${page_file}" "${outputfile}" "${url}"
    done # for

    count=$((count + 1))
  done # while

  return 0
}

#######################################
# Parse the command line and start the requested download.
# Arguments: the script's command line arguments
#######################################
main() {
  local pflag='' tflag='' uflag='' option
  local dir_name tagsite

  if [[ $# -eq 0 ]]; then
    usage 'No arguments given!'
  fi

  while getopts ":p:t:u:" option; do
    case "${option}" in
      p) pflag="${OPTARG}" ;;
      t) tflag="${OPTARG}" ;;
      u) uflag="${OPTARG}" ;;
      # '?' = unknown option, ':' = option without its required argument
      [:?]) usage 'Argument error!' ;;
      *) ;;
    esac
  done
  shift $((OPTIND - 1))

  # Clean up the temporary page on every exit path without touching the exit
  # status; signals exit with the conventional 128+signal status, which in
  # turn runs the EXIT trap.
  trap remove_page EXIT
  trap 'exit 129' HUP
  trap 'exit 130' INT
  trap 'exit 141' PIPE
  trap 'exit 143' TERM

  if [[ $# -eq 1 ]]; then
    dir_name="${1}"
    tagsite="${BaseTagURL}/${1}"
  elif [[ $# -gt 1 ]]; then
    usage 'Too many arguments!'
  elif [[ -n "${pflag}" ]]; then
    snippet "${pflag}"
    exit $? # propagate the result of the single download
  elif [[ -n "${tflag}" ]] && [[ -n "${uflag}" ]]; then
    dir_name="${tflag}-${uflag}"
    tagsite="${BaseUserURL}/${uflag}/tag/${tflag}"
  elif [[ -n "${tflag}" ]]; then
    dir_name="${tflag}"
    tagsite="${BaseTagURL}/${tflag}"
  elif [[ -n "${uflag}" ]]; then
    dir_name="${uflag}"
    tagsite="${BaseUserURL}/${uflag}"
  else
    usage 'Argument error!'
  fi

  download_listing "${tagsite}" "${dir_name}"
  exit 0
}

main "$@"