Re: Re: Re: SiteLinkMap

From NAT, 6 Years ago, written in Bash, viewed 568 times. This paste is a reply to Re: Re: SiteLinkMap from NAT - view diff
URL https://code.nat.moe/view/1b150eaa Embed
Download Paste or View Raw
  #!/bin/bash
  #
  # SiteLinkMap: follow outbound links starting from the site given as $1 and
  # mirror the link structure as nested directories in a scratch directory.
  #
  # Environment:
  #   MAX_DEPTH  optional positive integer limiting the crawl depth. The
  #              internal limit is one higher than the requested value;
  #              when unset the internal limit is 51.

  # Current recursion depth; showSiteLinkOut increments it for every level.
  TABS=0

  # Alexa rank threshold used by checkAlexa: only sites whose rank number is
  # greater than this (or unknown) are followed. This keeps the spider from
  # getting trapped in very popular sites such as Twitter, G+, GitHub, etc.
  MIN_ALEXA='99999'

  # Scratch directory holding the rank cache, the DONE list and the site tree.
  temp=$(mktemp -d) || { echo "mktemp failed" >&2; exit 1; }
  # Remove the scratch directory on every exit path, including interrupts.
  trap 'rm -rf -- "$temp"' EXIT
  cd "$temp" || { echo "cannot cd to $temp" >&2; exit 1; }
  touch alexa.cache

  # Extended regex; links matching it are never followed.
  GLOBAL_IGNORE='djangobook|wordpress|twitter|facebook|baidu|google'

  if [[ -z ${MAX_DEPTH:-} ]]; then
    MAX_DEPTH=51
  elif [[ $MAX_DEPTH =~ ^[0-9]+$ ]] && (( 10#$MAX_DEPTH > 0 )); then
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    MAX_DEPTH=$(( 10#$MAX_DEPTH + 1 ))
  else
    echo "MAX_DEPTH must be a positive integer" >&2
    exit 1
  fi

  #######################################
  # Decide whether a site is obscure enough to be crawled.
  # The rank is looked up in "$temp/alexa.cache" (lines of "site:rank"); on a
  # cache miss it is scraped from the Alexa siteinfo page and appended to the
  # cache, even when no rank could be extracted.
  # Globals:   temp (read), MIN_ALEXA (read)
  # Arguments: $1 - site (host name) to check
  # Outputs:   nothing on stdout
  # Returns:   0 if the rank is unknown (empty) or greater than MIN_ALEXA,
  #            1 otherwise (including a rank that is not a plain number)
  #######################################
  function checkAlexa {
    local site=$1
    local cache="$temp/alexa.cache"
    local rank

    # Exact, literal match on the "site:" prefix: the site name is not used as
    # a regex, and "a.com" no longer matches the cache line of "ba.com".
    if ! rank=$(site_key="$site:" awk '
          BEGIN { key = ENVIRON["site_key"] }
          index($0, key) == 1 { print substr($0, length(key) + 1); found = 1; exit }
          END { exit !found }' "$cache" 2> /dev/null); then
      rank=$(curl -s "http://www.alexa.com/siteinfo/$site" \
        | tr -d " \n," \
        | awk -F'awis-->' '{print $2}' \
        | awk -F'<' '{print $1}')
      printf '%s:%s\n' "$site" "$rank" >> "$cache"
    fi

    # Unknown rank: treat the site as not popular.
    [[ -z $rank ]] && return 0
    # Anything that is not a plain number cannot be compared; skip the site.
    [[ $rank =~ ^[0-9]+$ ]] || return 1
    if (( 10#$rank > MIN_ALEXA )); then
      return 0
    fi
    return 1
  }

  #######################################
  # List the distinct hosts a page links to via absolute http(s) hrefs.
  # Globals:   TABS (read), MAX_DEPTH (read)
  # Arguments: $1 - URL or host name to fetch (handed to curl as is)
  # Outputs:   one host per line on stdout, sorted and de-duplicated; nothing
  #            when the depth limit has been reached
  #######################################
  function getLinksByURL {
    # At the depth limit no further page is fetched.
    [[ "$TABS" == "$MAX_DEPTH" ]] && return

    # grep -o emits every href on a line, not just the first one; the host
    # ends at the first '/', quote, space, '>', '?' or '#'.
    curl --max-time 10 -sL "$1" \
      | grep -oiE "href=[\"']?https?://[^/\"' >?#]+" \
      | sed -E 's|^[^:]*://||' \
      | sort -u
  }

  #######################################
  # Print an indent of two spaces per level, without a trailing newline.
  # Arguments: $1 - number of levels (missing or non-numeric prints nothing)
  # Outputs:   2 * $1 spaces on stdout
  #######################################
  function showTabs {
    local depth=${1:-0}
    [[ $depth =~ ^[0-9]+$ ]] || return 0
    # A padded empty string avoids "seq", which counts down for "seq 1 0" on
    # some platforms, and keeps the loop counter out of global scope.
    printf '%*s' "$(( 10#$depth * 2 ))" ''
  }

  #######################################
  # Recursively follow the outbound links of a site, creating one directory
  # per visited site (nested under the site that linked to it) and printing a
  # trace line for every link that is followed.
  # The working directory and TABS are restored before returning, so callers
  # need no cleanup of their own.
  # Globals:   TABS (incremented while inside, restored on return),
  #            MAX_DEPTH, GLOBAL_IGNORE, temp (read);
  #            "$temp/DONE" (visited sites, '|'-separated, appended to)
  # Arguments: $1 - site (host name) to crawl
  # Outputs:   "[depth - path] >>> link" lines on stdout
  # Returns:   0 normally, 1 if the site's directory cannot be entered
  #######################################
  function showSiteLinkOut {
    local site=$1
    local start_dir=$PWD
    local out

    # An empty name would turn "cd" into a no-op and corrupt the tree walk.
    [[ -n $site ]] || return 1

    TABS=$(( TABS + 1 ))
    mkdir -- "$site" 2> /dev/null   # may already exist from an earlier visit
    if ! cd "./$site"; then
      TABS=$(( TABS - 1 ))
      return 1
    fi

    # Record the site as visited once, keeping DONE a valid alternation
    # (no leading '|', which would match every link).
    if [[ -s "$temp/DONE" ]]; then
      printf '|%s' "$site" >> "$temp/DONE"
    else
      printf '%s' "$site" > "$temp/DONE"
    fi

    # NOTE(review): DONE and the site name are used as extended regexes, so
    # matching is by substring and '.' matches any character, as before.
    # The loop runs in a pipeline subshell; the recursion below restores
    # cwd and TABS itself, so every iteration starts from the same state.
    getLinksByURL "$site" \
      | grep -vE -e "$site|$(cat "$temp/DONE")" \
      | while read -r out; do
          [[ -n $out ]] || continue
          (( TABS < MAX_DEPTH )) || continue
          [[ $out != "$site" ]] || continue
          # DONE is re-read each time: the recursion keeps adding to it.
          grep -qE -e "$(cat "$temp/DONE")" <<< "$out" && continue
          [[ -n $GLOBAL_IGNORE ]] && grep -qE -e "$GLOBAL_IGNORE" <<< "$out" && continue
          checkAlexa "$out" || continue
          printf '[%s - %s] >>> %s\n' "$TABS" "${PWD#*"$temp"}" "$out"
          showSiteLinkOut "$out"
        done

    TABS=$(( TABS - 1 ))
    cd -- "$start_dir" || return 1
    return 0
  }

  # Entry point: crawl from the site given as $1, print the resulting
  # directory tree, then remove the scratch directory.
  if [[ -z ${1:-} ]]; then
    echo "usage: $0 <site>" >&2
    exit 1
  fi

  echo "working at $temp"
  echo ">>> TRACE to $1 START"
  # Seed the visited list with the start site (no trailing newline: the file
  # is later used as a '|'-separated pattern).
  printf '%s' "$1" > "$temp/DONE"
  showSiteLinkOut "$1"
  cd "$temp" || exit 1
  if command -v tree > /dev/null; then
    tree -d "$1"
  else
    # Fallback listing when "tree" is not installed.
    find "./$1" -type d
  fi
  # ${temp:?} aborts instead of running "rm -r" on an empty path.
  rm -r -- "${temp:?}"

Reply to "Re: Re: Re: SiteLinkMap"

Here you can reply to the paste above

captcha