Building a crawler with EventMachine
https://github.com/modeverv/animation-crawler
I built a crawler with EventMachine.
It crawls pages in a fixed access order, so it is not the most naive kind of crawler,
but I think it came out fairly clean.
EventMachine is fast.
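
Before the full source, here is a minimal sketch of the em-http pattern the whole crawler is built on (the URL is just a placeholder): each GET returns a deferrable whose callback/errback fire without blocking the reactor, which is what lets one process keep many fetches in flight at once.

```ruby
# Minimal em-http sketch: a non-blocking GET with callback/errback.
require 'eventmachine'
require 'em-http'

EM.run do
  req = EM::HttpRequest.new('http://example.com/', :connect_timeout => 50).get
  req.errback { puts 'request failed'; EM.stop }
  req.callback do
    puts "status #{req.response_header.status}, #{req.response.bytesize} bytes"
    EM.stop
  end
end
```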
- The URI library is monkey-patched so that URLs containing `|` can be parsed (a sketch follows this list)
- I think it also serves as a sample of how to use EventMachine
- The final download step is done with curl
- Clone it, tweak the download path, and register it with cron (see the crontab sketch at the end of this post)
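
On the first point: the bundled `./uri.rb` is a modified copy of the standard library, so the snippet below is only a sketch of the same idea, assuming a 1.9/2.x-era Ruby where `URI.parse` delegates to `URI::DEFAULT_PARSER`. It widens the unreserved character set so that links containing `|` no longer raise `URI::InvalidURIError`.

```ruby
# Sketch only: the repo's ./uri.rb patches the stdlib differently.
# Rebuild the default RFC 2396 parser with "|" added to the
# unreserved set, then swap it in for URI.parse to pick up.
require 'uri'

patched = URI::Parser.new(:UNRESERVED => "-_.!~*'()a-zA-Z\\d|")
URI.send(:remove_const, :DEFAULT_PARSER)
URI.const_set(:DEFAULT_PARSER, patched)

URI.parse('http://example.com/a|b') # no longer raises
```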
```ruby
require 'pp'
require './uri.rb' # the bundled URI module is modified to accept "|" in URLs
require 'eventmachine'
require 'em-http'
require 'nokogiri'
require 'open-uri'
require 'httpclient'
require 'mechanize'
require 'net/http'

class Crawler
  DOWNLOADDIR = "/var/smb/sdd1/video"
  START_URL = 'http://youtubeanisoku1.blog106.fc2.com/'
  AGENT = Mechanize.new
  CONCURRENCY = 128
  WATCH_INTERVAL = 1
  MEGA = (1024 * 1024).to_f
  # job kinds, one per page type in the crawl order
  JOB_ANISOKUTOP  = 'アニ速TOP'
  JOB_KOUSINPAGE  = '更新ページ'
  JOB_KOBETUPAGE  = '個別ページ'
  JOB_NOSUBSEARCH = "nosub検索ページ"
  JOB_NOSUBVIDEO  = "nosubビデオページ"
  JOB_CONVERT     = "flv2mp4"

  # constructor
  # hash[:ffmpeg]  convert to mp4 or not
  # hash[:debug]   debug output or not
  # hash[:usecurl] download via curl instead of em-http
  def initialize hash
    @queue = []
    @queue.push({ kind: JOB_ANISOKUTOP, value: START_URL })
    @fetching = 0   # number of requests currently in flight
    @downloads = {} # path => progress/status string
    @ffmpeg  = hash[:ffmpeg]  || false
    @debug   = hash[:debug]   || false
    @usecurl = hash[:usecurl] || false
    @gaman = 20     # "patience": stop after this many consecutive idle ticks
  end

  def run
    EM.run do
      EM.add_periodic_timer(WATCH_INTERVAL) do
        # top up the in-flight requests to CONCURRENCY
        diff = CONCURRENCY - @fetching
        diff.times do
          job = @queue.pop
          break unless job
          process job
        end
        if @fetching == 0
          @gaman -= 1
          puts "fetching:#{@fetching} gaman:#{@gaman}"
          if @gaman == 0
            puts "finish"
            pp @downloads
            EM::stop_event_loop
          end
        else
          puts "fetching:#{@fetching}"
          pp @downloads
        end
      end
    end
  end

  def process job
    @fetching += 1
    case job[:kind]
    when JOB_ANISOKUTOP
      anisokutop job[:value]
    when JOB_KOUSINPAGE
      anisokukousin job[:value]
    when JOB_KOBETUPAGE
      anisokukobetu job[:value]
    when JOB_NOSUBSEARCH
      nosubsearch job[:value]
    when JOB_NOSUBVIDEO
      nosubvideo job[:value]
    when JOB_CONVERT
      convert job[:value]
    end
  end

  # anisoku top: find the link to the update-status page
  def anisokutop url
    req = EM::HttpRequest.new(url, :connect_timeout => 50).get
    req.errback { @fetching -= 1 }
    req.callback do
      page = Nokogiri::HTML(req.response)
      page.css(".Top_info div ul li a").each { |a|
        if a.attributes['title'].value =~ /更新状況/
          @queue.push({ kind: JOB_KOUSINPAGE, value: a.attributes['href'].value })
        end
      }
      @fetching -= 1
    end
  end

  # anisoku update page: queue every per-title entry page
  def anisokukousin url
    req = EM::HttpRequest.new(url, :connect_timeout => 50).get
    req.errback { @fetching -= 1 }
    req.callback do
      page = Nokogiri::HTML(req.response)
      page.css(".article ul li a").each { |a|
        href = ""
        href = a.attributes["href"].value unless a.attributes["href"].nil?
        if href =~ /^http:\/\/youtubeanisoku1\.blog106\.fc2\.com\/blog-entry-....\.html$/
          title = a.attributes["title"].value if a.attributes["title"]
          if title
            puts title + "-" + href if @debug
            title = title.gsub(" ", "").gsub("/", "").gsub("-", "")
            @queue.push({ kind: JOB_KOBETUPAGE, value: { title: title, href: href } })
          end
        end
      }
      @fetching -= 1
    end
  end

  # anisoku entry page: pick up links to nosub search pages
  def anisokukobetu value
    req = EM::HttpRequest.new(value[:href], :connect_timeout => 50).get
    req.errback { @fetching -= 1 }
    req.callback do
      page = Nokogiri::HTML(req.response)
      page.css("a").each { |a|
        href = ""
        href = a.attributes["href"].value unless a.attributes["href"].nil?
        if href =~ /^http:\/\/www\.nosub\.tv\/\?s=/
          puts value[:title] + "-" + href if @debug
          @queue.push({ kind: JOB_NOSUBSEARCH, value: { title: value[:title], href: href } })
        end
      }
      @fetching -= 1
    end
  end

  # nosub search page: collect all episode pages for a title
  def nosubsearch value
    urls = []
    req = EM::HttpRequest.new(value[:href], :connect_timeout => 50).get
    req.errback { @fetching -= 1 }
    req.callback do
      page = Nokogiri::HTML(req.response)
      page.css(".title a[rel='bookmark']").each { |a|
        href = ""
        href = a.attributes["href"].value unless a.attributes["href"].nil?
        episode = a.attributes["title"].value
                   .gsub(" ", "").gsub("/", "").gsub(" ", "").gsub("-", "").gsub("#", "")
        puts value[:title] + "-" + episode + "-" + href if @debug
        unless episode =~ /アニメPV集/
          urls << { title: value[:title], episode: episode, href: href }
        end
      }
      @queue.push({ kind: JOB_NOSUBVIDEO, value: urls })
      @fetching -= 1
    end
  end

  # nosub video page: resolve the real video URL out of the player script
  def nosubvideo value
    fetched = false
    value.each { |val|
      path = mkfilepath val[:title], val[:episode]
      if File.exists?(path) || File.exists?(path + ".mp4")
        @fetching -= 1
        return
      end
      break if fetched
      @fetching += 1
      req = EM::HttpRequest.new(val[:href], :connect_timeout => 50).get
      req.errback { @fetching -= 1 }
      req.callback do
        page = Nokogiri::HTML(req.response)
        page.css("script[type='text/javascript']").each { |script|
          next unless script.children[0] && script.children[0].to_s =~ /MukioPlayerURI/
          lines = script.children[0].to_s.gsub("\n", "").split(";")
          lines.each { |l|
            next unless l =~ /addVideo/
            l =~ /type=(.*?)&/
            url = case $1
                  when "fc2"
                    # e.g. "type=fc2&vid=20140106PVrVWc2X&cid=msWVIDBFIbAghSVggCCFVpb0pjZFpWPQyIA19b10","360pFC2","",1);
                    l =~ /vid=(.*?)&/
                    u = "https://www.nosub.tv/wp-content/plugins/mukiopress//lianyue/?/fc2/#{$1}"
                    x = false
                    open(u) { |res| x = res.read } # NOTE: open-uri blocks the reactor here
                    x
                  when "video"
                    l =~ /file=(.*?)&/
                    # e.g. http://www.nosub.tv/wp-content/plugins/mukiopress/lianyue/?/url/XBCAVbX1ZVVVUGXB1RTEYVRl9JGF9VUAUDUVUAGRdBHFhEAA4LEgRLWRZWFgkLSlwRA1pFGzVaXQ0kVl5SAwYJGTAJDA0gC19UAA0IHAhFUQYt4F4CcB
                    clnt = HTTPClient.new
                    res = clnt.get($1)
                    x = res.header['Location']
                    x == [] ? false : x[0]
                  when "youtube"
                    false
                  when "qq"
                    false
                  else
                    false
                  end
            puts "#{url} - #{l}" if @debug
            checksize = checkvideourl url if url
            if checksize
              downloadvideo url, path, checksize
              fetched = true
            end
            break if fetched
          }
        }
        @fetching -= 1
      end
    }
    @fetching -= 1
  end

  # HEAD the video URL (following redirects) and return its
  # content-length, or false if it does not look downloadable
  def checkvideourl url
    check = false
    puts "checkvideo url: #{url}" if @debug
    begin
      http = Net::HTTP.new(URI.parse(url).host)
      res = http.request_head(URI.parse(url))
      if res['location']
        return checkvideourl res['location']
      elsif res['content-length'].to_i > 1000
        check = res['content-length'].to_i
      else
        check = false
      end
    rescue => ex
      puts ex.inspect + " url:#{url}"
      check = false
    end
    puts "checkvideo url: #{url} check: #{check}" if @debug
    check
  end

  def downloadvideo url, path, size
    downloaded = 0
    return if File.exists?(path) || File.exists?(path + ".mp4")
    puts "download start: #{url} - #{path}"
    @downloads[path] = "start"
    fetched = false
    begin
      if @usecurl
        @fetching += 1
        command = "curl -# -L -R -o '#{path}' '#{url}' &"
        puts command
        system command
        @fetching -= 1
        @downloads[path] = "complete"
        return
      end
      command = "touch '#{path}'"
      system command
      @fetching += 1
      file = open(path, "w+b")
      http = EM::HttpRequest.new(url, :connect_timeout => 50)
               .get({ :redirects => 10, :head => { "accept-encoding" => "gzip, compressed" } })
      http.errback { |client|
        @downloads[path] = "error"
        p "download error: #{path} #{client.inspect}"
        file.close
        @fetching -= 1
        command = "rm -f '#{path}'"
        system command
      }
      http.callback {
        file.close
        unless http.response_header.status == 200
          puts "failed with response code #{http.response_header.status}"
        end
        @downloads[path] = "complete"
        puts "download complete: #{path}"
        @fetching -= 1
        @queue.push({ kind: JOB_CONVERT, value: path }) if @ffmpeg
        fetched = true
      }
      http.headers do |hash|
        p [:headers, hash]
      end
      http.stream do |chunk|
        downloaded += chunk.length
        puts "#{File.basename path} : #{chunk.length}" if @debug
        if size > 0
          @downloads[path] = "download #{(downloaded / MEGA).round}M / #{(size.to_f / MEGA).round}M #{(downloaded.to_f / size.to_f * 100.0).round(2)}%"
        end
        file.write chunk
      end
      # AGENT.pluggable_parser.default = Mechanize::Download
      # AGENT.get(url).save(path)
    rescue => ex
      p ex
      fetched = false
    end
    fetched
  end

  # convert flv to mp4, then remove the flv
  def convert value
    command = "ffmpeg -i '#{value}' -vcodec mpeg4 -r 23.976 -b 600k -ac 2 -ar 44100 -ab 128k -strict experimental '#{value}.mp4'"
    puts command
    system command
    command = "rm -f '#{value}'"
    system command
    @fetching -= 1 # this job is synchronous, so release the in-flight slot here
  end

  def mkfilepath title, episode
    mkdirectory title
    DOWNLOADDIR + "/" + title + "/" + episode + ".flv"
  end

  def mkdirectory title
    Dir.mkdir DOWNLOADDIR + "/" + title
  rescue => ex
    # directory already exists
  end
end

# on a fast server it is probably better to convert to mp4 as you go (ffmpeg: true)
Crawler.new(ffmpeg: false, debug: false, usecurl: true).run
```
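
As for the cron setup, an entry along these lines would do. The clone location, script name (`crawler.rb`), and schedule here are all placeholders, so adjust them to your environment:

```
# hypothetical crontab entry: path, file name and schedule are placeholders
0 4 * * * cd $HOME/animation-crawler && ruby crawler.rb >> crawl.log 2>&1
```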