越前藩国 Wiki

shikint.rb

最終更新：2007年07月28日 01:18

匿名ユーザー

- view

だれでも歓迎！編集

require 'open-uri'
require 'rubygems'
require 'hpricot'
require 'kconv'
require 'jcode'
require 'net/http'
 
$KCODE='u'
 
 
# Financial-table parser (abstract base class).
#
# Holds the settings shared by every concrete parser (nation number, page URL,
# display name, element selector and keyword) and keeps a registry of every
# parser built through ZaimuParser.create. Subclasses provide #parse.
class ZaimuParser
  # Registry of created parsers, keyed by nation number. A class variable is
  # used deliberately: it is shared with subclasses, so parsers created through
  # any subclass end up in this one hash.
  @@nations = {}

  # Writers for the configurable settings. The matching readers are the
  # combined getter/setter methods defined below.
  attr_writer :no, :url, :name, :xpath, :keyword

  # Read-only state: the values stored by a subclass's #parse and the
  # response stored by #head.
  attr_reader :parsed, :response

  # Creates an unconfigured parser; every setting starts out as nil.
  def initialize
    @no = nil
    @url = nil
    @name = nil
    @xpath = nil
    @keyword = nil
    @parsed = nil
    @response = nil
  end

  # Factory method. Instantiates the receiver class, evaluates the given block
  # in the context of the new object (so the block can call `no 1`,
  # `url "..."` and so on), then registers the object under its nation number.
  #
  # Returns the new parser.
  def self.create(&br)
    parser = new
    parser.instance_eval(&br)
    nations[parser.no] = parser
    parser
  end

  # Returns the hash of registered parsers (nation number => parser).
  def self.nations
    @@nations
  end

  # Fetches the page at @url, converts the body to UTF-8 and parses it into an
  # Hpricot document.
  #
  # The block form of open is used so the underlying stream is closed as soon
  # as the body has been read.
  def read_document
    html = open(@url) { |io| io.read }
    Hpricot(html.toutf8)
  end

  # Combined getter/setter accessors: called with a (truthy) argument they
  # store it, and in every case they return the current value. This is what
  # lets the configuration block passed to .create read like a small DSL.

  def no(val = nil)
    @no = val if val
    @no
  end

  def name(val = nil)
    @name = val if val
    @name
  end

  def url(val = nil)
    @url = val if val
    @url
  end

  def xpath(val = nil)
    @xpath = val if val
    @xpath
  end

  def keyword(val = nil)
    @keyword = val if val
    @keyword
  end

  # Issues an HTTP HEAD request for @url so that only the header information
  # is transferred. The response is stored in @response and also returned.
  #
  # The port is taken from the URL (falling back to the scheme's default)
  # instead of being fixed at 80, and the request target is the full request
  # URI, so a query string is preserved and a URL without a path is requested
  # as "/".
  # NOTE(review): the connection is still plain HTTP; https URLs are not
  # handled here.
  def head
    uri = URI(@url)
    @response = nil
    Net::HTTP.start(uri.host, uri.port) do |http|
      @response = http.head(uri.request_uri)
    end
  end

end
 
# Parser that reads the balance out of a page laid out in the standard
# financial-table format.
class TableParser < ZaimuParser
  attr_accessor :table_index

  # Writers only; the readers are the combined getter/setter methods below.
  attr_writer :row_index, :col_range

  # Sets the defaults (the most common layout among the nations).
  def initialize
    super
    @xpath = 'tr'      # by default, look for the balance among the table's <tr> elements
    @keyword = nil
    @row_index = -1    # index -1 means "the last one" in Ruby
    @col_range = 5..9  # in the clerk-format table, columns 6-10 are funds, resources, food, fuel and entertainment
  end

  # Reads the page, selects the elements matching #xpath (normally <tr>) and
  # returns only those whose text matches #keyword.
  def read_table
    doc = read_document
    (doc / xpath).select { |row| row.inner_text =~ keyword }
  end

  # Returns the single matching row selected by @row_index (usually the last
  # one on the page, which tends to be the latest balance), or nil when no row
  # matched.
  def read_row
    read_table[@row_index]
  end

  # Combined getter/setter accessors: store the argument when one is given,
  # and always return the current value.

  def row_index(val = nil)
    @row_index = val if val
    @row_index
  end

  def col_range(val = nil)
    @col_range = val if val
    @col_range
  end

  # Parses the page and extracts the cells of the selected row, converting
  # full-width digits to ASCII digits along the way.
  #
  # Stores the cells selected by @col_range (funds through entertainment) in
  # @parsed and returns the text of every <td> cell in the row.
  #
  # Raises RuntimeError with a descriptive message when no row matches the
  # keyword, rather than failing on nil further down.
  def parse
    row = read_row
    if row.nil?
      raise "#{self.class}: no row matching #{keyword.inspect} (xpath #{xpath.inspect}) found at #{url}"
    end

    result = (row / 'td').collect { |cell| cell.inner_text.strip.tr('０-９', '0-9') }
    @parsed = result[@col_range]
    result
  end

end
 
# Parser for financial tables written in free format (named after blogs
# because this style is common among nations that use blog-style pages).
class BlogPageParser < ZaimuParser
  # One pattern per value, in output order: funds, resources, food, fuel,
  # entertainment. Each matches the label, an optional full-width colon or
  # full-width space, optional whitespace, and captures the run of ASCII or
  # full-width digits that follows.
  FIELD_PATTERNS = [
    /金[：　]?\s*([０-９\d]+)/,
    /資源[：　]?\s*([\d０-９]+)/,
    /食料[：　]?\s*([\d０-９]+)/,
    /燃料[：　]?\s*([\d０-９]+)/,
    /娯楽[：　]?\s*([\d０-９]+)/
  ].freeze

  def initialize
    super
    @col_range = 0..4
  end

  # Reads the page, takes the text of the elements selected by #xpath and
  # extracts the five amounts from it.
  #
  # Stores the amounts selected by @col_range, converted to integers, in
  # @parsed and returns all five amounts as strings of ASCII digits.
  #
  # Raises RuntimeError with a descriptive message when one of the amounts
  # cannot be found, rather than failing on nil.
  def parse
    doc = read_document             # load the HTML page
    text = (doc / xpath).inner_text # text of the elements holding the financial information

    result = FIELD_PATTERNS.collect do |pattern|
      digits = find_amount(text, pattern)
      if digits.nil?
        raise "#{self.class}: no line matching #{pattern.inspect} found at #{url}"
      end
      digits.tr('０-９', '0-9')
    end

    @parsed = result[@col_range].collect { |c| c.to_i }
    result
  end

  private

  # Returns the digits captured by +pattern+ on the first line of +text+ that
  # matches it, or nil when no line matches. The text is scanned one line at a
  # time so that a label is only paired with a number on the same line.
  def find_amount(text, pattern)
    text.each_line do |line|
      return Regexp.last_match(1) if line =~ pattern
    end
    nil
  end

end
 
#テスト用サンプルプログラム
#for value in ZaimuParser.nations.values.sort_by{|n| n.no}
#  print value.name.tosjis ,":"
#  value.head
#  p value.response['Last-Modified']
#  p value.response.key
#  begin 
#    parsed = value.parse
#  rescue
#    next
#  end
#  print parsed.join(',').to_s.tosjis,"\n"
#  print value.parsed.join(",").tosjis , "\n"
#
#end