#! /usr/bin/ruby -Ku
# -*- coding: utf-8 -*-
#####
## DBLP Analyzer
##
## Reads ./dblp.xml line by line, counts papers per author and per
## conference/journal, and writes three files into the current directory:
## an author list, a conference list, and an author x conference
## co-occurrence matrix.
##
## % ./parse2.rb [option] [start_n]
##   option: -t { inproceedings | proceedings | journal | collections }
##           -a num   % threshold on the number of papers per author
##           -c num   % threshold on the number of papers per conference
##           -s year  % first year of data to collect (inclusive)
##           -e year  % last year of data to collect (exclusive)
##   start_n: number of leading records to skip (defaults to 0)
## Note: only "-t inproceedings" has been confirmed to work.
#####

require 'uconv'
require 'rexml/document'
require 'optparse'

# The file below must be set to the location where the program is installed.
# NOTE(review): presumably this is what defines $localConfTable,
# $appendConfTable and $shortnameTable used further down -- confirm.
require '/home/kshinoda/Work/Program/ruby/ConferenceNetwork/tools/DataTable'

# Tally tables. A default of 0 lets callers write `table[key] += 1` directly.
@CooccurrenceRC = Hash.new(0)   # "conference:author" => number of papers
@ConferenceList = Hash.new(0)   # conference name     => number of papers
@AuthorList     = Hash.new(0)   # author name         => number of papers

$thr_auth = 10       # authors with fewer papers than this are dropped
$thr_conf = 100      # conferences with fewer papers than this are dropped
$start_year = -1
$end_year = -1
$yearp = false       # true once a valid -s/-e year range has been given

# Closing-tag patterns, checked in this order, and the record type that each
# one dispatches to in parseRecord.
RECORD_CLOSERS = [
  [/<\/incollection/,  "collection"],
  [/<\/inproceedings/, "inproceedings"],
  [/<\/proceedings/,   "proceedings"],
  [/<\/article/,       "article"]
]

# Replaces the character entities that REXML leaves unexpanded in a venue
# name with the characters themselves.
#
# NOTE(review): as found, every substitution except the trademark one was an
# identity replacement (e.g. "ü" -> "ü", "&" -> "&"), i.e. a no-op. The
# search strings were restored to the named entities on the assumption that
# they had been decoded by a markup-stripping step -- confirm against the
# pristine file. The " ™" literal is kept exactly as found because its
# original form is unclear. `sub` (first occurrence only) is kept for
# "&amp;" so that existing table keys keep matching.
#
# name:: venue name as extracted from the XML
# Returns a new String with the entities replaced.
def decodeEntities(name)
  name.gsub("&uuml;", "ü").gsub("&eacute;", "é").sub("&amp;", "&").
       gsub("&szlig;", "ß").gsub("&Ouml;", "Ö").gsub("&reg;", "®").
       gsub("&auml;", "ä").gsub("&ouml;", "ö").gsub("&egrave;", "è").
       gsub(" ™", "TM").gsub("&iacute;", "í")
end

# Counts every author found at +xpath+ in +paper+, and the pairing of that
# author with +conference+.
#
# paper::      REXML::Document holding a single record
# xpath::      element path of the author elements
# conference:: venue name the record was attributed to
def recordAuthors(paper, xpath, conference)
  paper.elements.each(xpath) { |item|
    author = item.get_text.to_s.chomp(" ")
    @AuthorList[author] += 1
    @CooccurrenceRC["#{conference}:#{author}"] += 1
  }
end

# Tallies one <incollection> record.
#
# src:: XML text of the record
def parseCollection(src)
  paper = REXML::Document.new(src)
  conference = ""
  paper.elements.each("incollection/booktitle") { |item|
    # Drop a "(...)" part, then the blank it leaves behind. The original used
    # `chop`, which also ate the last letter of titles without parentheses.
    conference = item.get_text.to_s.sub(/\(.+\)/, "").chomp(" ")
    STDERR.puts conference
    @ConferenceList[conference] += 1
  }
  recordAuthors(paper, "incollection/author", conference)
end

# Tallies one <inproceedings> record, honouring the year range when $yearp
# is set ($start_year inclusive, $end_year exclusive).
#
# src:: XML text of the record
def parseInProceedings(src)
  paper = REXML::Document.new(src)

  if $yearp
    year_node = paper.elements["inproceedings/year"]
    year_text = year_node && year_node.text
    # A record without a usable <year> cannot be placed in the range.
    return if year_text.nil?
    year = year_text.gsub("*", "").to_i
    return if year < $start_year || year >= $end_year
  end

  conference = ""
  paper.elements.each("inproceedings/booktitle") { |item|
    conference = item.get_text.to_s.sub(/\(.+\)/, "").sub(/(Vol\..+)/, "").gsub(",", "").chomp(" ")
    conference = decodeEntities(conference)
    # Exclude local conferences. `next` only skips the conference tally: the
    # authors below are still counted against this name.
    next if $localConfTable.has_key?(conference)
    conference = $appendConfTable[conference] if $appendConfTable.has_key?(conference)
    if conference == ""
      puts "this is miss: #{item.get_text}"
      exit
    end
    @ConferenceList[conference] += 1
  }
  recordAuthors(paper, "inproceedings/author", conference)
end

# Tallies one <proceedings> record.
#
# src:: XML text of the record
def parseProceedings(src)
  paper = REXML::Document.new(src)
  conference = ""
  paper.elements.each("proceedings/booktitle") { |item|
    conference = item.get_text.to_s.chomp(" ")
    @ConferenceList[conference] += 1
  }
  recordAuthors(paper, "proceedings/author", conference)
end

# Tallies one <article> record; the journal name plays the conference role.
#
# src:: XML text of the record
def parseArticle(src)
  paper = REXML::Document.new(src)
  conference = ""
  paper.elements.each("article/journal") { |item|
    conference = item.get_text.to_s.chomp(" ")
    @ConferenceList[conference] += 1
  }
  recordAuthors(paper, "article/author", conference)
end

# Redraws a one-line progress bar on STDERR.
#
# label:: text shown in front of the bar
# done::  number of entries processed so far
# total:: number of entries to process
def showProgress(label, done, total)
  ratio = total > 0 ? done.to_f / total.to_f : 1.0
  bar = "*" * (ratio * 40).to_i
  # "%%" prints a literal percent sign; a lone trailing "%" is rejected by
  # current Ruby versions.
  line = sprintf("%-12s|%-40s| %7d/%7d %3d%%", label, bar, done, total, ratio * 100)
  STDERR.print(Uconv.u8toeuc("\x0D#{line}"))
end

# Builds an output file name; the year range is appended when it is active.
#
# prefix::     leading part of the file name
# interval::   label identifying the run
# thresholds:: Array of threshold values to embed in the name
def outputFileName(prefix, interval, thresholds)
  name = "#{prefix}-#{interval}-#{thresholds.join('-')}"
  name = "#{name}-#{$start_year}_#{$end_year}" if $yearp
  "#{name}.dat"
end

# Writes the entries of +table+ that reach +threshold+ to +path+ as
# `"name",count` lines and deletes the others from +table+. The block maps a
# key to the name that is written.
#
# path::      output file name
# table::     tally Hash (modified in place)
# threshold:: minimum count an entry needs to be kept
# label::     text for the progress bar
def writeFilteredList(path, table, threshold, label)
  total = table.size
  seen = 0
  File.open(path, "w") { |out|
    # `keys` is a snapshot, so deleting from the table while looping is safe.
    table.keys.each { |key|
      if table[key] < threshold
        table.delete(key)
      else
        out.puts "\"#{yield(key)}\",#{table[key]}"
      end
      # Progress follows processed entries so that the bar reaches 100%.
      seen += 1
      showProgress(label, seen, total) if seen % 10 == 0 || seen == total
    }
  }
end

# Writes the author x conference matrix for the entries still present in
# @AuthorList and @ConferenceList.
#
# path:: output file name
def writeCooccurrence(path)
  conferences = @ConferenceList.keys
  total = @AuthorList.size
  done = 0
  File.open(path, "w") { |out|
    out.print "\"\","
    conferences.each { |conf| out.print "\"#{decodeEntities(conf)}\"," }
    out.print "\n"
    @AuthorList.keys.each { |author|
      out.print "\"#{author}\","
      conferences.each { |conf|
        key = "#{conf}:#{author}"
        # Pairs that never occurred read as 0 through the table's default.
        out.print "#{@CooccurrenceRC[key]},"
      }
      out.print "\n"
      done += 1
      showProgress("共起データ", done, total) if done % 100 == 0 || done == total
    }
  }
end

# Writes the author list, the conference list and the co-occurrence matrix,
# then clears all three tally tables.
#
# interval:: label embedded in the output file names
def outputData(interval)
  STDERR.puts Uconv.u8toeuc("\nデータ出力中")

  ## Author data
  STDERR.puts Uconv.u8toeuc("著者データ出力中: #{@AuthorList.size}")
  authFileName = outputFileName("AuthorListDBLP", interval, [$thr_auth])
  writeFilteredList(authFileName, @AuthorList, $thr_auth, "著者データ") { |key| key }

  ## Conference data
  STDERR.puts Uconv.u8toeuc("\n会議データ出力中: #{@ConferenceList.size}")
  confFileName = outputFileName("ConferenceListDBLP", interval, [$thr_conf])
  writeFilteredList(confFileName, @ConferenceList, $thr_conf, "会議データ") { |key|
    $shortnameTable.has_key?(key) ? $shortnameTable[key] : key
  }

  ## Co-occurrence data
  # The count shown is the number of rows to be written (the remaining
  # authors); the original printed the stale conference count here.
  STDERR.puts Uconv.u8toeuc("\n共起データ出力中: #{@AuthorList.size}")
  coRCFileName = outputFileName("CooccurrenceRC", interval, [$thr_auth, $thr_conf])
  writeCooccurrence(coRCFileName)
  STDERR.print "\n"

  @AuthorList = Hash.new(0)
  @ConferenceList = Hash.new(0)
  # The original assigned to the misspelt @CooccurenceRC, so this table was
  # never actually cleared.
  @CooccurrenceRC = Hash.new(0)
end

# Maps the -t option value to the opening-tag text that starts a record.
# Returns nil for an unknown value.
#
# NOTE(review): only the "inproceedings" entry survived in the source as
# found; the other entries are reconstructed from the option list in the
# file header and from the record types the scanner dispatches on -- confirm.
#
# type:: value given to -t
def tagNameFor(type)
  case type
  when "inproceedings"                          then "<inproceedings"
  when "proceedings"                            then "<proceedings"
  when "journal", "article"                     then "<article"
  when "collections", "collection", "incollection" then "<incollection"
  end
end

# Validates the -s/-e values and sets $yearp when a year range is in force.
# Terminates the program when the range is inconsistent.
def validateYearRange
  # NOTE(review): the line opening this first check was damaged in the source
  # as found; `$start_year > 1970` is reconstructed from the surviving
  # "> 1970" and from the parallel check below -- confirm.
  if $start_year > 1970
    if $end_year < $start_year
      STDERR.puts "ERROR: end_year: #{$end_year}"
      exit 1
    end
    $yearp = true
  end
  if $end_year > 1975
    if $start_year < 1970
      STDERR.puts "ERROR: start_year: #{$start_year}"
      exit 1
    end
    $yearp = true
  end
end

# Hands one complete record to the parser for its type. Unknown types are
# ignored.
#
# type:: record type from RECORD_CLOSERS
# xml::  XML text of the record
def parseRecord(type, xml)
  case type
  when "collection"    then parseCollection(xml)
  when "inproceedings" then parseInProceedings(xml)
  when "proceedings"   then parseProceedings(xml)
  when "article"       then parseArticle(xml)
  end
end

# Scans the file at +path+ line by line, collects the lines of every record
# that starts with +tagname+ and parses each record once its closing tag is
# seen. Assumes opening and closing tags sit on separate lines.
#
# NOTE(review): the opening-tag branch was damaged in the source as found
# (only "... start_n" survived). Counting each opening tag and collecting
# once the count exceeds start_n is a reconstruction -- confirm.
#
# path::    XML file to read
# tagname:: opening-tag text that starts a record of interest
# start_n:: number of leading records to skip
# Returns the type of the last record parsed, or nil when there was none.
def scanDblp(path, tagname, start_n)
  opener = Regexp.new(Regexp.escape(tagname))
  last_type = nil
  count = 0
  storep = false      # true while the lines of a record are being collected
  paper_xml = ""

  File.open(path) { |file|
    while line = file.gets
      closer = storep ? RECORD_CLOSERS.find { |pair| line =~ pair[0] } : nil
      if closer
        paper_xml << line
        storep = false
        last_type = closer[1]
        parseRecord(last_type, paper_xml)
        paper_xml = ""
        if count % 1000 == 0 && count != 0
          STDERR.puts count
        elsif count % 100 == 0 && count != 0
          STDERR.print "*"
        end
      elsif line =~ opener
        count += 1
        storep = true if count > start_n
        paper_xml << line if storep
      elsif storep
        paper_xml << line
      end
    end
  }
  last_type
end

## Main script

type = ""
opt = OptionParser.new
opt.on("-t type")       { |v| type = v }
opt.on("-s start_year") { |v| $start_year = v.to_i }
opt.on("-e end_year")   { |v| $end_year = v.to_i }
opt.on("-a author_thr") { |v| $thr_auth = v.to_i }
opt.on("-c conf_thr")   { |v| $thr_conf = v.to_i }
opt.parse!(ARGV)

# nil.to_i is 0, so a missing argument means "skip nothing".
start_n = ARGV.shift.to_i

STDERR.puts "Type: #{type}"
@tagname = tagNameFor(type)
if @tagname.nil?
  STDERR.puts "ERROR: type: #{type}"
  exit 1
end

validateYearRange

last_type = scanDblp('dblp.xml', @tagname, start_n)

# The file names carry the type of the last record parsed, falling back to
# the option value when nothing was parsed.
outputData(last_type || type)