-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathharvester.rb
executable file
·107 lines (104 loc) · 3.46 KB
/
harvester.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env ruby
# $Id$
require "fileutils"
require "time"
require "yaml"
require "rubygems"
require "oai"
# require "oai/harvester"
module Fuwatto
  # Harvests records from the OAI-PMH repositories listed in a YAML
  # configuration file and stores every raw response under HARVEST_DIR.
  #
  # The configuration is a mapping of site name => settings. The settings
  # read by this class are 'url', 'set', 'prefix' and 'period'.
  #
  # Responses are written to "harvest/<site>/<epoch>.xml"; follow-up pages
  # obtained through a resumption token go to "harvest/<site>/<epoch>-<n>.xml"
  # with n starting at 2. The <epoch> of the newest first-page file doubles
  # as the record of when the site was last harvested.
  class Harvester
    HARVEST_DIR = "harvest"

    # Minimum number of seconds between two harvests of the same site,
    # keyed by the value of the 'period' setting.
    PERIODS = {
      "daily"  => 60 * 60 * 24,
      "weekly" => 60 * 60 * 24 * 7
    }.freeze

    # Rotation used when 'period' is missing or not a key of PERIODS.
    DEFAULT_PERIOD = PERIODS[ "daily" ]

    attr_reader :sites, :conf

    # config_file:: path of the YAML file describing the sites to harvest.
    def initialize( config_file )
      @conf = YAML.load_file( config_file )
      @sites = @conf.keys
    end

    # Harvest every configured site whose rotation period has elapsed.
    #
    # For each site the request window runs from the last harvest time (or
    # the repository's earliest datestamp on the first run) until now.
    def start
      now = Time.now
      sites.each do |site|
        puts site
        opts = build_options_hash( conf[ site ] )
        # :period is harvester bookkeeping, not a request argument, so it is
        # taken out of the hash before the hash is handed to the OAI client.
        period = PERIODS.fetch( opts.delete( :period ), DEFAULT_PERIOD )
        # NOTE(review): :until is always a full timestamp while :from may be
        # date-only (see #earliest) -- confirm the target repositories accept
        # mixed granularity.
        opts[ :until ] = now.utc.xmlschema
        last_updated = last_harvest_time( site )
        if last_updated.nil? || last_updated == 0
          opts[ :from ] = earliest( opts[ :url ] )
          last_updated = 0
        else
          opts[ :from ] = Time.at( last_updated ).utc.xmlschema
          puts "last updated: #{ opts[ :from ] }"
        end
        puts opts[ :url ]
        if ( now.to_i - last_updated ) < period
          puts "interval skip: last updated at #{ Time.at( last_updated ).iso8601 }"
          next
        end
        p opts
        oai = OAI::Client.new( opts[ :url ], opts )
        opts.delete( :url )
        response = oai.list_records( opts )
        FileUtils.mkdir_p( File.join( HARVEST_DIR, site ) )
        write_response( site, "#{ now.to_i }.xml", response )
        count = 2
        token = response.resumption_token
        # An empty string is truthy in Ruby, so test for emptiness explicitly
        # rather than looping on the bare token.
        until token.to_s.empty?
          puts "resumptionToken: #{ token }"
          response = oai.list_records( :resumptionToken => token )
          write_response( site, "#{ now.to_i }-#{ count }.xml", response )
          count += 1
          token = response.resumption_token
        end
      end
    end

    # Return the epoch seconds of the most recent harvest of +site+, taken
    # from the newest "<digits>.xml" file name in the site's directory.
    # Returns nil (after printing a notice) when no such file exists.
    # Continuation files ("<digits>-<n>.xml") and other names are ignored.
    def last_harvest_time( site )
      stamps = Dir.glob( "#{ HARVEST_DIR }/#{ site }/*.xml" ).map { |path|
        File.basename( path )[ /\A(\d+)\.xml\z/, 1 ]
      }.compact.map { |digits| digits.to_i }
      if stamps.empty?
        puts "Files not found. This is the first harvesting!"
        nil
      else
        stamps.max
      end
    end

    # Get earliest timestamp from repository.
    #
    # Issues an Identify request to +url+ and returns the earliest datestamp
    # as a String: "YYYY-MM-DD" when the repository reports that granularity,
    # otherwise in XML Schema dateTime form.
    def earliest(url)
      identify = OAI::Client.new( url ).identify
      stamp = Time.parse( identify.earliest_datestamp )
      if "YYYY-MM-DD" == identify.granularity
        stamp.strftime( "%Y-%m-%d" )
      else
        stamp.xmlschema
      end
    end

    # Translate one site's configuration entry into an options hash.
    #
    # site:: Hash with 'url' and optionally 'set', 'prefix' and 'period'.
    #
    # Returns a Hash with :url and, for each optional setting present,
    # :set, :metadata_prefix and :period.
    def build_options_hash( site )
      options = { :url => site['url'] }
      options[:set] = site['set'] if site['set']
      options[:metadata_prefix] = site['prefix'] if site['prefix']
      # Without this the 'period' setting never reached #start and every
      # site silently fell back to the daily rotation.
      options[:period] = site['period'] if site['period']
      options
    end

    private

    # Write the XML document of +response+ to HARVEST_DIR/<site>/<file_name>.
    def write_response( site, file_name, response )
      File.open( File.join( HARVEST_DIR, site, file_name ), "w" ) do |io|
        io.print response.doc
      end
    end
  end
end
# Script entry point: when this file is executed directly (rather than
# required), harvest every site described in "harvester.conf".
if __FILE__ == $PROGRAM_NAME
  Fuwatto::Harvester.new( "harvester.conf" ).start
  # Earlier approach based on the harvester bundled with the oai gem,
  # kept for reference:
  # conf = YAML.load_file( "harvester.conf" )
  # crawler = OAI::Harvester::Harvest.new
  # crawler.start
end