Merge branch 'master' of git+ssh://git.sitka.bclibraries.ca/sitka/sitka-tools
[sitka/sitka-tools.git] / marc_import_overdrive / overdrive-import.pl
CommitLineData
38891e1e
JF
1#!/usr/bin/perl
2use strict;
3use warnings;
4
5use lib '/openils/lib/perl5/';
6
7use Error qw/:try/;
8use OpenILS::Utils::Fieldmapper;
9use Digest::MD5 qw/md5_hex/;
10use OpenSRF::Utils::JSON;
11use OpenILS::Application::AppUtils;
12use Data::Dumper;
13use Unicode::Normalize;
14use Encode;
15
16use FileHandle;
17use Time::HiRes qw/time/;
18use Getopt::Long;
19use MARC::Batch;
20use MARC::File::XML ( BinaryEncoding => 'utf-8' );
21use MARC::Charset;
22use DBI;
23
24#MARC::Charset->ignore_errors(1);
25
# Defaults: OpenSRF core config file, fieldmapper IDL file, MARC input format
# and the character encoding MARC::Charset should assume.
my ($config, $idlfile, $marctype, $enc) =
    ('/srv/openils/conf/opensrf_core.xml', '/srv/openils/conf/fm_IDL.xml', 'USMARC', 'utf8');

my (@files, @trash_fields, @req_fields, $quiet, $startid);

# One scoped copy of the base 856 (with this value in $9) is generated per entry.
my @targetorg = ('BPR','BFN','BTE');

# Records lacking any of these tags are skipped; --required_fields adds to the list.
@req_fields = ('856');

# NOTE: $overdrive_prefix is interpolated into regexes as-is, so it is stored
# already escaped.
my $overdrive_prefix = 'http\:\/\/downloads\.bclibrary\.ca\/ContentDetails\.htm\?ID\=';
my $tcn_prefix = "LtG_";

GetOptions(
    'marctype=s'        => \$marctype,      # format of MARC files being processed; defaults to USMARC, often set to XML
    'encoding=s'        => \$enc,           # assumed MARC encoding for MARC::Charset; defaults to utf8
    'config=s'          => \$config,        # location of OpenSRF core config file; defaults to /srv/openils/conf/opensrf_core.xml
    'file=s'            => \@files,         # files to process (or simply list the files as unnamed arguments, i.e. @ARGV)
    'required_fields=s' => \@req_fields,    # skip any records missing these fields
    'trash=s'           => \@trash_fields,  # fields to remove from all processed records
    'xml_idl=s'         => \$idlfile,       # location of XML IDL file; defaults to /srv/openils/conf/fm_IDL.xml
    'startid=i'         => \$startid,       # starting ID
    'quiet'             => \$quiet          # do not output progress count
);

# Both list options accept repeated flags and/or comma-separated values.
@trash_fields = split(/,/,join(',',@trash_fields));
@req_fields = split(/,/,join(',',@req_fields));

if ($enc) {
    MARC::Charset->ignore_errors(1);
    MARC::Charset->assume_encoding($enc);
}

if (uc($marctype) eq 'XML') {
    'open'->use(':utf8');
} else {
    bytes->use();
}

# Fall back to positional arguments when no --file option was given.
@files = @ARGV if (!@files);

Fieldmapper->import(IDL => $idlfile);

# Unbuffer both STDERR and STDOUT, leaving STDOUT selected.
select STDERR; $| = 1;
select STDOUT; $| = 1;

my $batch = MARC::Batch->new( $marctype, @files );
$batch->strict_off();
$batch->warnings_off();

my $starttime = time;
my $rec;
my $count = 0;

# Honour --startid; the original `1 || $startid` always evaluated to 1.
# The record loop increments $id before using it, so the first record
# processed receives this value + 1.
my $id = defined $startid ? $startid : 1;
79
# Main record loop.  For every MARC record in the batch:
#   1. skip it if a required field is missing;
#   2. pick the OverDrive 856 to use as the base URI field and remove it
#      from the record;
#   3. add the custom 538/594/655 fields and adjust the leader;
#   4. insert one copy of the base 856 per target org unit (org in $9);
#   5. drop any --trash fields, add a 901 (TCN value/source/id) and print
#      the formatted record to STDOUT.
PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) {
    # A read that threw leaves the -1 sentinel in $rec; try the next record.
    next PROCESS if (!ref($rec));
    print STDERR "======\n";
    $id++;
    $count++;

    # Skip records that don't contain a required field (like '245', for example)
    foreach my $req_field (@req_fields) {
        if (!$rec->field("$req_field")) {
            warn "\n!!! Record $count missing required field $req_field, skipping record.\n";
            next PROCESS;
        }
    }

# -----------------
# Overdrive - specific code
# -----------------

    my $tcn_value;
    my $tcn_source = 'Library To Go';

    # Genre caption for the 655; switched to EBOOK below when the 856 $z says so.
    my $caption = 'DOWNLOADABLE AUDIOBOOK';

    # this is the base 856 field we're going to generate separate fields for each org unit we're scoping at
    my $baseurifield;

    # check all 856s
    URIFIELD: foreach my $uri ($rec->field('856')) {

        # Overdrive uses a $3 for Excerpts, we want to keep this intact so carry on then
        next URIFIELD if ($uri->subfield('3'));

        # we need a $u for a URL, if we don't have this it is bad
        my $url = $uri->subfield('u');
        if (!$url) {
            warn "856 has no URL in rec $id. Skipping";
            # Actually skip: the original fell through and matched against undef.
            next URIFIELD;
        }

        # this record has been through Evergreen if a $9 exists somewhere
        if ($uri->subfield('9')) {
            # if this doesn't match our prefix, ignore it, we only care about our current prefix
            next URIFIELD unless ($url =~ m/($overdrive_prefix)/);

            # if it does: use a copy of this field, minus its old $9, as the
            # base URI field.  Without stripping $9 every scoped copy would
            # carry the stale org unit in addition to the new one.
            $baseurifield = $uri->clone;
            $baseurifield->delete_subfield(code => '9');
            $rec->delete_fields($uri);

            # NOTE(review): $tcn_value is not set on this path, so the 901 $a
            # added below is undefined for such records -- confirm whether the
            # TCN should be taken from the existing 901 or derived from the URL.
            next URIFIELD;
        }

        if ($uri->subfield('z')) {
            $caption = 'EBOOK' if ($uri->subfield('z') =~ /Book/);
            $uri->delete_subfield(code => 'z');
            $uri->add_subfields('z' => 'Click to access online (library card required)');

            next URIFIELD unless ($url =~ m/($overdrive_prefix)/);

            # trim out Overdrive's magical GUID-looking ID thingy
            my $overdrivekey = $url;
            $overdrivekey =~ s/($overdrive_prefix)//g;

            # make it TCN-ish
            $tcn_value = $tcn_prefix . $overdrivekey;

            # we have a base for our scoped fields
            $baseurifield = $uri->clone;

            $rec->delete_fields($uri);
        }
    }

    # Nothing to scope: abort with enough context to find the offending record.
    if (!$baseurifield) {
        die "Record $count (id $id): no usable OverDrive 856 field found\n";
    }

    # add some arbitrary stuff as prescribed by our cataloguer overlords
    $rec = adjust_leader($rec);
    # Pass the caption along; the original omitted it, leaving 655 $a undefined.
    $rec = process_custom_fields($rec, $caption);

    # add a scoped field for each org unit in our array
    foreach my $org (@targetorg) {
        my $newfield = $baseurifield->clone();
        $newfield->add_subfields('9' => $org);
        $rec->insert_fields_ordered($newfield);
    }

# -----------------
# END Overdrive - specific code
# -----------------

    $rec->delete_field($_) for ($rec->field(@trash_fields));

    my $field901 = MARC::Field->new(
        '901' => ('', ''),
        a => $tcn_value,
        b => $tcn_source,
        c => $id,
    );

    $rec->insert_fields_ordered($field901);

    print $rec->as_formatted();

    next PROCESS;

    # ------------------------------------------------------------------
    # NOTE: everything below is unreachable because of the unconditional
    # `next PROCESS` above (so --quiet currently has no effect).  It builds a
    # biblio::record_entry object from the record but never emits it.  Kept
    # for reference; note that re-enabling it as-is would overwrite the
    # running $id counter from the 901 $c.
    # ------------------------------------------------------------------

    $tcn_value = $rec->subfield('901' => 'a');
    $tcn_source = $rec->subfield('901' => 'b');
    $id = $rec->subfield('901' => 'c');

    (my $xml = $rec->as_xml_record()) =~ s/\n//sog;
    $xml =~ s/^<\?xml.+\?\s*>//go;
    $xml =~ s/>\s+</></go;
    $xml =~ s/\p{Cc}//go;
    $xml = OpenILS::Application::AppUtils->entityize($xml);
    $xml =~ s/[\x00-\x1f]//go;

    my $bib = Fieldmapper::biblio::record_entry->new;
    $bib->id($id);
    $bib->active('t');
    $bib->deleted('f');
    $bib->marc($xml);
    $bib->creator(0);
    $bib->create_date('now');
    $bib->editor(0);
    $bib->edit_date('now');
    $bib->tcn_source($tcn_source);
    $bib->tcn_value($tcn_value);
    $bib->last_xact_id('IMPORT-'.$starttime);

    #print OpenSRF::Utils::JSON->perl2JSON($bib)."\n";

    if (!$quiet){# && !($count % 50)) {
        print STDERR "\r$count\t". $count / (time - $starttime);
    }
}
224
# Replace the character at offset 5 of the record's leader with 'm'.
#
# Takes a record object exposing a leader() getter/setter, rewrites its
# leader in place and returns the same object.
#
# NOTE(review): in MARC 21 offset 5 is the record-status byte while
# type-of-record is offset 6 -- confirm that offset 5 is really intended.
sub adjust_leader {
    my ($record) = @_;

    my $old_leader = $record->leader();
    my $head = substr($old_leader, 0, 5);   # bytes 0-4, kept verbatim
    my $tail = substr($old_leader, 6);      # everything after the replaced byte
    $record->leader(join('', $head, 'm', $tail));

    return $record;
}
232
# Add the fixed OverDrive note fields and a genre heading to a record.
#
# Arguments:
#   $rec     - record object supporting insert_fields_ordered()
#   $caption - text placed in 655 $a
#
# Inserts, in one call, a 538 and a 594 (both with blank indicators) and a
# 655 with second indicator '4', then returns the same record object.
sub process_custom_fields {
    my ($rec, $caption) = @_;

    # tag, indicator 1, indicator 2, then subfield code/value pairs
    my @field_specs = (
        [ '538', ' ', ' ', a => "Requires OverDrive Media Console" ],
        [ '594', ' ', ' ', a => "Library To Go" ],
        [ '655', ' ', '4', 'a' => $caption ],
    );

    my @custom_fields = map { MARC::Field->new(@{$_}) } @field_specs;

    $rec->insert_fields_ordered(@custom_fields);
    return $rec;
}
256