Fix chomp.
[sitka/sitka-tools.git] / marc_export_custom / marc_export_custom
CommitLineData
456d236e
JF
1#!/usr/bin/perl
2# vim:et:sw=4:ts=4:
3use strict;
4use warnings;
5use bytes;
6
7use OpenSRF::System;
8use OpenSRF::EX qw/:try/;
9use OpenSRF::AppSession;
10use OpenSRF::Utils::JSON;
11use OpenSRF::Utils::SettingsClient;
12use OpenILS::Application::AppUtils;
13use OpenILS::Utils::Fieldmapper;
14use OpenILS::Utils::CStoreEditor;
15
16use MARC::Record;
17use MARC::File::XML;
18use UNIVERSAL::require;
19
20use Time::HiRes qw/time/;
21use Getopt::Long;
22
23use Config::Simple;
24use Data::Dumper;
25
26my @formats = qw/USMARC UNIMARC XML BRE ARE/;
27
28my ($config,$format,$encoding,$location,$dollarsign,$idl,$help,$holdings,$timeout,$export_mfhd,$type,$all_records) = ('/openils/conf/opensrf_core.xml','USMARC','MARC8','','$',0,undef,undef,0,undef,'biblio',undef);
48566282 29my ($exclusion_ini,$collapse_to_depth, $output_file);
456d236e 30my $cfg;
5eb3fcb8 31my $force901;
456d236e
JF
32
33GetOptions(
34 'help' => \$help,
35 'items' => \$holdings,
36 'mfhd' => \$export_mfhd,
37 'all' => \$all_records,
38 'location=s' => \$location,
39 'money=s' => \$dollarsign,
40 'config=s' => \$config,
41 'format=s' => \$format,
42 'type=s' => \$type,
43 'xml-idl=s' => \$idl,
44 'encoding=s' => \$encoding,
45 'timeout=i' => \$timeout,
5eb3fcb8 46 'force901' => \$force901,
456d236e
JF
47 'exclusion_ini=s' => \$exclusion_ini,
48 'collapse_to_depth=i' => \$collapse_to_depth,
48566282 49 'output-file=s' => \$output_file,
456d236e
JF
50);
51
52$cfg = new Config::Simple($exclusion_ini) if ($exclusion_ini);
53
54if ($help) {
55print <<"HELP";
56This script exports MARC authority, bibliographic, and serial holdings
57records from an Evergreen database.
58
59Input to this script can consist of a list of record IDs, with one record ID
60per line, corresponding to the record ID in the Evergreen database table of
61your requested record type.
62
63Alternately, passing the --all option will attempt to export all records of
64the specified type from the Evergreen database. The --all option starts at
65record ID 1 and increments the ID by 1 until the largest ID in the database
66is retrieved. This may not be very efficient for databases with large gaps
67in their ID sequences.
68
69Usage: $0 [options]
70 --help or -h This screen.
71 --config or -c Configuration file [/openils/conf/opensrf_core.xml]
72 --format or -f Output format (USMARC, UNIMARC, XML, BRE, ARE) [USMARC]
73 --encoding or -e Output encoding (UTF-8, ISO-8859-?, MARC8) [MARC8]
74 --xml-idl or -x Location of the IDL XML
75 --timeout Timeout for exporting a single record; increase if you
76 are using --holdings and are exporting records that
77 have a lot of items attached to them.
78 --type or -t Record type (BIBLIO, AUTHORITY) [BIBLIO]
79 --all or -a Export all records; ignores input list
80
81 Additional options for type = 'BIBLIO':
82 --items or -i Include items (holdings) in the output
83 --money Currency symbol to use in item price field [\$]
84 --mfhd Export serial MFHD records for associated bib records
85 Not compatible with --format=BRE
86 --location or -l MARC Location Code for holdings from
87 http://www.loc.gov/marc/organizations/orgshome.html
88
89Examples:
90
91To export a set of USMARC records in a file named "output_file" based on the
92IDs contained in a file named "list_of_ids":
93 cat list_of_ids | $0 > output_file
94
95To export a set of MARC21XML authority records in a file named "output.xml"
96for all authority records in the database:
97 $0 --format XML --type AUTHORITY --all > output.xml
98
99HELP
100 exit;
101}
102
103$type = lc($type);
104$format = uc($format);
105$encoding = uc($encoding);
106
48566282
RJ
107my $outfh;
108my $real_stdout;
109open($real_stdout, ">&STDOUT") or die "Can't dup STDOUT: $!";
110if($output_file) {
111 open($outfh, '>', $output_file) or die "Can't open file for output $output_file: $!";
112} else {
113 $outfh = $real_stdout;
114}
115
116binmode($outfh, ':raw') if ($encoding ne 'UTF-8');
117binmode($outfh, ':utf8') if ($encoding eq 'UTF-8');
456d236e
JF
118
119if (!grep { $format eq $_ } @formats) {
120 die "Please select a supported format. ".
121 "Right now that means one of [".
122 join('|',@formats). "]\n";
123}
124
125if ($format ne 'XML') {
126 my $type = 'MARC::File::' . $format;
127 $type->require;
128}
129
130if ($timeout <= 0) {
131 # set default timeout and/or correct silly user who
132 # supplied a negative timeout; default timeout of
133 # 300 seconds if exporting items determined empirically.
134 $timeout = $holdings ? 300 : 1;
135}
136
137OpenSRF::System->bootstrap_client( config_file => $config );
138
139if (!$idl) {
140 $idl = OpenSRF::Utils::SettingsClient->new->config_value("IDL");
141}
142
143Fieldmapper->import(IDL => $idl);
144
145my $ses = OpenSRF::AppSession->create('open-ils.cstore');
146OpenILS::Utils::CStoreEditor::init();
147my $editor = OpenILS::Utils::CStoreEditor->new();
148
48566282 149print $outfh <<HEADER if ($format eq 'XML');
456d236e
JF
150<?xml version="1.0" encoding="$encoding"?>
151<collection xmlns='http://www.loc.gov/MARC21/slim'>
152HEADER
153
154my %orgs;
155my %shelves;
156my %statuses;
157my %outypes;
158
159my $flesh = {};
160
161if ($holdings) {
162 get_bib_locations();
163}
164
165my $start = time;
166my $last_time = time;
167my %count = ('bib' => 0, 'did' => 0);
168my $speed = 0;
169
170if ($all_records) {
171 my $top_record = 0;
172 if ($type eq 'biblio') {
173 $top_record = $editor->search_biblio_record_entry([
174 {deleted => 'f'},
175 {order_by => { 'bre' => 'id DESC' }, limit => 1}
176 ])->[0]->id;
177 } elsif ($type eq 'authority') {
178 $top_record = $editor->search_authority_record_entry([
179 {deleted => 'f'},
180 {order_by => { 'are' => 'id DESC' }, limit => 1}
181 ])->[0]->id;
182 }
183 for (my $i = 0; $i++ < $top_record;) {
184 export_record($i);
185 }
186} else {
187 while ( my $i = <> ) {
188 export_record($i);
189 }
190}
191
48566282 192print $outfh "</collection>\n" if ($format eq 'XML');
456d236e
JF
193
194$speed = $count{did} / (time - $start);
195my $time = time - $start;
196print STDERR <<DONE;
197
198Exports Attempted : $count{bib}
199Exports Completed : $count{did}
200Overall Speed : $speed
201Total Time Elapsed: $time seconds
202
203DONE
204
205sub export_record {
206 my $id = shift;
207
208 my $bib;
209
210 my $r = $ses->request( "open-ils.cstore.direct.$type.record_entry.retrieve", $id, $flesh );
211 my $s = $r->recv(timeout => $timeout);
212 if (!$s) {
213 warn "\n!!!!! Failed trying to read record $id\n";
214 return;
215 }
216 if ($r->failed) {
217 warn "\n!!!!!! Failed trying to read record $id: " . $r->failed->stringify . "\n";
218 return;
219 }
220 if ($r->timed_out) {
221 warn "\n!!!!!! Timed out trying to read record $id\n";
222 return;
223 }
224 $bib = $s->content;
225 $r->finish;
226
227 $count{bib}++;
228 return unless $bib;
229
230 if ($format eq 'ARE' or $format eq 'BRE') {
48566282 231 print $outfh OpenSRF::Utils::JSON->perl2JSON($bib);
456d236e
JF
232 stats();
233 $count{did}++;
234 return;
235 }
236
237 try {
238
239 my $r = MARC::Record->new_from_xml( $bib->marc, $encoding, $format );
240 if ($type eq 'biblio') {
241 add_bib_holdings($bib, $r);
242 }
243
244 if ($format eq 'XML') {
245 my $xml = $r->as_xml_record;
246 $xml =~ s/^<\?.+?\?>$//mo;
48566282 247 print $outfh $xml;
456d236e 248 } elsif ($format eq 'UNIMARC') {
48566282 249 print $outfh $r->as_usmarc;
456d236e 250 } elsif ($format eq 'USMARC') {
48566282 251 print $outfh $r->as_usmarc;
456d236e
JF
252 }
253
254 $count{did}++;
255
256 } otherwise {
257 my $e = shift;
28ab66e8 258 my $errorid = $id;
7ac3e65f 259 chomp($errorid);
e759b05b
JF
260 chomp($e);
261 warn "\nERROR ON RECORD $errorid: $e\n";
456d236e
JF
262 import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire export
263 };
264
265 if ($export_mfhd and $type eq 'biblio') {
266 my $mfhds = $editor->search_serial_record_entry({record => $id, deleted => 'f'});
267 foreach my $mfhd (@$mfhds) {
268 try {
269 my $r = MARC::Record->new_from_xml( $mfhd->marc, $encoding, $format );
270
271 if ($format eq 'XML') {
272 my $xml = $r->as_xml_record;
273 $xml =~ s/^<\?.+?\?>$//mo;
48566282 274 print $outfh $xml;
456d236e 275 } elsif ($format eq 'UNIMARC') {
48566282 276 print $outfh $r->as_usmarc;
456d236e 277 } elsif ($format eq 'USMARC') {
48566282 278 print $outfh $r->as_usmarc;
456d236e
JF
279 }
280 } otherwise {
281 my $e = shift;
e759b05b
JF
282 my $errorid = chomp($id);
283 chomp($e);
284 warn "\nERROR ON MFHD RECORD $errorid: $e\n";
456d236e
JF
285 import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire export
286 };
287 }
288 }
289
290 stats() if (! ($count{bib} % 50 ));
291}
292
293sub stats {
294 try {
295 no warnings;
296
297 $speed = $count{did} / (time - $start);
298
299 my $speed_now = ($count{did} - $count{did_last}) / (time - $count{time_last});
300 my $cn_speed = $count{cn} / (time - $start);
301 my $cp_speed = $count{cp} / (time - $start);
302
303 printf STDERR "\r $count{did} of $count{bib} @ \%0.4f/s ttl / \%0.4f/s rt ".
304 "($count{cn} CNs @ \%0.4f/s :: $count{cp} CPs @ \%0.4f/s)\r",
305 $speed,
306 $speed_now,
307 $cn_speed,
308 $cp_speed;
309 } otherwise {};
310 $count{did_last} = $count{did};
311 $count{time_last} = time;
312}
313
314sub get_bib_locations {
315 print STDERR "Retrieving Org Units ... ";
316 my $r = $ses->request( 'open-ils.cstore.direct.actor.org_unit.search', { id => { '!=' => undef } } );
317
318 while (my $o = $r->recv) {
319 die $r->failed->stringify if ($r->failed);
320 $o = $o->content;
321 last unless ($o);
322 $orgs{$o->id} = $o;
323 }
324 $r->finish;
325 print STDERR "OK\n";
326
327 print STDERR "Retrieving Copy statuses ... ";
328 $r = $ses->request( 'open-ils.cstore.direct.config.copy_status.search', { id => { '!=' => undef } } );
329
330 while (my $sta = $r->recv) {
331 die $r->failed->stringify if ($r->failed);
332 $sta = $sta->content;
333 last unless ($sta);
334 $statuses{$sta->id} = $sta;
335 }
336 $r->finish;
337 print STDERR "OK\n";
338
339 print STDERR "Retrieving OU types ... ";
340 $r = $ses->request( 'open-ils.cstore.direct.actor.org_unit_type.search', { id => { '!=' => undef } } );
341
342 while (my $outy = $r->recv) {
343 die $r->failed->stringify if ($r->failed);
344 $outy = $outy->content;
345 last unless ($outy);
346 $outypes{$outy->id} = $outy;
347 }
348 $r->finish;
349 print STDERR "OK\n";
350
351 print STDERR "Retrieving Shelving locations ... ";
352 $r = $ses->request( 'open-ils.cstore.direct.asset.copy_location.search', { id => { '!=' => undef } } );
353
354 while (my $s = $r->recv) {
355 die $r->failed->stringify if ($r->failed);
356 $s = $s->content;
357 last unless ($s);
358 $shelves{$s->id} = $s;
359 }
360 $r->finish;
361 print STDERR "OK\n";
362
363 $flesh = { flesh => 2, flesh_fields => { bre => [ 'call_numbers' ], acn => [ 'copies' ] } };
364}
365
366sub add_bib_holdings {
367 my $bib = shift;
368 my $r = shift;
369
370 my $cn_list = $bib->call_numbers;
371 if ($cn_list && @$cn_list) {
372
373 $count{cn} += @$cn_list;
374
375 my $cp_list = [ map { @{ $_->copies } } @$cn_list ];
376 if ($cp_list && @$cp_list) {
377
378 my %cn_map;
379 push @{$cn_map{$_->call_number}}, $_ for (@$cp_list);
380
381 for my $cn ( @$cn_list ) {
382 my $cn_map_list = $cn_map{$cn->id};
383
384 COPYMAP: for my $cp ( @$cn_map_list ) {
385 $count{cp}++;
386
387
388 my $owninglib = $cn->owning_lib;
389 my $circlib = $cp->circ_lib;
390 my $printlib = $cp->circ_lib;
391
392 if($cfg){
393 my $thisorg = $orgs{$circlib};
394
395 if($collapse_to_depth){
396 while ( $outypes{ $thisorg->ou_type }->depth > $collapse_to_depth ){
397 my $localcfg = $cfg->param(-block=> $thisorg->shortname);
398 if( $localcfg->{'DontCollapse'} ){
399 last;
400 }
401 if($thisorg->parent_ou){
402 $thisorg = $orgs{$thisorg->parent_ou};
403 $printlib = $thisorg->id;
404 }
405 }
406 }
407
408 $thisorg = $orgs{$circlib};
409
806177d6 410
456d236e 411 while( $thisorg ){
456d236e
JF
412 # load the local config from the .ini file for exclusions
413 my $localcfg = $cfg->param(-block=> $thisorg->shortname);
414
415 # if we see this setting, just skip that org
416
48566282
RJ
417 if( $localcfg->{'ExcludeEntireOrg'} )
418 { skipnote($bib->id,"ExcludeEntireOrg"); next COPYMAP; }
456d236e
JF
419
420 # what follows are exclusion rules
806177d6 421
456d236e
JF
422 # Excluded Flags
423 if($localcfg->{'Flags'}){
424 # this little line is just forcing scalars into an array so we can 'use strict' with Config::Simple
425 my @flags = ( (ref($localcfg->{'Flags'}) eq "ARRAY") ? @{$localcfg->{'Flags'}} : ($localcfg->{'Flags'}));
48566282
RJ
426 if( grep { $_ eq 'reference' } @flags && $cp->ref eq 't')
427 { skipnote($bib->id,"Flags: reference"); next COPYMAP; }
428 elsif( grep { $_ eq 'unholdable' } @flags && $cp->holdable eq 'f')
429 { skipnote($bib->id,"Flags: unholdable"); next COPYMAP; }
430 elsif( grep { $_ eq 'circulate' } @flags && $cp->circulate eq 'f')
431 { skipnote($bib->id,"Flags: circulate"); next COPYMAP; }
432 elsif( grep { $_ eq 'hidden' } @flags && $cp->opac_visible eq 'f')
433 { skipnote($bib->id,"Flags: hidden"); next COPYMAP; }
456d236e
JF
434 }
435 # Excluded Circ Modifiers
436 if($localcfg->{'CircMods'}){
806177d6 437 my $circmod = $cp->circ_modifier || "";
456d236e 438 my @circmods = ( (ref($localcfg->{'CircMods'}) eq "ARRAY") ? @{$localcfg->{'CircMods'}} : ($localcfg->{'CircMods'}) );
48566282
RJ
439 if( grep { $_ eq $circmod } @circmods && @circmods)
440 { skipnote($bib->id,"CircMods"); next COPYMAP; }
456d236e 441 }
3add9381
JF
442 # Inverse rule -- only include specified Circ Mods
443 if($localcfg->{'OnlyIncludeCircMods'}){
806177d6 444 my $circmod = $cp->circ_modifier || "";
3add9381 445 my @circmods = ( (ref($localcfg->{'CircMods'}) eq "ARRAY") ? @{$localcfg->{'CircMods'}} : ($localcfg->{'CircMods'}) );
48566282
RJ
446 unless( grep { $_ eq $circmod } @circmods && @circmods)
447 { skipnote($bib->id,"OnlyIncludeCircMods"); next COPYMAP; }
3add9381 448 }
456d236e
JF
449 # Excluded Copy Statuses
450 if($localcfg->{'Statuses'}){
451 my @statuses = ( (ref($localcfg->{'Statuses'}) eq "ARRAY") ? @{$localcfg->{'Statuses'}} : ($localcfg->{'Statuses'}) );
48566282
RJ
452 if( grep { $_ eq $statuses{$cp->status}->name } @statuses && @statuses)
453 { skipnote($bib->id,"Statuses"); next COPYMAP; }
456d236e
JF
454 }
455 # Excluded Locations
456 if($localcfg->{'Locations'}){
457 my @locations = ( (ref($localcfg->{'Locations'}) eq "ARRAY") ? @{$localcfg->{'Locations'}} : ($localcfg->{'Locations'}) );
48566282
RJ
458 if( grep { $_ eq $shelves{$cp->location}->name } @locations && @locations)
459 { skipnote($bib->id,"Locations"); next COPYMAP; }
456d236e
JF
460 }
461 # Inverse rule - Only use the specified locations
462 if($localcfg->{'OnlyIncludeLocations'}){
463 my @locations = ( (ref($localcfg->{'OnlyIncludeLocations'}) eq "ARRAY") ? @{$localcfg->{'OnlyIncludeLocations'}} : ($localcfg->{'Locations'}) );
48566282
RJ
464 unless( grep { $_ eq $shelves{$cp->location}->name } @locations && @locations)
465 { skipnote($bib->id,"OnlyIncludeLocations"); next COPYMAP; }
456d236e
JF
466 }
467 # exclude based on a regex match to location names
468 if($localcfg->{'LocationRegex'}){
469 my @locregex = ( (ref($localcfg->{'LocationRegex'}) eq "ARRAY") ? @{$localcfg->{'LocationRegex'}} : ($localcfg->{'LocationRegex'}) );
470 my $reg = $localcfg->{'LocationRegex'};
48566282
RJ
471 if( grep { $shelves{$cp->location}->name =~ m/($reg)/ } @locregex && @locregex)
472 { skipnote($bib->id,"LocationRegex"); next COPYMAP; }
456d236e
JF
473 }
474 # include based on a regex match to location names
475 if($localcfg->{'OnlyIncludeLocationRegex'}){
476 my @locregex = ( (ref($localcfg->{'OnlyIncludeLocationRegex'}) eq "ARRAY") ? @{$localcfg->{'OnlyIncludeLocationRegex'}} : ($localcfg->{'OnlyIncludeLocationRegex'}) );
477 my $reg = $localcfg->{'OnlyIncludeLocationRegex'};
48566282
RJ
478 unless( grep { $shelves{$cp->location}->name =~ m/($reg)/ } @locregex && @locregex)
479 { skipnote($bib->id,"OnlyIncludeLocationRegex"); next COPYMAP; }
456d236e
JF
480 }
481 # Exclude based on a callno regex
482 if($localcfg->{'CallNoRegex'}){
483 my @callnoregex = ( (ref($localcfg->{'CallNoRegex'}) eq "ARRAY") ? @{$localcfg->{'CallNoRegex'}} : ($localcfg->{'CallNoRegex'}) );
484 my $reg = $localcfg->{'CallNoRegex'};
48566282
RJ
485 if( grep { $cn->label =~ m/($reg)/ } @callnoregex && @callnoregex)
486 { skipnote($bib->id,"CallNoRegex"); next COPYMAP; }
456d236e
JF
487 }
488 # Include based on a callno regex
489 if($localcfg->{'OnlyIncludeCallNoRegex'}){
490 my @callnoregex = ( (ref($localcfg->{'OnlyIncludeCallNoRegex'}) eq "ARRAY") ? @{$localcfg->{'OnlyIncludeCallNoRegex'}} : ($localcfg->{'OnlyIncludeCallNoRegex'}) );
491 my $reg = $localcfg->{'OnlyIncludeCallNoRegex'};
48566282
RJ
492 unless( grep { $cn->label =~ m/($reg)/ } @callnoregex && @callnoregex)
493 { skipnote($bib->id,"OnlyIncludeCallNoRegex"); next COPYMAP; }
456d236e 494 }
806177d6 495
456d236e
JF
496 # Trim call number to a float and exclude based on Dewey Range
497 if($localcfg->{'DeweyGT'} || $localcfg->{'DeweyLT'}){
498 my $gt = $localcfg->{'DeweyGT'};
499 my $lt = $localcfg->{'DeweyLT'};
500
501 # FIXME if either config has an array just ditch for now
48566282
RJ
502 if (ref($gt) eq "ARRAY" or ref($lt) eq "ARRAY")
503 { skipnote($bib->id,""); next COPYMAP; }
456d236e
JF
504 $gt =~ s/[^0-9\.]//g if $gt; #trim off anything not deweyish
505 $lt =~ s/[^0-9\.]//g if $lt; #trim off anything not deweyish
506
507 my $callno = $cn->label;
508 $callno =~ s/[^0-9\.]//g; #trim off anything not deweyish
509 print STDERR $callno;
510 #note that we are making big assumptions about the call numbers in the db
511
512 # we have a range, exclude what's inbetween
513 if($lt && $gt){
48566282
RJ
514 if($callno > $gt and $callno < $lt)
515 { skipnote($bib->id,"Dewey LTGT"); next COPYMAP; }
456d236e
JF
516 # we only have a top threshold, exclude everything below it
517 } elsif ($lt){
48566282
RJ
518 if($callno < $lt)
519 { skipnote($bib->id,"Dewey LT"); next COPYMAP; }
456d236e
JF
520 # we only have a bottom threshold, exclude everything above it
521 } elsif ($gt){
48566282
RJ
522 if($callno > $gt)
523 { skipnote($bib->id,"Dewey GT"); next COPYMAP; }
456d236e
JF
524 }
525 }
526
456d236e
JF
527 if($thisorg->parent_ou){
528 $thisorg = $orgs{$thisorg->parent_ou}
529 } else {
530 $thisorg = ();
531 }
532
533 }
534 }
535
536 $r->append_fields(
537 MARC::Field->new(
538 852, '4', '',
539 a => $location,
540 b => $orgs{$printlib}->shortname,
541 #b => $orgs{$owninglib}->shortname,
542 #b => $orgs{$circlib}->shortname,
543 c => $shelves{$cp->location}->name,
544 j => $cn->label,
545 ($cp->circ_modifier ? ( g => $cp->circ_modifier ) : ()),
546 p => $cp->barcode,
547 ($cp->price ? ( y => $dollarsign.$cp->price ) : ()),
548 ($cp->copy_number ? ( t => $cp->copy_number ) : ()),
549 ($cp->ref eq 't' ? ( x => 'reference' ) : ()),
550 ($cp->holdable eq 'f' ? ( x => 'unholdable' ) : ()),
551 ($cp->circulate eq 'f' ? ( x => 'noncirculating' ) : ()),
552 ($cp->opac_visible eq 'f' ? ( x => 'hidden' ) : ()),
553 z => $statuses{$cp->status}->name,
554 )
555 );
556
5eb3fcb8
JF
557 if($force901){
558 $r->delete_field( $r->field('901') );
559 $r->append_fields(
560 MARC::Field->new(
561 '901', ' ', ' ',
562 a => $bib->tcn_value,
563 b => $bib->tcn_source,
564 c => $bib->id
565 )
566 );
567
568 }
569
456d236e
JF
570 stats() if (! ($count{cp} % 100 ));
571 } # for cnmap
572 } # for cnlist
573 } # if block
574 } # if block
575} # sub
48566282
RJ
576
577sub skipnote {
578 printf(STDOUT "Skipped %s due to config: %s\n",$1,$2);
579}