#!/usr/bin/perl
# vim:et:ts=4:sw=4:
#
# data_cleanup/date1/date1_0011_cleanup.pl
#
# Provenance (commit that introduced this script):
#   From:    Liam Whalen
#   Date:    Mon, 2 Nov 2015 17:48:04 +0000 (-0800)
#   Subject: [RT17143] Clean up of 0011 date1
#   Commit:  https://git.sitka.bclibraries.ca/gitweb/?p=sitka%2Fsitka-tools.git;a=commitdiff_plain;h=fbde651eec1a85769ac622a411683847a055e654
#   Signed-off-by: Liam Whalen
#
# When the first date1 cleanup was performed, the code inserted a value of
# 0011 for any record for which it could not determine a four digit date
# from 260 or 264 $c.  This script attempts to change the date1 values in
# those records to valid dates.
#
# Usage:
#   date1_0011_cleanup.pl --staff_username USER --staff_password PASS
#                         [--config_file /path/to/opensrf_core.xml]
#
# For every bibliographic record whose 008 date1 (008/07-10) is '0011' the
# script:
#   1. fetches the record through open-ils.pcrud,
#   2. looks for a four digit year in 264 $c, then in 260 $c,
#   3. rebuilds the first 11 characters of the 008 and
#   4. writes the record back through open-ils.cat.
use strict;
use warnings;

use DBI;
use OpenSRF::Utils::Config;
use Getopt::Long;
use Net::Domain;
use Scalar::Util qw(blessed);
use XML::LibXML;
use XML::LibXML::XPathContext;
use OpenSRF::AppSession;
use MARC::Record;
use MARC::File::XML (BinaryEncoding => 'UTF-8');

# oils_header.pl supplies osrf_connect(), oils_login() and $apputils.
require '/srv/openils/bin/oils_header.pl';
use vars qw/$apputils/;

use constant {
    # Placeholder for 008/00-05 when neither the existing 008 nor the
    # record's create_date yields six digits.  Easy to search for later.
    BOGUS_DATE_ENTERED => '000123',

    # Placeholder for 008/07-10 when no four digit year can be found in
    # 264 $c or 260 $c.  Same marker the first cleanup pass used.
    BOGUS_PUBDATE => '0011',
};

# --gather, --hostname and --tempdir are accepted so that existing
# invocations keep working; nothing in this script reads them.
my ($gather, $hostname, $core_config, $tmpdir) =
    (0, Net::Domain::hostfqdn(), '/srv/openils/conf/opensrf_core.xml', '/tmp/');

# Both must start as empty strings: new_auth_token() compares them with eq.
my ($staff_username, $staff_password) = ('', '');

GetOptions(
    'gather'           => \$gather,
    'hostname=s'       => \$hostname,
    'config_file=s'    => \$core_config,
    'tempdir=s'        => \$tmpdir,
    'staff_username=s' => \$staff_username,
    'staff_password=s' => \$staff_password,
) or die "Invalid command line options\n";

OpenSRF::Utils::Config->load(config_file => $core_config);
my $conf            = OpenSRF::Utils::Config->current;
my $settings_config = $conf->bootstrap->settings_config;

# opensrf.xml (the settings file) holds the database connection details.
my $osrfxml = XML::LibXML->new()->parse_file($settings_config);

my $dbh = init_database_connections()
    or die "Unable to obtain an open-ils.cstore database connection\n";

osrf_connect($core_config);

clean_date1_records($dbh);

$dbh->disconnect;

# Find every record whose 008 date1 is '0011' and try to replace that
# placeholder with a year taken from 264 $c or 260 $c.
#
# Parameters:
#   $dbh - connected DBI handle for the Evergreen database.
#
# Records that cannot be fetched, parsed, or that have no 008 field are
# reported with warn and skipped; the remaining records are still processed.
sub clean_date1_records {
    my ($dbh) = @_;

    # SQL substring() is 1-based, so position 8 length 4 is 008/07-10.
    my $sth = $dbh->prepare(q{
        SELECT DISTINCT mrfr.record
          FROM metabib.real_full_rec mrfr
         WHERE mrfr.tag = '008'
           AND substring(mrfr.value, 8, 4) = '0011'
    }) or die "Unable to prepare date1 query: " . $dbh->errstr . "\n";
    $sth->execute
        or die "Unable to execute date1 query: " . $sth->errstr . "\n";
    my $records = $sth->fetchall_arrayref([0]);
    $sth->finish;

    my $authtoken = new_auth_token();

    RECORD:
    for my $row (@$records) {
        my $record_id = $row->[0];
        print "Getting MARC for record: " . $record_id . "\n";

        my ($xml, $create_date) = get_marc_by_id($authtoken, $record_id);
        if (!defined $xml) {
            warn "Record $record_id: no MARC returned, skipping\n";
            next RECORD;
        }

        my $marc = eval { MARC::Record->new_from_xml($xml, 'UTF-8') };
        if (!$marc) {
            warn "Record $record_id: unable to parse MARC, skipping: $@\n";
            next RECORD;
        }

        my $field_008 = $marc->field('008');
        if (!$field_008) {
            warn "Record $record_id: no 008 field, skipping\n";
            next RECORD;
        }

        my $old_008 = $field_008->data();
        my $new_008 = _rebuild_008(
            $old_008,
            _date_entered_from_create_date($create_date),
            _pubdate_from_marc($marc),
        );

        # Nothing to write when no better date was found and the rest of
        # the leading 008 data was already well formed.
        next RECORD if defined $old_008 && $new_008 eq $old_008;

        $field_008->update($new_008);

        my $ret = update_marc_by_id($authtoken, $record_id, $marc->as_xml());

        # NOTE(review): assumes failures come back as a plain hash with a
        # non-zero 'ilsevent' key, as Evergreen events usually do - confirm.
        if (ref($ret) eq 'HASH' && $ret->{ilsevent}) {
            my $textcode = defined $ret->{textcode} ? $ret->{textcode} : 'unknown event';
            warn "Record $record_id: update failed: $textcode\n";
        }
    }

    clear_auth_token($authtoken);

    return;
}

# Turn a create_date such as '2015-11-02T17:48:04-0800' into the six digit
# YYMMDD form used by 008/00-05.  Returns BOGUS_DATE_ENTERED when the value
# is missing or does not contain a YYYY-MM-DD date.
sub _date_entered_from_create_date {
    my ($create_date) = @_;

    if (defined $create_date
        && $create_date =~ /[0-9]{2}([0-9]{2})-([0-9]{2})-([0-9]{2})/) {
        return "$1$2$3";
    }

    return BOGUS_DATE_ENTERED;
}

# Return the first four digits of the first run of digits in a $c value
# (so 'c2005.' gives '2005'), or undef when that run has fewer than four
# digits or the value is missing.  A five digit date yields its first four
# digits; such values are expected to be outliers.
sub _four_digit_year {
    my ($subfield_c) = @_;

    return unless defined $subfield_c;

    # [0-9] rather than \d: 008 must only ever receive ASCII digits.
    return unless $subfield_c =~ /\A[^0-9]*([0-9]{4})/;

    return $1;
}

# Pick the publication year for a record: 264 $c is preferred, 260 $c is
# the fallback because a record can carry a bogus 264 next to a valid 260.
# Returns BOGUS_PUBDATE when neither yields a four digit year.
sub _pubdate_from_marc {
    my ($marc) = @_;

    for my $tag (qw(264 260)) {
        my $field = $marc->field($tag) or next;

        # Scalar assignment so only the first $c is considered.
        my $subfield_c = $field->subfield('c');
        my $year       = _four_digit_year($subfield_c);

        return $year if defined $year;
    }

    return BOGUS_PUBDATE;
}

# Build the replacement 008 string.
#
# Parameters:
#   $data_008     - current 008 data (may be short or undef).
#   $date_entered - six digit YYMMDD used when 008/00-05 is not six digits.
#   $pubdate      - four characters to store in 008/07-10.
#
# 008/06 is forced to 'n' (dates unknown) when it is not one of the codes
# this script accepts.  Everything from 008/11 onwards is copied unchanged.
sub _rebuild_008 {
    my ($data_008, $date_entered, $pubdate) = @_;

    $data_008 = '' unless defined $data_008;
    my $length = length $data_008;

    my $entered = substr($data_008, 0, 6);
    $entered =~ s/[^0-9]//g;
    $entered = $date_entered if length($entered) < 6;

    my $date_type = $length > 6 ? substr($data_008, 6, 1) : '';
    $date_type = 'n' unless $date_type =~ /[bcdeikmnpqrstu|]/;

    # Guard against 008 values too short to have anything after date1.
    my $remainder = $length > 11 ? substr($data_008, 11) : '';

    return $entered . $date_type . $pubdate . $remainder;
}

# Connect to the database configured for open-ils.cstore in opensrf.xml.
#
# Returns a connected DBI handle, or nothing when no open-ils.cstore
# database is configured or none of them accepts a connection.
sub init_database_connections {
    print "\nInitializing database connection\n";

    my @databases = $osrfxml->findnodes('//database');

    # If we have no database connections, this is probably the OpenSRF
    # version of opensrf.xml.
    if (!@databases) {
        warn "* WARNING: There are no database connections defined in " .
            "opensrf.xml. These are defined in services such as " .
            "open-ils.cstore and open-ils.reporter. Please ensure that " .
            "your opensrf_core.xml and opensrf.xml configuration files " .
            "are based on the examples shipped with Evergreen instead of " .
            "OpenSRF.\n";
    }

    DATABASE:
    for my $database (@databases) {
        # Only the <database> that sits two levels below <open-ils.cstore>
        # is of interest; walk up carefully so a shallow node cannot crash.
        my $parent      = $database->parentNode;
        my $grandparent = $parent ? $parent->parentNode : undef;
        my $service     = $grandparent ? $grandparent->localname : undef;
        next DATABASE unless defined $service && $service eq 'open-ils.cstore';

        my $db_name = $database->findvalue("./db")
            || $database->findvalue("./name");
        my $db_host = $database->findvalue("./host");
        my $db_port = $database->findvalue("./port");
        my $db_user = $database->findvalue("./user");
        my $db_pw   = $database->findvalue("./pw");

        # Path of the element inside opensrf.xml, used only in diagnostics.
        my $osrf_xpath = '';
        for my $node ($database->findnodes("ancestor::node()")) {
            next unless $node->nodeType == XML::LibXML::XML_ELEMENT_NODE;
            $osrf_xpath .= "/" . $node->nodeName;
        }

        my $connection = db_connect(
            $db_name, $db_host, $db_port, $db_user, $db_pw, $osrf_xpath
        );
        return $connection if $connection;
    }

    return;
}

# Open a PostgreSQL connection.
#
# Parameters: database name, host, port, user, password and the opensrf.xml
# path of the configuration element (used only in the warning).
#
# Returns the DBI handle, or nothing on failure.  The password is
# deliberately kept out of the warning.
sub db_connect {
    my ($db_name, $db_host, $db_port, $db_user, $db_pw, $osrf_xpath) = @_;

    my $dsn = "dbi:Pg:dbname=$db_name;host=$db_host;port=$db_port";
    my $connection = DBI->connect($dsn, $db_user, $db_pw);

    if (!$connection) {
        my $where = defined $osrf_xpath ? $osrf_xpath : '';
        warn "* $where :: Unable to connect to database $dsn, user=$db_user\n";
        return;
    }

    return $connection;
}

# Log in to Evergreen as a staff user and return the auth token.
# Dies when the credentials were not supplied or the login fails.
sub new_auth_token {
    if ($staff_username eq '' || $staff_password eq '') {
        die "staff_username and staff_password need to be set at the command line\n";
    }

    my $authtoken = oils_login($staff_username, $staff_password, 'staff')
        or die "Unable to login to Evergreen as user $staff_username";

    return $authtoken;
}

# Delete the auth session created by new_auth_token().
sub clear_auth_token {
    my ($authtoken) = @_;

    $apputils->simplereq(
        'open-ils.auth',
        'open-ils.auth.session.delete',
        $authtoken
    );

    return;
}

# Fetch a bibliographic record through open-ils.pcrud.
#
# Returns the list (marc, create_date), or an empty list when the response
# is not an object (for example undef or an unblessed event hash).
sub get_marc_by_id {
    my ($authtoken, $record_id) = @_;

    my $bre = $apputils->simplereq(
        'open-ils.pcrud',
        'open-ils.pcrud.search.bre',
        $authtoken,
        {
            id => $record_id
        }
    );

    return unless blessed($bre);

    return ($bre->marc, $bre->create_date);
}

# Replace the MARC of a record through open-ils.cat.
# Returns whatever the service call returned so the caller can inspect it.
sub update_marc_by_id {
    my ($authtoken, $record_id, $marc) = @_;

    my $ret = $apputils->simplereq(
        'open-ils.cat',
        'open-ils.cat.biblio.record.marc.replace',
        $authtoken,
        $record_id,
        $marc
    );

    return $ret;
}