Commit | Line | Data |
---|---|---|
3b257416 JF |
1 | |
2 | DROP TABLE reindex.to_do; | |
3 | CREATE TABLE reindex.to_do (id int); | |
4 | ||
5 | DROP TABLE reindex.failed; | |
6 | CREATE TABLE reindex.failed (id int, reason TEXT); | |
7 | ||
8 | -- INSERT ALL THE BIBS | |
69800a8e | 9 | INSERT INTO reindex.to_do SELECT id from biblio.record_entry where not deleted; |
3b257416 JF |
10 | |
11 | CREATE OR REPLACE FUNCTION reindex.cleanup_marc(TEXT) RETURNS TEXT as $func$ | |
12 | use MARC::Record; | |
13 | use MARC::File::XML (BinaryEncoding => 'UTF-8'); | |
14 | use MARC::Charset; | |
15 | use Unicode::Normalize; | |
69800a8e | 16 | use Encode qw(decode_utf8); |
3b257416 JF |
17 | use strict; |
18 | ||
19 | MARC::Charset->assume_unicode(1); | |
20 | ||
21 | my $xml = shift; | |
22 | my $r = MARC::Record->new_from_xml( $xml ); | |
23 | ||
24 | for my $f ( $r->field('901') ) { | |
25 | $r->delete_field($f) | |
26 | } | |
27 | ||
28 | $xml = $r->as_xml_record(); | |
29 | ||
30 | $xml =~ s/\n//sgo; | |
31 | $xml =~ s/^<\?xml.+\?\s*>//go; | |
32 | $xml =~ s/>\s+</></go; | |
33 | $xml =~ s/\p{Cc}//go; | |
34 | ||
35 | $xml = decode_utf8($xml); | |
36 | ||
37 | $xml = NFC($xml); | |
38 | ||
39 | # Convert raw ampersands to entities | |
40 | $xml =~ s/&(?!\S+;)/&/gso; | |
41 | ||
42 | # Convert Unicode characters to entities | |
43 | $xml =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; | |
44 | ||
45 | $xml =~ s/[\x00-\x1f]//go; | |
46 | ||
47 | ||
48 | return $xml; | |
49 | $func$ LANGUAGE PLPERLU; | |
50 | ||
51 | CREATE OR REPLACE FUNCTION reindex.reindex(count INTEGER) RETURNS timestamp AS $$ | |
52 | DECLARE | |
53 | myid INTEGER := 0; | |
54 | i INTEGER := 0; | |
55 | BEGIN | |
56 | FOR myid IN SELECT reindex.to_do.id FROM reindex.to_do LIMIT count LOOP | |
57 | i = i + 1; | |
58 | BEGIN | |
59 | RAISE NOTICE 'Reindexing id: % -- % of % records', myid, i, count; | |
60 | DELETE FROM reindex.to_do WHERE reindex.to_do.id = myid; | |
61 | -- change this or add more functions as needed | |
62 | UPDATE biblio.record_entry set marc = public.entityize(reindex.cleanup_marc(marc)) where biblio.record_entry.id = myid; | |
63 | EXCEPTION WHEN unique_violation THEN | |
64 | INSERT INTO reindex.failed (id, reason) VALUES (myid, 'unique violation'); | |
65 | DELETE FROM reindex.to_do WHERE reindex.to_do.id = myid; | |
66 | RAISE NOTICE '% had a unique key violation and could not be reindexed!', myid; | |
b1f99bb8 | 67 | WHEN OTHERS THEN |
3b257416 JF |
68 | INSERT INTO reindex.failed (id, reason) VALUES (myid, 'unhandled error'); |
69 | DELETE FROM reindex.to_do WHERE reindex.to_do.id = myid; | |
70 | RAISE NOTICE '% had a unique key violation and could not be reindexed!', myid; | |
71 | END; | |
72 | END LOOP; | |
73 | RETURN now(); | |
74 | ||
75 | END; | |
76 | $$ LANGUAGE plpgsql; |