package MARC::File::MicroLIF; =head1 NAME MARC::File::MicroLIF - MicroLIF-specific file handling =cut use strict; use warnings; use integer; use vars qw( $ERROR ); use MARC::File; use vars qw( @ISA ); @ISA = qw( MARC::File ); use MARC::Record qw( LEADER_LEN ); =head1 SYNOPSIS use MARC::File::MicroLIF; my $file = MARC::File::MicroLIF->in( $filename ); while ( my $marc = $file->next() ) { # Do something } $file->close(); undef $file; =head1 EXPORT None. =cut =for internal The buffer must be large enough to handle any valid record because we don't check for cases like a CR/LF pair or an end-of-record/CR/LF trio being only partially in the buffer. The max valid record is the max MARC record size (99999) plus one or two characters per tag (CR, LF, or CR/LF). It's hard to say what the max number of tags is, so here we use 6000. (6000 tags can be squeezed into a MARC record only if every tag has only one subfield containing a maximum of one character, or if data from multiple tags overlaps in the MARC record body. We're pretty safe.) =cut use constant BUFFER_MIN => (99999 + 6000 * 2); =head1 METHODS =head2 in() Opens a MicroLIF file for reading. =cut sub in { my $class = shift; my $self = $class->SUPER::in( @_ ); if ( $self ) { bless $self, $class; $self->{exhaustedfh} = 0; $self->{inputbuf} = ''; $self->{header} = undef; # get the MicroLIF header, but handle the case in # which it's missing. my $header = $self->_get_chunk( 1 ); if ( defined $header ) { if ( $header =~ /^LDR/ ) { # header missing, put this back $self->_unget_chunk( $header . "\n" ); # XXX should we warn of a missing header? } else { $self->{header} = $header; } } else { # can't read from the file undef $self; } } return $self; } # new # fill the buffer if we need to sub _fill_buffer { my $self = shift; my $ok = 1; if ( !$self->{exhaustedfh} && length( $self->{inputbuf} ) < BUFFER_MIN ) { # append the next chunk of bytes to the buffer my $read = read $self->{fh}, $self->{inputbuf}, BUFFER_MIN, length($self->{inputbuf}); if ( !defined $read ) { # error! $ok = undef; $MARC::File::ERROR = "error reading from file " . $self->{filename}; } elsif ( $read < 1 ) { $self->{exhaustedfh} = 1; } } return $ok; } =for internal Gets the next chunk of data. If C<$want_line> is true then you get the next chunk ending with any combination of \r and \n of any length. If it is false or not passed then you get the next chunk ending with \x60 followed by any combination of \r and \n of any length. All trailing \r and \n are stripped. =cut sub _get_chunk { my $self = shift; my $want_line = shift || 0; my $chunk = undef; if ( $self->_fill_buffer() && length($self->{inputbuf}) > 0 ) { # the buffer always has at least one full line in it, so we're # guaranteed that if there are no line endings then we're # on the last line. if ( $want_line ) { if ( $self->{inputbuf} =~ /^([^\x0d\x0a]*)([\x0d\x0a]+)/ ) { $chunk = $1; $self->{inputbuf} = substr( $self->{inputbuf}, length($1)+length($2) ); } } else { # couldn't figure out how to make this work as a regex my $pos = -1; while ( !$chunk ) { $pos = index( $self->{inputbuf}, '`', $pos+1 ); last if $pos < 0; if ( substr($self->{inputbuf}, $pos+1, 1) eq "\x0d" or substr($self->{inputbuf}, $pos+1, 1) eq "\x0a" ) { $chunk = substr( $self->{inputbuf}, 0, $pos+1 ); # include the '`' but not the newlines while ( substr($self->{inputbuf}, $pos+1, 1) eq "\x0d" or substr($self->{inputbuf}, $pos+1, 1) eq "\x0a" ) { ++$pos; } # $pos now pointing at last newline char $self->{inputbuf} = substr( $self->{inputbuf}, $pos+1 ); } } } if ( !$chunk ) { $chunk = $self->{inputbuf}; $self->{inputbuf} = ''; $self->{exhaustedfh} = 1; } } return $chunk; } # $chunk is put at the beginning of the buffer exactly as # passed in. No line endings are added. sub _unget_chunk { my $self = shift; my $chunk = shift; $self->{inputbuf} = $chunk . $self->{inputbuf}; return; } sub _next { my $self = shift; my $lifrec = $self->_get_chunk(); # for ease, make the newlines match this platform $lifrec =~ s/[\x0a\x0d]+/\n/g if defined $lifrec; return $lifrec; } =head2 header() If the MicroLIF file has a file header then the header is returned. If the file has no header or the file has not yet been opened then C is returned. =cut sub header { my $self = shift; return $self->{header}; } =head2 decode() Decodes a MicroLIF record and returns a USMARC record. Can be called in one of three different ways: $object->decode( $lif ) MARC::File::MicroLIF->decode( $lif ) MARC::File::MicroLIF::decode( $lif ) =cut sub decode { my $self = shift; my $location = ''; my $text = ''; ## decode can be called in a variety of ways ## this bit of code covers all three if ( ref($self) =~ /^MARC::File/ ) { $location = 'in record '.$self->{recnum}; $text = shift; } else { $location = 'in record 1'; $text = $self=~/MARC::File/ ? shift : $self; } my $marc = MARC::Record->new(); # for ease, make the newlines match this platform $text =~ s/[\x0a\x0d]+/\n/g if defined $text; my @lines = split( /\n/, $text ); for my $line ( @lines ) { ($line =~ s/^([0-9A-Za-z]{3})//) or $marc->_warn( "Invalid tag number: ".substr( $line, 0, 3 )." $location" ); my $tagno = $1; ($line =~ s/\^`?$//) or $marc->_warn( "Tag $tagno $location is missing a trailing caret." ); if ( $tagno eq "LDR" ) { $marc->leader( substr( $line, 0, LEADER_LEN ) ); } elsif ( $tagno =~ /^\d+$/ and $tagno < 10 ) { $marc->add_fields( $tagno, $line ); } else { $line =~ s/^(.)(.)//; my ($ind1,$ind2) = ($1,$2); my @subfields; my @subfield_data_pairs = split( /_(?=[a-z0-9])/, $line ); if ( scalar @subfield_data_pairs < 2 ) { $marc->_warn( "Tag $tagno $location has no subfields--discarded." ); } else { shift @subfield_data_pairs; # Leading _ makes an empty pair for my $pair ( @subfield_data_pairs ) { my ($subfield,$data) = (substr( $pair, 0, 1 ), substr( $pair, 1 )); push( @subfields, $subfield, $data ); } $marc->add_fields( $tagno, $ind1, $ind2, @subfields ); } } } # for return $marc; } 1; __END__ =head1 TODO =over 4 =back =head1 RELATED MODULES L =head1 LICENSE This code may be distributed under the same terms as Perl itself. Please note that these modules are not products of or supported by the employers of the various contributors to the code. =head1 AUTHOR Andy Lester, C<< >> =cut