# Copyrights 2008-2020 by [Mark Overmeer ]. # For other contributors see ChangeLog. # See the manual pages for details on the licensing terms. # Pod stripped from pm file by OODoc 2.02. # This code is part of distribution XML-LibXML-Simple. Meta-POD processed # with OODoc into POD and HTML manual-pages. See README.md # Copyright Mark Overmeer. Licensed under the same terms as Perl itself. package XML::LibXML::Simple; use vars '$VERSION'; $VERSION = '1.01'; use base 'Exporter'; use strict; use warnings; our @EXPORT = qw(XMLin); our @EXPORT_OK = qw(xml_in); use XML::LibXML (); use File::Basename qw/fileparse/; use File::Spec (); use Carp; use Scalar::Util qw/blessed/; use Data::Dumper; #to be removed my %known_opts = map +($_ => 1), qw(keyattr keeproot forcecontent contentkey noattr searchpath forcearray grouptags nsexpand normalisespace normalizespace valueattr nsstrip parser parseropts hooknodes suppressempty); my @default_attributes = qw(name key id); my $default_content_key = 'content'; #------------- sub new(@) { my $class = shift; my $self = bless {}, $class; my $opts = $self->{opts} = $self->_take_opts(@_); # parser object cannot be reused !defined $opts->{parser} or error __x"parser option for XMLin only"; $self; } #------------- sub XMLin { my $self = @_ > 1 && blessed $_[0] && $_[0]->isa(__PACKAGE__) ? shift : __PACKAGE__->new; my $target = shift; my $this = $self->_take_opts(@_); my $opts = $self->_init($self->{opts}, $this); my $xml = $self->_get_xml($target, $opts) or return; if(my $cb = $opts->{hooknodes}) { $self->{XCS_hooks} = $cb->($self, $xml); } my $top = $self->collapse($xml, $opts); if($opts->{keeproot}) { my $subtop = $opts->{forcearray_always} && ref $top ne 'ARRAY' ? [$top] : $top; $top = +{ $xml->localName => $subtop }; } $top; } *xml_in = \&XMLin; sub _get_xml($$) { my ($self, $source, $opts) = @_; $source = $self->default_data_source($opts) unless defined $source; $source = \*STDIN if $source eq '-'; my $parser = $opts->{parser} || $self->_create_parser($opts->{parseropts}); my $xml = blessed $source && ( $source->isa('XML::LibXML::Document') || $source->isa('XML::LibXML::Element' )) ? $source : ref $source eq 'SCALAR' ? $parser->parse_string($$source) : ref $source ? $parser->parse_fh($source) : $source =~ m{^\s*\<.*?\>\s*$}s ? $parser->parse_string($source) : $parser->parse_file ($self->find_xml_file($source, @{$opts->{searchpath}})); $xml = $xml->documentElement if $xml->isa('XML::LibXML::Document'); $xml; } sub _create_parser(@) { my $self = shift; my @popt = @_ != 1 ? @_ : ref $_[0] eq 'HASH' ? %{$_[0]} : @{$_[0]}; XML::LibXML->new ( line_numbers => 1 , no_network => 1 , expand_xinclude => 0 , expand_entities => 1 , load_ext_dtd => 0 , ext_ent_handler => sub { alert __x"parsing external entities disabled"; '' } , @popt ); } sub _take_opts(@) { my $self = shift; my %opts; @_ % 2==0 or die "ERROR: odd number of options.\n"; while(@_) { my ($key, $val) = (shift, shift); my $lkey = lc $key; $lkey =~ s/_//g; $known_opts{$lkey} or croak "Unrecognised option: $key"; $opts{$lkey} = $val; } \%opts; } # Returns the name of the XML file to parse if no filename or XML string # was provided explictly. sub default_data_source($) { my ($self, $opts) = @_; my ($basename, $script_dir, $ext) = fileparse $0, qr[\.[^\.]+]; # Add script directory to searchpath unshift @{$opts->{searchpath}}, $script_dir if $script_dir; "$basename.xml"; } sub _init($$) { my ($self, $global, $this) = @_; my %opt = (%$global, %$this); if(defined $opt{contentkey}) { $opt{collapseagain} = $opt{contentkey} =~ s/^\-// } else { $opt{contentkey} = $default_content_key } $opt{normalisespace} ||= $opt{normalizespace} || 0; $opt{searchpath} ||= []; ref $opt{searchpath} eq 'ARRAY' or $opt{searchpath} = [ $opt{searchpath} ]; my $fa = delete $opt{forcearray} || 0; my (@fa_regex, %fa_elem); if(ref $fa) { foreach (ref $fa eq 'ARRAY' ? @$fa : $fa) { if(ref $_ eq 'Regexp') { push @fa_regex, $_ } else { $fa_elem{$_} = 1 } } } else { $opt{forcearray_always} = $fa } $opt{forcearray_regex} = \@fa_regex; $opt{forcearray_elem} = \%fa_elem; # Special cleanup for {keyattr} which could be arrayref or hashref, # which behave differently. my $ka = $opt{keyattr} || \@default_attributes; $ka = [ $ka ] unless ref $ka; if(ref $ka eq 'ARRAY') { if(@$ka) { $opt{keyattr} = $ka } else { delete $opt{keyattr} } } elsif(ref $ka eq 'HASH') { # Convert keyattr => { elem => '+attr' } # to keyattr => { elem => [ 'attr', '+' ] } my %at; while(my($k,$v) = each %$ka) { $v =~ /^(\+|-)?(.*)$/; $at{$k} = [ $2, $1 || '' ]; } $opt{keyattr} = \%at; } # Special cleanup for {valueattr} which could be arrayref or hashref my $va = delete $opt{valueattr} || {}; $va = +{ map +($_ => 1), @$va } if ref $va eq 'ARRAY'; $opt{valueattrlist} = $va; # make sure there's nothing weird in {grouptags} !$opt{grouptags} || ref $opt{grouptags} eq 'HASH' or croak "Illegal value for 'GroupTags' option -expected a hashref"; $opt{parseropts} ||= {}; \%opt; } sub find_xml_file($@) { my ($self, $file) = (shift, shift); my @search_path = @_ ? @_ : '.'; my ($filename, $filedir) = fileparse $file; if($filename eq $file) { foreach my $path (@search_path) { my $fullpath = File::Spec->catfile($path, $file); return $fullpath if -e $fullpath; } } elsif(-e $file) # Ignore searchpath if dir component { return $file; } local $" = ':'; die "data source $file not found in @search_path\n"; } sub _add_kv($$$$) { my ($d, $k, $v, $opts) = @_; if(defined $d->{$k}) { # Combine duplicate attributes into arrayref if required if(ref $d->{$k} eq 'ARRAY') { push @{$d->{$k}}, $v } else { $d->{$k} = [ $d->{$k}, $v ] } } elsif(ref $v eq 'ARRAY') { push @{$d->{$k}}, $v } elsif(ref $v eq 'HASH' && $k ne $opts->{contentkey} && $opts->{forcearray_always}) { push @{$d->{$k}}, $v } elsif($opts->{forcearray_elem}{$k} || grep $k =~ $_, @{$opts->{forcearray_regex}} ) { push @{$d->{$k}}, $v } else { $d->{$k} = $v } $d->{$k}; } # Takes the parse tree that XML::LibXML::Parser produced from the supplied # XML and recurse through it 'collapsing' unnecessary levels of indirection # (nested arrays etc) to produce a data structure that is easier to work with. sub _expand_name($) { my $node = shift; my $uri = $node->namespaceURI || ''; (length $uri ? "{$uri}" : '') . $node->localName; } sub collapse($$) { my ($self, $xml, $opts) = @_; $xml->isa('XML::LibXML::Element') or return; my (%data, $text); my $hooks = $self->{XCS_hooks}; unless($opts->{noattr}) { ATTR: foreach my $attr ($xml->attributes) { my $value; if($hooks && (my $hook = $hooks->{$attr->unique_key})) { $value = $hook->($attr); defined $value or next ATTR; } else { $value = $attr->value; } $value = $self->normalise_space($value) if !ref $value && $opts->{normalisespace}==2; my $name = !$attr->isa('XML::LibXML::Attr') ? $attr->nodeName : $opts->{nsexpand} ? _expand_name($attr) : $opts->{nsstrip} ? $attr->localName : $attr->nodeName; _add_kv \%data, $name => $value, $opts; } } my $nr_attrs = keys %data; my $nr_elems = 0; CHILD: foreach my $child ($xml->childNodes) { if($child->isa('XML::LibXML::Text')) { $text .= $child->data; next CHILD; } $child->isa('XML::LibXML::Element') or next CHILD; $nr_elems++; my $v; if($hooks && (my $hook = $hooks->{$child->unique_key})) { $v = $hook->($child) } else { $v = $self->collapse($child, $opts) } next CHILD if ! defined $v && $opts->{suppressempty}; my $name = $opts->{nsexpand} ? _expand_name($child) : $opts->{nsstrip} ? $child->localName : $child->nodeName; _add_kv \%data, $name => $v, $opts; } $text = $self->normalise_space($text) if defined $text && $opts->{normalisespace}==2; return $opts->{forcecontent} ? { $opts->{contentkey} => $text } : $text if $nr_attrs+$nr_elems==0 && defined $text; $data{$opts->{contentkey}} = $text if defined $text && $nr_elems==0; # Roll up 'value' attributes (but only if no nested elements) if(keys %data==1) { my ($k) = keys %data; return $data{$k} if $opts->{valueattrlist}{$k}; } # Turn arrayrefs into hashrefs if key fields present if($opts->{keyattr}) { while(my ($key, $val) = each %data) { $data{$key} = $self->array_to_hash($key, $val, $opts) if ref $val eq 'ARRAY'; } } # disintermediate grouped tags if(my $gr = $opts->{grouptags}) { ELEMENT: while(my ($key, $val) = each %data) { my $sub = $gr->{$key} or next; if(ref $val eq 'ARRAY') { next ELEMENT if grep { keys %$_!=1 || !exists $_->{$sub} } @$val; $data{$key} = { map { %{$_->{$sub}} } @$val }; } else { ref $val eq 'HASH' && keys %$val==1 or next; my ($child_key, $child_val) = %$val; $data{$key} = $child_val if $gr->{$key} eq $child_key; } } } # Fold hashes containing a single anonymous array up into just the array return $data{anon} if keys %data == 1 && exists $data{anon} && ref $data{anon} eq 'ARRAY'; # Suppress empty elements? if(! keys %data && exists $opts->{suppressempty}) { my $sup = $opts->{suppressempty}; return +(defined $sup && $sup eq '') ? '' : undef; } # Roll up named elements with named nested 'value' attributes if(my $va = $opts->{valueattrlist}) { while(my($key, $val) = each %data) { $va->{$key} && ref $val eq 'HASH' && keys %$val==1 or next; $data{$key} = $val->{$va->{$key}}; } } $nr_elems+$nr_attrs ? \%data : !defined $text ? {} : $opts->{forcecontent} ? { $opts->{contentkey} => $text } : $text; } sub normalise_space($) { my $self = shift; local $_ = shift; s/^\s+//s; s/\s+$//s; s/\s\s+/ /sg; $_; } # Attempts to 'fold' an array of hashes into an hash of hashes. Returns a # reference to the hash on success or the original array if folding is # not possible. Behaviour is controlled by 'keyattr' option. # sub array_to_hash($$$$) { my ($self, $name, $in, $opts) = @_; my %out; my $ka = $opts->{keyattr} or return $in; if(ref $ka eq 'HASH') { my $newkey = $ka->{$name} or return $in; my ($key, $flag) = @$newkey; foreach my $h (@$in) { unless(ref $h eq 'HASH' && defined $h->{$key}) { warn "<$name> element has no '$key' key attribute\n" if $^W; return $in; } my $val = $h->{$key}; if(ref $val) { warn "<$name> element has non-scalar '$key' key attribute\n" if $^W; return $in; } $val = $self->normalise_space($val) if $opts->{normalisespace}==1; warn "<$name> element has non-unique value in '$key' " . "key attribute: $val\n" if $^W && defined $out{$val}; $out{$val} = { %$h }; $out{$val}{"-$key"} = $out{$val}{$key} if $flag eq '-'; delete $out{$val}{$key} if $flag ne '+'; } } else # Arrayref { my $default_keys = "@default_attributes" eq "@$ka"; ELEMENT: foreach my $h (@$in) { ref $h eq 'HASH' or return $in; foreach my $key (@$ka) { my $val = $h->{$key}; defined $val or next; if(ref $val) { warn "<$name> element has non-scalar '$key' key attribute" if $^W && ! $default_keys; return $in; } $val = $self->normalise_space($val) if $opts->{normalisespace} == 1; warn "<$name> element has non-unique value in '$key' " . "key attribute: $val" if $^W && $out{$val}; $out{$val} = { %$h }; delete $out{$val}{$key}; next ELEMENT; } return $in; # No keyfield matched } } $opts->{collapseagain} or return \%out; # avoid over-complicated structures like # dir => { libexecdir => { content => '$exec_prefix/libexec' }, # localstatedir => { content => '$prefix' }, # } # into # dir => { libexecdir => '$exec_prefix/libexec', # localstatedir => '$prefix', # } my $contentkey = $opts->{contentkey}; # first go through the values, checking that they are fit to collapse foreach my $v (values %out) { next if !defined $v; next if ref $v eq 'HASH' && keys %$v == 1 && exists $v->{$contentkey}; next if ref $v eq 'HASH' && !keys %$v; return \%out; } $out{$_} = $out{$_}{$contentkey} for keys %out; \%out; } 1; __END__