package WWW::Search::PubMed; =head1 NAME WWW::Search::PubMed - Search the NCBI PubMed abstract database. =head1 SYNOPSIS use WWW::Search; my $s = new WWW::Search ('PubMed'); $s->native_query( 'ACGT' ); while (my $r = $s->next_result) { print $r->title . "\n"; print $r->description . "\n"; } =head1 DESCRIPTION WWW::Search::PubMed provides a WWW::Search backend for searching the NCBI/PubMed abstracts database. =head1 VERSION This document describes WWW::Search::PubMed version 1.004, released 31 October 2007. =head1 REQUIRES L L =cut our($VERSION) = '1.004'; use strict; use warnings; require WWW::Search; require WWW::SearchResult; use WWW::Search::PubMed::Result; use base qw(WWW::Search); use XML::DOM; our $debug = 0; use constant ARTICLES_PER_REQUEST => 20; use constant QUERY_ARTICLE_LIST_URI => 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=500'; # term=ACTG use constant QUERY_ARTICLE_INFO_URI => 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed'; # &id=12167276&retmode=xml =begin private =item C<< native_setup_search ( $query, $options ) >> Sets up the NCBI search using the supplied C<$query> string. =end private =cut sub native_setup_search { my $self = shift; my $query = shift; my $options = shift; $self->user_agent( "WWW::Search::PubMed/${VERSION} libwww-perl/${LWP::VERSION}; " ); my $ua = $self->user_agent(); my $url = QUERY_ARTICLE_LIST_URI . '&term=' . WWW::Search::escape_query($query); my $response = $ua->get( $url ); my $success = $response->is_success; if ($success) { my $parser = new XML::DOM::Parser; my $content = $response->content; $self->{'_xml_parser'} = $parser; my $doc = $parser->parse( $content ); $self->{'_count'} = eval { ($doc->getElementsByTagName('Count')->item(0)->getChildNodes)[0]->getNodeValue() } || 0; my @articles; my $ids = $doc->getElementsByTagName('Id'); my $n = $ids->getLength; foreach my $i (0 .. $n - 1) { my $node = $ids->item( $i ); my @children = $node->getChildNodes(); push(@articles, + $children[0]->getNodeValue() ); } $self->{'_article_ids'} = \@articles; } else { return undef; } } =begin private =item C<< native_retrieve_some >> Requests search results from NCBI, adding the results to the WWW::Search object's cache. =end private =cut sub native_retrieve_some { my $self = shift; return undef unless scalar (@{ $self->{'_article_ids'} || [] }); my $ua = $self->user_agent(); my $url = QUERY_ARTICLE_INFO_URI . '&id=' . join(',', splice(@{ $self->{'_article_ids'} },0,ARTICLES_PER_REQUEST)) . '&retmode=xml'; warn 'Fetching URL: ' . $url if ($debug); my $response = $ua->get( $url ); if ($response->is_success) { my $content = $response->content; if ($debug) { open (my $fh, ">/tmp/pubmed.article.info"); print { $fh } $content; close($fh); warn "Saved response in /tmp/pubmed.article.info\n"; } my $doc = $self->{'_xml_parser'}->parse( $content ); my $articles = $doc->getElementsByTagName('PubmedArticle'); my $n = $articles->getLength; warn "$n articles found\n" if ($debug); my $count = 0; foreach my $i (0 .. $n - 1) { my $article = $articles->item( $i ); my $id = ($article->getElementsByTagName('PMID')->item(0)->getChildNodes)[0]->getNodeValue(); warn "$id\n" if ($debug); my $title = ($article->getElementsByTagName('ArticleTitle')->item(0)->getChildNodes)[0]->getNodeValue(); warn "\t$title\n" if ($debug); my $url = 'http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=' . $id . '&dopt=Abstract'; my @authors; my $authornodes = $article->getElementsByTagName('Author'); my $n = $authornodes->getLength; foreach my $i (0 .. $n - 1) { my ($author, $fname, $lname); eval { $author = $authornodes->item($i); $lname = ($author->getElementsByTagName('LastName')->item(0)->getChildNodes)[0]->getNodeValue(); $fname = substr( ($author->getElementsByTagName('ForeName')->item(0)->getChildNodes)[0]->getNodeValue(), 0, 1) . '.'; }; if ($@) { warn $@ if ($debug); next unless ($lname); } else { push(@authors, join(' ', $lname, $fname)); } } my $author = join(', ', @authors); warn "\t$author\n" if ($debug); my $journal = $self->get_text_node( $article, 'MedlineTA' ); my $page = $self->get_text_node( $article, 'MedlinePgn' ); my $volume = $self->get_text_node( $article, 'Volume' ); my $issue = $self->get_text_node( $article, 'Issue' ); my $pmid = $self->get_text_node( $article, 'PMID' ); my $abstract = $self->get_text_node( $article, 'AbstractText' ); my @date; { my $date = $article->getElementsByTagName('PubDate')->item(0); push(@date, $self->get_text_node( $date, 'Year' )); push(@date, $self->get_text_node( $date, 'Month' )); push(@date, $self->get_text_node( $date, 'Day' )); } my $hit = new WWW::Search::PubMed::Result; my $source = ''; my $date = join(' ', grep defined, @date); $hit->date( $date ); $hit->year( $date[0] ) if (defined($date[0])); $hit->month( $date[1] ) if (defined($date[1])); $hit->day( $date[2] ) if (defined($date[2])); $source = "${journal}. " . ($date ? "${date}; " : '') . ($volume ? "${volume}" : '') . ($issue ? "(${issue})" : '') . ($page ? ":$page" : ''); $source = "(${source})" if ($source); warn "\t$source\n" if ($debug); $hit->add_url( $url ); $hit->title( $title ); $hit->pmid( $pmid ); $hit->abstract( $abstract ) if ($abstract); my $desc = join(' ', grep {$_} ($author, $source)); $hit->description( $desc ); push( @{ $self->{'cache'} }, $hit ); $count++; warn "$count : $title\n" if ($debug); } return $count; } else { warn "Uh-oh." . $response->error_as_HTML(); return undef; } } =begin private =item C<< get_text_node ( $node, $name ) Returns the text contained in the named descendent of the XML $node. =end private =cut sub get_text_node { my $self = shift; my $node = shift; my $name = shift; my $text = eval { ($node->getElementsByTagName($name)->item(0)->getChildNodes)[0]->getNodeValue() }; if ($@) { warn "XML[$name]: $@" if ($debug); return undef; } else { warn "XML[$name]: $text\n" if ($debug); return $text; } } 1; __END__ =head1 SEE ALSO L L L L =head1 COPYRIGHT Copyright (c) 2003-2007 Gregory Todd Williams. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR Gregory Todd Williams C<< >> =cut