package Monkey::APAParser; # End of sentence our $EOS = qr/[\.\?\!]/; our $IS_MONTH = qr/\b( jan(?:uary)? | feb(?:r?uary)? | mar(?:ch)? | apr(?:il)? | may | june? | july? | aug(?:ust)? | sep(?:t(?:ember)?)? | oct(?:ober)? | nov(?:ember)? | dec(?:ember)? )\b/ix; our $TOKENS = qr/[\s\p{IsPunct}]*(?:__[A-Z]+__[\s\p{IsPunct}]*)+/; our $TITLE = qr/(__TITLE__(?:\s+__[A-Z]+__)*\.\s*)/; our $BOOKTITLE = qr/(__BOOKTITLE__(?:\s+__[A-Z]+__)*\.\s*)/; sub new { my ($class) = @_; return bless {}, $class; } sub cleanup { my ($self, $string, $key) = @_; warn "TOKEN IN STRING:\n$self->{_original}\n$key: $string\n\n" if $key && $key ne 'string' && $key ne 'template' && $string =~ /__[A-Z]+__/; warn "DATE IN STRING:\n$self->{_original}\n$key: $string\n\n" if $key && $key ne 'date' && $key ne 'year' && $key ne 'string' && $key ne 'template' && $string =~ /$IS_MONTH/i; $string =~ s/Prodeedings/Proceedings/is; $string =~ s/\s+/ /sg; $string =~ s/^\s*//sg; $string =~ s/[\s\.,]*$//sg; $string =~ tr/\x91\x92\x93\x94\x96\x97/''""\-\-/s; $string =~ s/''+//sg; # Get rid of double quotes and wiki emphasis # Fix et al. $string =~ s/ et al$/ et al./i; warn "EMPTY STRING:\n$self->{_original}\n$key: $string\n\n" unless !$key || $string =~ /\S/; return $string; } sub cleanup_all { my ($self) = @_; foreach (keys %$self) { if( /^_/ ) { delete $self->{$_} if /^_/ } else { $self->{$_} = $self->cleanup($self->{$_}, $_) } } return $self; } sub canonicalize_names { my ($self, $string) = @_; # Squeeze the string $string =~ s/\s+/ /; # Save the character entities! $string =~ s/\&(\#?\w+;)/\$$1/sg; # Break out the individual names so we can # put them back together. my @names = grep $_, split /\s*(?:,|&|and)\s*/, $string; # Names seem inverted if there is a segment # consisting only of initials and no full names # or starting with a "von" or "de". if( grep { !/\p{IsUpper}\S*\p{IsLower}/ || /^\s*\p{IsLower}+\b/ } @names ) { my @inverted; while (@names) { my @name = reverse splice @names, 0, 2; push @inverted, join ' ', @name; } @names = @inverted; } # Reconnect the names my $names = join ' and ', @names; # Fix any stray initials missing periods. $names =~ s/(\p{IsUpper})\s/$1./sg; # Fix any squished initials. $names =~ s/(\p{IsUpper}\.)\s+/$1 /sg; # Replace the character entities $names =~ s/\$(\#?\w+;)/\&$1/sg; return $names; } sub token_tail { my ($self, $string) = @_; my $tail = $1 if $string =~ s/($TOKENS)$//; return ($tail, $string) if wantarray; return $tail; } sub parse { my ($self, $string) = @_; $self->{_original} = $string; $string = $self->cleanup( $string ); $self->{string} = $string; my @methods = ( # Electronic resource (web pages, FTP sites) should be at the # end because they have been tacked onto the older APA style. # Get rid of it so we can move onto the classical stuff. 'electronic_resource', # Now do the easy bits that float through the string, # especially bits like pages and edition that have # periods that interfere with the EOS pattern 'year', 'author_or_institution', 'edition', 'pages', # The title comes after the year, so it's easy to find. 'title', # Journal articles are easy to identify as well, so # try them next. 'journal_article', # Getting trickier 'inside_book', 'publisher', 'author', 'conference', # Mop up the rest 'interrupted_title', 'tail', ); eval { while( my $method = shift @methods ) { $method = "parse_$method"; $self->$method; } }; return "$@\n$self->{_original}" if $@; foreach (qw(author editor)) { $self->{$_} = $self->canonicalize_names($self->{$_}) if exists $self->{$_} && $self->{$_} =~ /\S/; } $self->{template} = $self->{string}; $self->cleanup_all; return "Failed to parse" unless $self->{string} =~ /^$TOKENS$/; return "No type" unless $self->{bibtype}; delete $self->{string}; return; } sub parse_electronic_resource { my ($self) = @_; my $UrlCharacter = qr"[\p{IsAlnum}\;/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\'\(\)\%\#\|]"; my $UrlProtocols = "http|https|ftp|news|mailto|telnet|gopher"; my $UrlPattern = qr<((?:$UrlProtocols):$UrlCharacter+)>; if( $self->{string} =~ s/$UrlPattern/__URL__/ ) { $self->{url} = $1; # Lose the whole electronic reference part. $self->{string} =~ s/(Accessed|Retrieved|Available).*?__URL__.*//i; # Best bet so far $self->{bibtype} = 'url'; } } # Years can be of the form (2004b), (August, 2004), (n.d.), # (n.d-a), (forthcoming) sub parse_year { my ($self) = @_; die "Invalid year" unless $self->{string} =~ s/\(([^\)]*\b(\d{4}[a-z]?|n\.d[\.\-].*?|forthcoming)\s*)\)/__YEAR__/; $self->{year} = $1; } # Whatever comes before the year is the author, or the # institution that wrote the technical report. sub parse_author_or_institution { my ($self) = @_; die "Invalid author" unless $self->{string} =~ s/^(.*?)\s+__YEAR__/__AUTHOR__ __YEAR__/; $self->{_author_or_institution} = $1; die "Invalid author\n__AUTHOR__=$self->{_author_of_institution}" unless $self->{_author_or_institution} =~ /\S/; } sub parse_edition { my ($self) = @_; if( $self->{string} =~ s/\(\s*(\d+[a-z]+)\s+ed\b\.?\s*\)/__EDITION__/i || $self->{string} =~ s/,\s+(\d+[a-zA-Z]+)\s+ed\b\.?/, __EDITION__/i ) { $self->{edition} = $1; } } sub parse_pages { my ($self) = @_; if( $self->{string} =~ s/\(?(pp?)\.?\s*(\d+)(?:-+(\d+))\s*\)?/__PAGES__/ || $self->{string} =~ s/(\p{IsPunct})\s*(\d+)-+(\d+)\s*\P{IsAlnum}*$/$1 __PAGES__ / ) { ($self->{pages_start}, $self->{pages_end}) = ($2, $3); } # Single page? elsif( $self->{string} =~ s/,\s*(\d+)\s*$/, __PAGES__/ ) { ($self->{pages_start}, $self->{pages_end}) = ($1, $1); } } # The title comes immediately after the year. Titles should # be terminated with a EOS. They may be wrapped in "quotes", # in which case the title is terminated at the end quote. # # NOTE: For this to work, bits that may have periods in it # like (3rd ed.) or (pp. 10-42) need to have already been removed. sub parse_title { my ($self) = @_; if ( $self->{string} =~ s/__YEAR__[\.,\s]+(['"])(.*?)\1\s*\p{IsPunct}?\s*/__YEAR__ __TITLE__. / || $self->{string} =~ s/__YEAR__[\.,\s]+(.*?$EOS)\s*/__YEAR__ __TITLE__. / ) { $self->{bibtitle} = $2 || $1; die "Invalid title\n__TITLE__==$self->{bibtitle}" unless $self->{bibtitle} =~ /\S/; # We may have sucked up the __EDITION__. Put it back. my $tail; ($tail, $self->{bibtitle}) = $self->token_tail( $self->{bibtitle} ); $self->{string} =~ s/__TITLE__/__TITLE__ $tail/ if $tail; } # Although this is crap, some people write: # Web page title (year). http://... elsif( $self->{url} ) { $self->{bibtitle} = $self->{_author_or_institution}; delete $self->{_author_or_institution}; } else { die "Invalid title"; } } sub parse_journal_article { my ($self) = @_; # First, look for the pattern 3(4), which indicates the volume # and issue of a journal article. Note that sometimes issues are # combined, so it may appear as 3(4&5) or 3(4-5). # # If that fails, just look for , 3, 10-14, meaning the volume # and then the pages. if( $self->{string} =~ s/(\d+)\s*\(([\d\p{IsPunct}\s]+)\)\s*/__VOLUME__(__ISSUE__)/ || $self->{string} =~ s/,\s*(\d+)\s*,\s*__PAGES__/, __VOLUME__, __PAGES__/ ) { $self->{volume} = $1; $self->{issue} = $2 if $2; $self->{bibtype} = 'article'; # Grab the journal title, which preceeds the volume. die "Invalid journal" unless $self->{string} =~ s/$TITLE(.*?),?\s*__VOLUME__/$1\__JOURNAL__, __VOLUME__/; $self->{journal} = $2; } } # A pair XXX: YYY at the end of the string usually is the # publisher information. In the case of a technical report, # the stuff after the colon is the author(s). sub parse_publisher { my ($self) = @_; my @bits = split /([:\.\!\?])/, $self->{string}; my @tail; push @tail, pop @bits while $bits[$#bits] =~ /^$TOKENS$|^[\s\p{IsPunct}]+$/; return unless @bits >= 3; my ($location, $colon, $publisher) = splice @bits,-3; push @bits, $1 while $location =~ s/^(.*$TOKENS\s*)//; return unless $colon eq ':'; return unless $location =~ /\p{IsAlpha}/; push @bits, ' __LOCATION__: '; my $tail; ($tail, $publisher) = $self->token_tail( $publisher ); return unless $publisher =~ /\p{IsAlpha}/; push @bits, ' __PUBLISHER__'; push @bits, $tail if $tail; $self->{location} = $location; # It may be a technical report, so decide on this later $self->{_author_or_publisher} = $publisher; # If it has a publisher, let's assume it is a book # until we know otherwise. $self->{bibtype} = 'book' unless $self->{bibtype}; # Rebuild the string $self->{string} = join '', @bits, @tail; } sub parse_author { my ($self) = @_; # Already claimed as the title by a webpage. return unless exists $self->{_author_or_institution}; # Use some heuristics to guess if it is a technical # report, in which case the bit after the last colon # is the author. Otherwise it is the publisher. if(( $self->{_author_or_publisher} =~ /\b[A-Z]\./ || $self->{_author_or_publisher} =~ /et[.\s]+al/ || $self->{_author_or_institution} !~ /,/ ) && !( $self->{_author_or_institution} =~ /et[.\s]+al/i || $self->{_author_or_institution} =~ /\(\s*ed.\s*\)/i || $self->{_author_or_publisher} =~ /P\.\O\.\s*Box/i ) ) { $self->{bibtype} = 'techreport'; # Ok, swap things around. $self->{string} =~ s/__PUBLISHER__/__AUTHOR__/; $self->{string} =~ s/^__AUTHOR__/__INSTITUTION__/; $self->{author} = $self->{_author_or_publisher} if $self->{_author_or_publisher} =~ /\S/; $self->{institution} = $self->{_author_or_institution}; } # It may be a book without authors, just editors. elsif( $self->{_author_or_institution} =~ s/\(eds?\.\)[\.,]?//i ) { $self->{editor} = $self->{_author_or_institution}; $self->{string} =~ s/__AUTHOR__/__EDITOR__/; $self->{publisher} = $self->{_author_or_publisher} if $self->{_author_or_publisher} =~ /\S/; } # Otherwise, the authors are authors and the publisher is # a publisher. else { $self->{author} = $self->{_author_or_institution}; $self->{publisher} = $self->{_author_or_publisher} if $self->{_author_or_publisher} =~ /\S/; } } # Inside a book: Chapter or conference paper sub parse_inside_book { my ($self) = @_; $self->{string} =~ s/(,|$EOS) In\s+/$1 __IN__ /i; # It may inside a book, with editors. if( $self->{string} =~ s/__IN__\s+(.*?)\(eds?\.?\)\s*[,\.]?\s*/__IN__ __EDITORS__, /i ) { $self->{editor} = $1; $self->{bibtype} = 'inbook'; # The book title should come after. Again this relies on all # bits with periods having been removed already. return "Invalid book title: $self->{string}" unless $self->{string} =~ s/__EDITORS__,\s*(.*?)$EOS\s*/__EDITORS__, __BOOKTITLE__. / || $self->{string} =~ s/__EDITORS__,\s*(.*?)\s*__/__EDITORS__, __BOOKTITLE__. __/; $self->{booktitle} = $1; } # It may be inside a book, but with no editors cited. # If it's "In" something, but there are no (Eds.), then # assume the bit after "In" is the book title. elsif( $self->{string} =~ s/__IN__\s*(.*?)\./__IN__ __BOOKTITLE__\./i || $self->{string} =~ s/__IN__\s*(.*?)\,/__IN__ __BOOKTITLE__\./i ) { $self->{booktitle} = $1; $self->{bibtype} = 'inbook'; } # It may be conference proceedings written by someone too lazy # to write the In. elsif( $self->{string} =~ s/(Proceedings.*?)\./__IN__ __BOOKTITLE__\./i ) { $self->{booktitle} = $1; $self->{bibtype} = 'proceedings'; } # We may have sucked up the __EDITION__ or __PAGES__. Put it back. if( $self->{booktitle} ) { my $tail; ($tail, $self->{booktitle}) = $self->token_tail( $self->{booktitle} ); $self->{string} =~ s/__BOOKTITLE__/__BOOKTITLE__ $tail/ if $tail; } } sub parse_conference { my ($self) = @_; # It isn't a book or a chapter if the book title contains # 'Proceedings'. Then it is a conference. if( $self->{booktitle} =~ /Proceedings/ ) { # It's a conference submission if it is "In" the proceedings. $self->{bibtype} = $self->{string} =~ /__IN__ / ? 'inproceedings' : 'proceedings'; $self->parse_event_location; } if( $self->{string} =~ s/$TITLE(.*?Conference),\s*/$1\__ORGANIZATION__, / ) { $self->{organization} = $2; $self->parse_event_location; } } # If it is a conference, the conference location and # date come afterwards. sub parse_event_location { my ($self) = @_; # To be safe, grab everything we haven't parsed. # Put __ first to prevent sucking up __PAGES__ # Put \. first to prevent inhaling past \. # Put eat to the end of string last my $location; if( $self->{booktitle} =~ /$IS_MONTH/i ) { my @bits = split ',', $self->{booktitle}; $self->{booktitle} = shift @bits; $location = join ',', @bits; $self->{string} =~ s/$BOOKTITLE/$1\__EVENTLOCATION__. /; } elsif( $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(__PAGES__\.?\s*)?(.*?)(\s*__|\.\s*)/$1$3\__EVENTLOCATION__$5/ # || $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(.*?)\.\s*/$1\__EVENTLOCATION__. / || $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(__PAGES__\.?\s*)?([^\._]+)$/$1$3\__EVENTLOCATION__/ ) { $location = $4; } if( $location ) { $self->parse_organization; ($self->{date}, $self->{event_location}) = $self->extract_date( $location ); if( $self->{date} =~ /\S/ ) { # Mark the date in the string template $self->{string} =~ s/__EVENTLOCATION__/__EVENTLOCATION__, __DATE__/; } else { delete $self->{date}; } unless( $self->{event_location} =~ /\p{IsAlpha}/ ) { delete $self->{event_location}; $self->{string} =~ s/\s*__EVENTLOCATION__//; } } } # Organizations are only relevant to conferences. It should come # right after the event location and date. sub parse_organization { my ($self) = @_; if( $self->{string} =~ s/__EVENTLOCATION__\.?\s+([^_].+?)(_|$)/__EVENTLOCATION__. __ORGANIZATION__, $2/ ) { $self->{organization} = $1; unless( $self->{organization} =~ /\p{IsAlpha}/ ) { delete $self->{organization}; $self->{string} =~ s/__ORGANIZATION__, //; } } } # Extracts a date fragment from a CSV string and returns # the two strings. sub extract_date { my ($self, $string) = @_; my $date; # Search for the month inside the string. my @bits = split /\s*,\s*/, $string; my $month; for( $month=0; $month < @bits; $month++ ) { last if $bits[$month] =~ /$IS_MONTH/; } if( $month < @bits ) { # Scan backwards looking for any days or year. # $start will end up one before the actual date my $start = $month-1; $start-- while $bits[$start] =~ /^[\d\s\-]+$/; # Scan forwards looking for any days or year # $end will end up one after the actual date. my $end = $month+1; $end++ while $bits[$end] =~ /^[\d\s\-]+$/; # Rip out the date. Since the date spans from # ($start+1)..($end-1) inclusive, the length is # ($end-1)-($self->{string}+1)+1 =$end - 1 - $self->{string} - 1 + 1 my @date = splice @bits, $start+1, ($end-$start-1); $date = join ', ', @date; # Remove the year $date =~ s/(,\s*?)\d{4}//; } my $remains = join ', ', @bits if grep /\S/, @bits; return ($date, $remains) if wantarray; return $date; } # Titles with rhetorical questions fail to eat the whole title. # e.g. Do you like cheese? Of course you do! # will parse to # __YEAR__ __TITLE__. Of course you do! __... sub parse_interrupted_title { my ($self) = @_; if( $self->{bibtitle} =~ /[\?\!]$/ && $self->{string} =~ s/$TITLE(.*?)\s+__/$1\__/ ) { my $fragment = $2; if( $fragment =~ /$TOKENS/ ) { $self->{string} =~ s/$TITLE\__/$1$fragment __/; } elsif( $fragment =~ /\p{IsAlpha}/ ) { $self->{bibtitle} .= " $fragment"; } } } sub parse_tail { my ($self) = @_; my @bits = split /[\.,]\s*/, $self->{string}; my @tail; unshift @tail, pop @bits while @bits && $bits[$#bits] !~ /$TOKENS/; if( grep /\p{IsAlpha}/, @tail ) { $self->{institution} = join ', ', @tail; $self->{string} = join ', ', @bits, '__INSTITUTION__'; $self->{bibtype} = 'techreport'; } } 1;