package Monkey::APAParser;

# End of sentence
our $EOS = qr/[\.\?\!]/;
our $IS_MONTH = qr/\b(
      jan(?:uary)?
    | feb(?:r?uary)?
    | mar(?:ch)?
    | apr(?:il)?
    | may
    | june?
    | july?
    | aug(?:ust)?
    | sep(?:t(?:ember)?)?
    | oct(?:ober)?
    | nov(?:ember)?
    | dec(?:ember)?
)\b/ix;
our $TOKENS = qr/[\s\p{IsPunct}]*(?:__[A-Z]+__[\s\p{IsPunct}]*)+/;
our $TITLE = qr/(__TITLE__(?:\s+__[A-Z]+__)*\.\s*)/;
our $BOOKTITLE = qr/(__BOOKTITLE__(?:\s+__[A-Z]+__)*\.\s*)/;

sub new {
    my ($class) = @_;

    return bless {}, $class;
}

sub cleanup {
    my ($self, $string, $key) = @_;

    warn "TOKEN IN STRING:\n$self->{_original}\n$key: $string\n\n"
        if    $key
           && $key ne 'string'
           && $key ne 'template'
           && $string =~ /__[A-Z]+__/;

    warn "DATE IN STRING:\n$self->{_original}\n$key: $string\n\n"
        if    $key
           && $key ne 'date'
           && $key ne 'year'
           && $key ne 'string'
           && $key ne 'template'
           && $string =~ /$IS_MONTH/i;

    $string =~ s/Prodeedings/Proceedings/is;

    $string =~ s/\s+/ /sg;
    $string =~ s/^\s*//sg;
    $string =~ s/[\s\.,]*$//sg;
    $string =~ tr/\x91\x92\x93\x94\x96\x97/''""\-\-/s;
    $string =~ s/''+//sg; # Get rid of double quotes and wiki emphasis

    # Fix et al.
    $string =~ s/ et al$/ et al./i;

    warn "EMPTY STRING:\n$self->{_original}\n$key: $string\n\n"
        unless !$key || $string =~ /\S/;

    return $string;
}

sub cleanup_all {
    my ($self) = @_;

    foreach (keys %$self) {
        if( /^_/ ) {
            delete $self->{$_} if /^_/
        } else {
            $self->{$_} = $self->cleanup($self->{$_}, $_)
        }
    }
    return $self;
}

sub canonicalize_names {
    my ($self, $string) = @_;

    # Squeeze the string
    $string =~ s/\s+/ /;

    # Save the character entities!
    $string =~ s/\&(\#?\w+;)/\$$1/sg;

    # Break out the individual names so we can
    # put them back together.
    my @names = grep $_, split /\s*(?:,|&|and)\s*/, $string;

    # Names seem inverted if there is a segment
    # consisting only of initials and no full names
    # or starting with a "von" or "de".
    if( grep {
            !/\p{IsUpper}\S*\p{IsLower}/ || /^\s*\p{IsLower}+\b/
        } @names
    ) {
        my @inverted;
        while (@names) {
            my @name = reverse splice @names, 0, 2;
            push @inverted, join ' ', @name;
        }
        @names = @inverted;
    }

    # Reconnect the names
    my $names = join ' and ', @names;

    # Fix any stray initials missing periods.
    $names =~ s/(\p{IsUpper})\s/$1./sg;

    # Fix any squished initials.
    $names =~ s/(\p{IsUpper}\.)\s+/$1 /sg;

    # Replace the character entities
    $names =~ s/\$(\#?\w+;)/\&$1/sg;

    return $names;
}

sub token_tail {
    my ($self, $string) = @_;

    my $tail = $1 if $string =~ s/($TOKENS)$//;
    return ($tail, $string) if wantarray;
    return $tail;
}

sub parse {
    my ($self, $string) = @_;

    $self->{_original} = $string;

    $string = $self->cleanup( $string );
    $self->{string} = $string;

    my @methods = (
        # Electronic resource (web pages, FTP sites) should be at the
        # end because they have been tacked onto the older APA style.
        # Get rid of it so we can move onto the classical stuff.
        'electronic_resource',

        # Now do the easy bits that float through the string,
        # especially bits like pages and edition that have
        # periods that interfere with the EOS pattern
            'year',
            'author_or_institution',
            'edition',
            'pages',

        # The title comes after the year, so it's easy to find.
        'title',

        # Journal articles are easy to identify as well, so
        # try them next.
        'journal_article',

        # Getting trickier
        'inside_book',
        'publisher',
        'author',
        'conference',

        # Mop up the rest
        'interrupted_title',
        'tail',
    );

    eval {
        while( my $method = shift @methods ) {
            $method = "parse_$method";
            $self->$method;
        }
    };
    return "$@\n$self->{_original}" if $@;

    foreach (qw(author editor)) {
        $self->{$_} = $self->canonicalize_names($self->{$_})
            if exists $self->{$_} && $self->{$_} =~ /\S/;
    }

    $self->{template} = $self->{string};
    $self->cleanup_all;

    return "Failed to parse"
        unless $self->{string} =~ /^$TOKENS$/;

    return "No type" unless $self->{bibtype};

    delete $self->{string};

    return;
}

sub parse_electronic_resource {
    my ($self) = @_;

    my $UrlCharacter = qr"[\p{IsAlnum}\;/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\'\(\)\%\#\|]";
    my $UrlProtocols = "http|https|ftp|news|mailto|telnet|gopher";
    my $UrlPattern = qr<((?:$UrlProtocols):$UrlCharacter+)>;

    if( $self->{string} =~ s/$UrlPattern/__URL__/ ) {
        $self->{url} = $1;

        # Lose the whole electronic reference part.
        $self->{string} =~ s/(Accessed|Retrieved|Available).*?__URL__.*//i;

        # Best bet so far
        $self->{bibtype} = 'url';
    }
}

# Years can be of the form (2004b), (August, 2004), (n.d.),
# (n.d-a), (forthcoming)
sub parse_year {
    my ($self) = @_;
    die "Invalid year"
        unless $self->{string} =~ s/\(([^\)]*\b(\d{4}[a-z]?|n\.d[\.\-].*?|forthcoming)\s*)\)/__YEAR__/;
    $self->{year} = $1;
}

# Whatever comes before the year is the author, or the
# institution that wrote the technical report.
sub parse_author_or_institution {
    my ($self) = @_;

    die "Invalid author"
        unless $self->{string} =~ s/^(.*?)\s+__YEAR__/__AUTHOR__ __YEAR__/;
    $self->{_author_or_institution} = $1;

    die "Invalid author\n__AUTHOR__=$self->{_author_of_institution}"
        unless $self->{_author_or_institution} =~ /\S/;
}


sub parse_edition {
    my ($self) = @_;
    if(
           $self->{string} =~ s/\(\s*(\d+[a-z]+)\s+ed\b\.?\s*\)/__EDITION__/i
        || $self->{string} =~ s/,\s+(\d+[a-zA-Z]+)\s+ed\b\.?/, __EDITION__/i
    ) {
        $self->{edition} = $1;
    }
}

sub parse_pages {
    my ($self) = @_;
    if( $self->{string} =~ s/\(?(pp?)\.?\s*(\d+)(?:-+(\d+))\s*\)?/__PAGES__/
        || $self->{string} =~ s/(\p{IsPunct})\s*(\d+)-+(\d+)\s*\P{IsAlnum}*$/$1 __PAGES__ /
    ) {
        ($self->{pages_start}, $self->{pages_end}) = ($2, $3);
    }

    # Single page?
    elsif( $self->{string} =~ s/,\s*(\d+)\s*$/, __PAGES__/ ) {
        ($self->{pages_start}, $self->{pages_end}) = ($1, $1);
    }
}

# The title comes immediately after the year. Titles should
# be terminated with a EOS. They may be wrapped in "quotes",
# in which case the title is terminated at the end quote.
#
# NOTE: For this to work, bits that may have periods in it
# like (3rd ed.) or (pp. 10-42) need to have already been removed.
sub parse_title {
    my ($self) = @_;
    if (
           $self->{string} =~ s/__YEAR__[\.,\s]+(['"])(.*?)\1\s*\p{IsPunct}?\s*/__YEAR__ __TITLE__. /
        || $self->{string} =~ s/__YEAR__[\.,\s]+(.*?$EOS)\s*/__YEAR__ __TITLE__. /
    ) {
        $self->{bibtitle} = $2 || $1;
        die "Invalid title\n__TITLE__==$self->{bibtitle}"
            unless $self->{bibtitle} =~ /\S/;

        # We may have sucked up the __EDITION__. Put it back.
        my $tail;
        ($tail, $self->{bibtitle}) = $self->token_tail( $self->{bibtitle} );
        $self->{string} =~ s/__TITLE__/__TITLE__ $tail/ if $tail;
    }

    # Although this is crap, some people write:
    # Web page title (year). http://...
    elsif( $self->{url} ) {
        $self->{bibtitle} = $self->{_author_or_institution};
        delete $self->{_author_or_institution};
    }

    else {
        die "Invalid title";
    }
}

sub parse_journal_article {
    my ($self) = @_;

    # First, look for the pattern 3(4), which indicates the volume
    # and issue of a journal article. Note that sometimes issues are
    # combined, so it may appear as 3(4&5) or 3(4-5).
    #
    # If that fails, just look for , 3, 10-14, meaning the volume
    # and then the pages.
    if(    $self->{string} =~ s/(\d+)\s*\(([\d\p{IsPunct}\s]+)\)\s*/__VOLUME__(__ISSUE__)/
        || $self->{string} =~ s/,\s*(\d+)\s*,\s*__PAGES__/, __VOLUME__, __PAGES__/
    ) {
        $self->{volume} = $1;
        $self->{issue} = $2 if $2;
        $self->{bibtype} = 'article';

        # Grab the journal title, which preceeds the volume.
        die "Invalid journal"
            unless $self->{string} =~ s/$TITLE(.*?),?\s*__VOLUME__/$1\__JOURNAL__, __VOLUME__/;
        $self->{journal} = $2;
    }
}

# A pair XXX: YYY at the end of the string usually is the
# publisher information. In the case of a technical report,
# the stuff after the colon is the author(s).
sub parse_publisher {
    my ($self) = @_;

    my @bits = split /([:\.\!\?])/, $self->{string};

    my @tail;
    push @tail, pop @bits while $bits[$#bits] =~ /^$TOKENS$|^[\s\p{IsPunct}]+$/;
    return unless @bits >= 3;

    my ($location, $colon, $publisher) = splice @bits,-3;
    push @bits, $1 while $location =~ s/^(.*$TOKENS\s*)//;

    return unless $colon eq ':';

    return unless $location =~ /\p{IsAlpha}/;
    push @bits, ' __LOCATION__: ';

    my $tail;

    ($tail, $publisher) = $self->token_tail( $publisher );

    return unless $publisher =~ /\p{IsAlpha}/;
    push @bits, ' __PUBLISHER__';
    push @bits, $tail if $tail;

    $self->{location} = $location;

    # It may be a technical report, so decide on this later
    $self->{_author_or_publisher} = $publisher;

    # If it has a publisher, let's assume it is a book
    # until we know otherwise.
    $self->{bibtype} = 'book' 
        unless $self->{bibtype};

    # Rebuild the string
    $self->{string} = join '', @bits, @tail;
}

sub parse_author {
    my ($self) = @_;

    # Already claimed as the title by a webpage.
    return unless exists $self->{_author_or_institution};

    # Use some heuristics to guess if it is a technical
    # report, in which case the bit after the last colon
    # is the author. Otherwise it is the publisher.
    if((    $self->{_author_or_publisher} =~ /\b[A-Z]\./
         || $self->{_author_or_publisher} =~ /et[.\s]+al/
         || $self->{_author_or_institution} !~ /,/
       ) && !(
               $self->{_author_or_institution} =~ /et[.\s]+al/i
            || $self->{_author_or_institution} =~ /\(\s*ed.\s*\)/i
            || $self->{_author_or_publisher} =~ /P\.\O\.\s*Box/i
       )
    ) {
        $self->{bibtype} = 'techreport';

        # Ok, swap things around.
        $self->{string} =~ s/__PUBLISHER__/__AUTHOR__/;
        $self->{string} =~ s/^__AUTHOR__/__INSTITUTION__/;
        $self->{author} = $self->{_author_or_publisher}
            if $self->{_author_or_publisher} =~ /\S/;
        $self->{institution} = $self->{_author_or_institution};
    } 

    # It may be a book without authors, just editors.
    elsif( $self->{_author_or_institution} =~ s/\(eds?\.\)[\.,]?//i ) {
        $self->{editor} = $self->{_author_or_institution};
        $self->{string} =~ s/__AUTHOR__/__EDITOR__/;
        $self->{publisher} = $self->{_author_or_publisher}
            if $self->{_author_or_publisher} =~ /\S/;
    }

    # Otherwise, the authors are authors and the publisher is
    # a publisher.
    else {
        $self->{author} = $self->{_author_or_institution};
        $self->{publisher} = $self->{_author_or_publisher}
            if $self->{_author_or_publisher} =~ /\S/;
    }
}

# Inside a book: Chapter or conference paper
sub parse_inside_book {
    my ($self) = @_;

    $self->{string} =~ s/(,|$EOS) In\s+/$1 __IN__ /i;

    # It may inside a book, with editors.
    if( $self->{string} =~ s/__IN__\s+(.*?)\(eds?\.?\)\s*[,\.]?\s*/__IN__ __EDITORS__, /i ) {
        $self->{editor} = $1;
        $self->{bibtype} = 'inbook';

        # The book title should come after. Again this relies on all
        # bits with periods having been removed already.
        return "Invalid book title: $self->{string}" unless
               $self->{string} =~ s/__EDITORS__,\s*(.*?)$EOS\s*/__EDITORS__, __BOOKTITLE__. /
            || $self->{string} =~ s/__EDITORS__,\s*(.*?)\s*__/__EDITORS__, __BOOKTITLE__. __/;
        $self->{booktitle} = $1;
    }

    # It may be inside a book, but with no editors cited.
    # If it's "In" something, but there are no (Eds.), then
    # assume the bit after "In" is the book title.
    elsif(
           $self->{string} =~ s/__IN__\s*(.*?)\./__IN__ __BOOKTITLE__\./i
        || $self->{string} =~ s/__IN__\s*(.*?)\,/__IN__ __BOOKTITLE__\./i
    ) {
        $self->{booktitle} = $1;
        $self->{bibtype} = 'inbook';
    }

    # It may be conference proceedings written by someone too lazy
    # to write the In.
    elsif( $self->{string} =~ s/(Proceedings.*?)\./__IN__ __BOOKTITLE__\./i ) {
        $self->{booktitle} = $1;
        $self->{bibtype} = 'proceedings';
    }

    # We may have sucked up the __EDITION__ or __PAGES__. Put it back.
    if( $self->{booktitle} ) {
        my $tail;
        ($tail, $self->{booktitle}) = $self->token_tail( $self->{booktitle} );
        $self->{string} =~ s/__BOOKTITLE__/__BOOKTITLE__ $tail/ if $tail;
    }
}

sub parse_conference {
    my ($self) = @_;

    # It isn't a book or a chapter if the book title contains
    # 'Proceedings'. Then it is a conference.
    if( $self->{booktitle} =~ /Proceedings/ ) {
        # It's a conference submission if it is "In" the proceedings.
        $self->{bibtype} = $self->{string} =~ /__IN__ /
            ? 'inproceedings' : 'proceedings';

        $self->parse_event_location;
    }

    if( $self->{string} =~ s/$TITLE(.*?Conference),\s*/$1\__ORGANIZATION__, / ) {
        $self->{organization} = $2;
        $self->parse_event_location;
    }
}

# If it is a conference, the conference location and
# date come afterwards.
sub parse_event_location {
    my ($self) = @_;

    # To be safe, grab everything we haven't parsed.
    #   Put __ first to prevent sucking up __PAGES__
    #   Put \. first to prevent inhaling past \.
    #   Put eat to the end of string last

    my $location;
    if( $self->{booktitle} =~ /$IS_MONTH/i ) {
        my @bits = split ',', $self->{booktitle};
        $self->{booktitle} = shift @bits;
        $location = join ',', @bits;

        $self->{string} =~ s/$BOOKTITLE/$1\__EVENTLOCATION__. /;
    }

    elsif(
           $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(__PAGES__\.?\s*)?(.*?)(\s*__|\.\s*)/$1$3\__EVENTLOCATION__$5/
#        || $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(.*?)\.\s*/$1\__EVENTLOCATION__. /
        || $self->{string} =~ s/($BOOKTITLE|__ORGANIZATION__,\s*)(__PAGES__\.?\s*)?([^\._]+)$/$1$3\__EVENTLOCATION__/
    ) {
        $location = $4;
    }

    if( $location ) {
        $self->parse_organization;
        ($self->{date}, $self->{event_location}) = $self->extract_date( $location );

        if( $self->{date} =~ /\S/ ) {
            # Mark the date in the string template
            $self->{string} =~ s/__EVENTLOCATION__/__EVENTLOCATION__, __DATE__/;
        } else {
            delete $self->{date};
        }

        unless( $self->{event_location} =~ /\p{IsAlpha}/ ) {
            delete $self->{event_location};
            $self->{string} =~ s/\s*__EVENTLOCATION__//;
        }
    }
}

# Organizations are only relevant to conferences. It should come
# right after the event location and date.
sub parse_organization {
    my ($self) = @_;
    if( $self->{string} =~ s/__EVENTLOCATION__\.?\s+([^_].+?)(_|$)/__EVENTLOCATION__. __ORGANIZATION__, $2/ ) {
        $self->{organization} = $1;
        unless( $self->{organization} =~ /\p{IsAlpha}/ ) {
            delete $self->{organization};
            $self->{string} =~ s/__ORGANIZATION__, //;
        }
    }
}

# Extracts a date fragment from a CSV string and returns
# the two strings.
sub extract_date {
    my ($self, $string) = @_;
    my $date;

    # Search for the month inside the string.
    my @bits = split /\s*,\s*/, $string;
    my $month;
    for( $month=0; $month < @bits; $month++ ) {
        last if $bits[$month] =~ /$IS_MONTH/;
    }

    if( $month < @bits ) {
        # Scan backwards looking for any days or year.
        # $start will end up one before the actual date
        my $start = $month-1;
        $start-- while $bits[$start] =~ /^[\d\s\-]+$/;

        # Scan forwards looking for any days or year
        # $end will end up one after the actual date.
        my $end = $month+1;
        $end++ while $bits[$end] =~ /^[\d\s\-]+$/;

        # Rip out the date. Since the date spans from
        # ($start+1)..($end-1) inclusive, the length is
        # ($end-1)-($self->{string}+1)+1  =$end - 1 - $self->{string} - 1 + 1
        my @date = splice @bits, $start+1, ($end-$start-1);
        $date = join ', ', @date;

        # Remove the year
        $date =~ s/(,\s*?)\d{4}//;
    }

    my $remains = join ', ', @bits if grep /\S/, @bits;
    return ($date, $remains) if wantarray;
    return $date;
}

# Titles with rhetorical questions fail to eat the whole title.
# e.g. Do you like cheese? Of course you do!
#          will parse to
#      __YEAR__ __TITLE__. Of course you do! __...
sub parse_interrupted_title {
    my ($self) = @_;
    if(    $self->{bibtitle} =~ /[\?\!]$/
        && $self->{string} =~ s/$TITLE(.*?)\s+__/$1\__/
    ) {
        my $fragment = $2;
        if( $fragment =~ /$TOKENS/ ) {
            $self->{string} =~ s/$TITLE\__/$1$fragment __/;
        }
        elsif( $fragment =~ /\p{IsAlpha}/ ) {
            $self->{bibtitle} .= " $fragment";
        }
    }
}

sub parse_tail {
    my ($self) = @_;
    my @bits = split /[\.,]\s*/, $self->{string};

    my @tail;
    unshift @tail, pop @bits while @bits && $bits[$#bits] !~ /$TOKENS/;

    if( grep /\p{IsAlpha}/, @tail ) {
        $self->{institution} = join ', ', @tail;
        $self->{string} = join ', ', @bits, '__INSTITUTION__';
        $self->{bibtype} = 'techreport';
    }
}

1;
