#!/usr/bin/env perl
use v5.14.1;

use PICA::Data;
use PICA::Schema;
use PICA::Schema::Builder;
use Getopt::Long;
use Pod::Usage;
use JSON::PP;
use List::Util 'sum';

# Shorthand for --number: the first argument of the form "-N" (e.g. -1, -20)
# is removed from @ARGV and its digits become the record limit. 0 means
# "no limit".
my $number = 0;
my @shorthand = grep {$ARGV[$_] =~ /^-(\d+)$/} 0 .. $#ARGV;
if (@shorthand) {
    my ($arg) = splice @ARGV, $shorthand[0], 1;

    # $arg still carries its leading minus, so negating it yields N
    $number = -$arg;
}

# Targets for the command line options. $number may already hold a value
# taken from the "-N" shorthand and is only overwritten by -n/--number.
my ($from, $to, $schema, $build, $report_unknown, $count, $path);
my ($color, $nocolor, $help);

my $options_ok = GetOptions(
    'from|f=s'   => \$from,
    'to|t:s'     => \$to,
    'schema|s=s' => \$schema,
    'build|b'    => \$build,
    'unknown|u!' => \$report_unknown,
    'count|c'    => \$count,
    'path|p=s'   => \$path,
    "n|number:i" => \$number,
    'C'          => \$color,
    'M'          => \$nocolor,
    'help|?'     => \$help,
);
pod2usage(2) unless $options_ok;

# Print the manual when asked for it, or when there are no arguments left
# and standard input is a terminal.
if ($help or (!@ARGV and -t STDIN)) {
    pod2usage(-verbose => 99, -sections => "SYNOPSIS|OPTIONS|DESCRIPTION");
}

# Accepted serialization type names and file extensions (lower case), each
# mapped to the class name suffix used to build PICA::Parser::* and
# PICA::Writer::* class names.
my %types = (
    (map {$_ => 'Binary'} qw(bin dat binary)),
    (map {$_ => 'Plain'} qw(plain pp)),
    plus  => 'Plus',
    xml   => 'XML',
    ppxml => 'PPXML',
    (map {$_ => 'JSON'} qw(json ndjson)),
);

# Input defaults to '-'; a file name, if given, is picked up later.
my $input = '-';

# A leading argument that is a known type name acts like --from.
if (@ARGV && !$from && $types{lc $ARGV[0]}) {
    $from = shift @ARGV;
}

# Without --path, leading arguments that look like PICA path expressions
# (optionally several joined by "|") are collected into one path string.
unless (defined $path) {
    my $pattern      = '[012.][0-9.][0-9.][A-Z@.](\$[^|]+)?';
    my $alternatives = qr/^$pattern(\|$pattern)*$/;

    my @leading;
    while (@ARGV && $ARGV[0] =~ $alternatives) {
        push @leading, shift @ARGV;
    }
    $path = join '|', @leading;
}

# Compiled path expressions and the number of them that select subfields.
my @pathes;
my $sfpath;
if (defined $path) {
    for my $expression (split /\|/, $path) {
        my $compiled = eval {PICA::Path->new($expression)};
        die "invalid pica path: $expression\n" unless $compiled;
        push @pathes, $compiled;
    }

    # Count the expressions that carry a subfield part; mixing field and
    # subfield expressions is rejected.
    $sfpath = sum map {length($_->subfields) > 0} @pathes;
    if ($sfpath && $sfpath != @pathes) {
        die
            "path expressions must either all select fields or select subfields!\n";
    }
}

# The next remaining argument, if any, is the input file.
$input = shift @ARGV if @ARGV;

# Without an explicit type, guess it from the file extension. The match is
# case-insensitive so that e.g. "DATA.XML" is recognized as well.
$from = $1 if !$from && $input =~ /\.([a-z]+)$/i && $types{lc $1};

$from = 'plain' unless $from;
pod2usage("unknown serialization type: $from") unless $types{lc $from};

# Type names are validated case-insensitively but %types only has lower-case
# keys, so normalize here; otherwise a later lookup such as $types{$from}
# would miss for input like "--from XML".
$from = lc $from;

# Write in the input format unless a target type was given or records are
# only counted.
$to = $from unless $to or $count;
pod2usage("unknown serialization type: $to") unless !$to or $types{lc $to};
$to = lc $to if $to;

# Subfield paths print plain values: no serialization and no statistics.
($to, $count) = () if $sfpath;

# Building a schema replaces serialization of the records.
my $builder = $build ? PICA::Schema::Builder->new : undef;
$to = undef if $build;

# '-' stands for standard input.
$input = *STDIN if $input eq '-';

# The parser class name is derived from the input type.
my $parser_class = "PICA::Parser::$types{$from}";
my $parser       = $parser_class->new($input, bless => 1);

# Colorize when forced with -C or when writing to a terminal, unless -M
# disables it.
my %options;
unless ($nocolor) {
    if ($color || -t *STDOUT) {
        $options{color} = {
            tag        => 'blue',
            occurrence => 'blue',
            code       => 'red',
            value      => 'green',
        };
    }
}

binmode *STDOUT, ':encoding(UTF-8)';

# A writer only exists when records are to be serialized.
my $writer = $to ? "PICA::Writer::$types{$to}"->new(%options) : undef;

# Options passed to every schema check: unknown fields and subfields are
# ignored unless --unknown was given.
my %schema_options = (ignore_unknown => !$report_unknown);

# Counters reported by --count.
my $stats = {records => 0, holdings => 0, items => 0, fields => 0};

# Number of records with at least one validation error.
my $invalid = 0;

# Replace the schema file name by a schema object built from its JSON
# content. The file is opened with a checked three-argument open so that a
# missing or unreadable file is reported as such instead of surfacing as a
# JSON parse error.
if ($schema) {
    open(my $fh, '<', $schema)
        or die "failed to open schema file $schema: $!\n";
    my $json = do {local $/; <$fh>};
    close $fh;
    $schema = PICA::Schema->new(decode_json($json));
}

# Main loop: read records one by one and, depending on the mode, print
# subfield values, or filter/write/validate/collect/count the record.
# The record limit (--number) is applied in every mode, including subfield
# extraction.
while (my $record = $parser->next) {
    if ($sfpath) {

        # subfield mode: print each matched value on a line of its own
        for my $sfp (@pathes) {
            say for @{$record->match($sfp, split => 1) // []};
        }
    }
    else {
        # reduce the record to the selected fields
        $record = $record->fields(@pathes) if @pathes;

        $writer->write($record) if $writer;

        if ($schema) {
            my @errors = $schema->check($record, %schema_options);
            if (@errors) {
                for my $error (@errors) {

                    # prefix the message with the record id, if there is one
                    my $id  = $record->{_id};
                    my $msg = defined $id ? "$id: $error" : $error;
                    print "$msg\n";
                }
                $invalid++;
            }
        }

        $builder->add($record) if $builder;

        if ($count) {
            $stats->{holdings} += @{$record->holdings};
            $stats->{items}    += @{$record->items};
            $stats->{fields}   += @{$record->{record}};
        }
    }

    # count every record so that the limit also holds in subfield mode
    $stats->{records}++;
    last if $number and $stats->{records} >= $number;
}

# Let the writer finish its output, if there is a writer.
$writer->end() if $writer;

# Print the statistics, one "<number> <name>" line per counter, in a fixed
# order.
if ($count) {

    # NOTE(review): $invalid is initialized to 0 elsewhere in this file, so
    # this condition looks always true and "invalid" is reported even
    # without a schema - confirm whether that is intended.
    $stats->{invalid} = $invalid if defined $invalid;

    for my $name (qw(records invalid holdings items fields)) {
        next unless defined $stats->{$name};
        print $stats->{$name} . " $name\n";
    }
}

# Emit the schema built from the records as pretty-printed JSON with sorted
# keys.
if ($builder) {
    my $encoder
        = JSON::PP->new->indent->space_after->canonical->convert_blessed;
    print $encoder->encode($builder->schema);
}

__END__

=head1 NAME

picadata - parse and validate PICA+ data

=head1 SYNOPSIS

picadata [[--from] TYPE] [--schema FILE] [--to [TYPE]] {OPTIONS} [FILE]

=head1 DESCRIPTION

Parse, validate and/or serialize PICA+ data from the command line, e.g.:

  picadata pica.xml -s schema.json   # validate against Avram schema
  picadata pica.dat -t xml           # convert binary to XML
  picadata -c -f plain < pica.plain  # parse and count records
  picadata -p 003@ pica.xml -t       # extract field 003@

=head1 OPTIONS

=head2 --from, -f

PICA serialization type (plain, plus, binary, XML, ppxml, JSON) with plain as
default. Guessed from input filename unless specified. See format documentation
at L<http://format.gbv.de/pica>

=head2 --to, -t

PICA serialization type to enable writing parsed PICA data.

=head2 --count, -c

Count number of records, holdings, items, and fields.

=head2 --number, -n

Stop parsing after C<n> records. Can be abbreviated as C<-1>, C<-2>...

=head2 --path, -p

Limit the record to fields specified by a simple PICA Path expression. Multiple
expressions can be separated by C<|>. Expressions must either all reference
fields or all reference subfields, the latter to emit subfield values.

=head2 --number, -n

Stop after parsing C<n> records. Option can be abbreviated as C<-1>, C<-2>...

=head2 --schema, -s

L<Avram Schema|http://format.gbv.de/schema/avram/specification> to validate
against.

=head2 --unknown, -u

Report unknown fields and subfields when validating (disabled by default).

=head2 --build, -b

Build an Avram schema from given records.

=head2 -C

Colorize output. Only supported for PICA plain and PICA plus format.

=head2 -M

Monochrome (don't colorize output).

=head1 SEE ALSO

See L<catmandu> for a more elaborate command line tool for data processing
(transformation, API access...), including PICA+ with L<Catmandu::PICA>.

=cut
