#!/usr/bin/env perl
#
#   A script that performs semantic similarity in PXF|BFF data structures
#
#   Last Modified: Sep/29/2023
#
#   Version 0.00
#
#   Copyright (C) 2023 Manuel Rueda - CNAG (manuel.rueda@cnag.crg.eu)
#
#   License: Artistic License 2.0
#
#   If this program helps you in your research, please cite.

package main;

use strict;
use warnings;
use autodie;
use feature qw(say);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use Data::Dumper;
use Sys::Hostname;
use POSIX qw(strftime);
use Term::ANSIColor qw(:constants);
use File::ShareDir::ProjectDistDir qw(dist_dir);
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Pheno::Ranker qw($VERSION write_json);

# Defining a few variables (defaults that can be overridden from the CLI)
my $out_file_cohort  = 'matrix.txt';               # default output in cohort mode
my $out_file_patient = 'rank.txt';                 # default output in patient mode
my $export_basename  = 'export';
my $align_basename   = 'alignment';
my $log_file         = 'pheno-ranker-log.json';
my $color            = 1;
my $age              = 0;
my $cli              = 1;

# Reading arguments
# NB: <-o> is stored in its own variable so that we can tell whether the user
#     supplied it; the mode-dependent default is applied afterwards.
# NB: 'v' is declared as an explicit alias of 'verbose'. Otherwise <-v> relies
#     on auto-abbreviation, which is ambiguous with 'version'
#     (<-V> remains distinct thanks to no_ignore_case).
GetOptions(
    'reference|r=s{1,}'              => \my @reference_files,           # array
    'target|t=s'                     => \my $target_file,               # string
    'weights|w=s'                    => \my $weights_file,              # string
    'append-prefixes=s{1,}'          => \my @append_prefixes,           # array
    'o=s'                            => \my $out_file_user,             # string
    'max-out:i'                      => \my $max_out,                   # integer
    'max-number-var:i'               => \my $max_number_var,            # integer
    'include-hpo-ascendants'         => \my $include_hpo_ascendants,    # flag
    'export|e:s'                     => \my $export,                    # opt-string (defined)
    'align|a:s'                      => \my $align,                     # opt-string (defined)
    'sort-by=s'                      => \my $sort_by,                   # string
    'patients-of-interest|poi=s{1,}' => \my @patients_of_interest,      # array
    'poi-out-dir=s'                  => \my $poi_out_dir,               # string
    'include-terms=s{1,11}'          => \my @include_terms,             # array
    'exclude-terms=s{1,11}'          => \my @exclude_terms,             # array
    'config=s'                       => \my $config_file,               # string
    'age!'                           => \$age,                          # flag
    'help|?'                         => \my $help,                      # flag
    'log:s'                          => \my $log,                       # opt-string (defined)
    'man'                            => \my $man,                       # flag
    'debug=i'                        => \my $debug,                     # integer
    'verbose|v'                      => \my $verbose,                   # flag
    'color!'                         => \$color,                        # flag
    'version|V'                      => sub { say "$0 Version $VERSION"; exit; }
) or pod2usage(2);
pod2usage(1)                              if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;
pod2usage(
    -message => "Please specify a reference-cohort(s) with <--r>\n",
    -exitval => 1
) unless @reference_files;

# Set the name of the output:
# an explicit <-o> always wins; otherwise the default depends on the mode
# (patient mode if <-t> was given, cohort mode if not)
my $out_file = $out_file_user
  // ( $target_file ? $out_file_patient : $out_file_cohort );

# Turning color off if argument <--no-color>
$ENV{'ANSI_COLORS_DISABLED'} = 1 unless $color;

# Start printing to STDOUT
say BOLD CYAN program_header($VERSION), RESET if $verbose;

######################
# START PHENO-RANKER #
######################

# Load data as hashref
# All CLI settings are gathered here; the same hashref is passed to the
# Pheno::Ranker constructor and, if requested, stored in the log file.
my $data = {
    reference_files        => \@reference_files,
    target_file            => $target_file,
    weights_file           => $weights_file,
    include_hpo_ascendants => $include_hpo_ascendants,
    hpo_file               => undef,
    align                  => $align,
    align_basename         => $align_basename,
    export                 => $export,
    export_basename        => $export_basename,
    out_file               => $out_file,
    max_out                => $max_out,
    max_number_var         => $max_number_var,
    sort_by                => $sort_by,
    patients_of_interest   => \@patients_of_interest,
    poi_out_dir            => $poi_out_dir,
    include_terms          => \@include_terms,
    exclude_terms          => \@exclude_terms,
    config_file            => $config_file,
    age                    => $age,                      # Solution, use ageRange in PXF/BFF, measures' values more difficult
    cli                    => $cli,
    append_prefixes        => \@append_prefixes,
    log                    => $log,
    debug                  => $debug,
    verbose                => $verbose
};

# Create object
my $ranker = Pheno::Ranker->new($data);

# Run method
$ranker->run();

# Create log if <--log>
# NB: <--log> takes an optional value, thus $log is '' when no name is given.
#     We test the length (not the truthiness) so that a file name such as "0"
#     is honoured instead of silently falling back to the default name.
write_log( length($log) ? $log : $log_file, $data, $VERSION )
  if defined $log;

####################
# END PHENO-RANKER #
####################

sub write_log {

    # Writes a JSON log file with information about the host/run ('info')
    # plus the parameters used ('data').
    #
    # Arguments:
    #   $log     - path of the log file to be written
    #   $data    - hashref with the run parameters ($data->{verbose} is read here)
    #   $VERSION - version string to be stored in the log
    #
    # The file itself is written by write_json (imported from Pheno::Ranker).

    my ( $log, $data, $VERSION ) = @_;

    # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
    my $os = $^O;
    my $ncpu_raw =
        lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : $os eq 'MSWin32'    ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                       qx{/usr/bin/nproc};

    # qx returns undef when the command could not be started, and the output
    # may contain more than the number (e.g., on Windows it includes the header
    # "NumberOfLogicalProcessors"). Thus, for every OS we extract the first
    # run of digits and fall back to 1 if nothing usable was returned.
    my ($ncpuhost) = ( $ncpu_raw // '' ) =~ /(\d+)/;
    $ncpuhost = 0 + ( $ncpuhost // 1 );    # coercing it to be a number

    # First non-empty environment variable wins
    my $user =
         $ENV{'LOGNAME'}
      || $ENV{'USER'}
      || $ENV{'USERNAME'}
      || 'dummy-user';

    my $info = {
        date     => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
        ncpuhost => $ncpuhost,
        hostname => hostname,
        id       => time . substr( "00000$$", -5 ),                   # string
        version  => $VERSION,
        user     => $user
    };

    # Saving file
    # NB: RESET is needed, otherwise the color leaks into subsequent terminal output
    say BOLD GREEN "Writing <$log> file\n", RESET if $data->{verbose};
    write_json(
        {
            filepath => $log,
            data     => { info => $info, data => $data }
        }
    );
}

sub program_header {

    # Returns the banner (multi-line string) printed in verbose mode.
    #
    # Arguments:
    #   $VERSION - version string displayed inside the banner
    #
    # The version row is padded to the width of the box so that the right
    # border stays aligned whatever the length of the version string
    # (versions longer than the box simply overflow it).

    my $VERSION = shift;

    # Number of characters between the two '*' borders
    my $inner_width  = 38;
    my $version_row  = sprintf "*%-${inner_width}s*",
      ( ' ' x 10 ) . "Version: $VERSION";
    my $str = <<EOF;
****************************************
*   Rank against cohort(s) (BFF/PXF)   *
*          - PHENO-RANKER -            *
$version_row
*      (C) 2023 Manuel Rueda, PhD      *
*       The Artistic License 2.0       *
****************************************
EOF
    return $str;
}

=head1 NAME

pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)

=head1 SYNOPSIS

pheno-ranker -r <individuals.json> -t <patient.json> [-options]

     Arguments:                       
     * Cohort mode:
       -r|reference                   BFF/PXF file(s) (JSON|YAML array or object)

     * Patient mode:
       -t|target                      BFF/PXF file (JSON|YAML object or array of 1 object)

     Options:
       -age                           Include age-related variables [>no-age|age]
       -a|align                       Write alignment file(s). If no argument is given the files will be named [alignment.*]
       -append-prefixes               The prefixes to be added to the primary_key of individuals when #cohorts >= 2 [C]
       -config                        YAML config file to change default parameters [conf/config.yaml]
       -e|export                      Export miscellanea JSON files. If no argument is given the files will be named [export.*]
       -exclude-terms                 Exclude BFF/PXF terms (e.g., --exclude-terms sex id)
       -include-hpo-ascendants        Include ascendant terms from the Human Phenotype Ontology (HPO)
       -include-terms                 Include BFF/PXF terms (e.g., --include-terms diseases)
       -max-number-var                Maximum number of variables to be used in binary string [10000]
       -max-out                       Print only N comparisons (used with --t) [50]
       -o                             Output file [-r matrix.txt|-t rank.txt]
       -poi|patients-of-interest      Export JSON files for the selected individual ids (dry-run)
       -poi-out-dir                   Directory where to write JSON files (to be used with --poi)
       -sort-by                       Sort reference-patient comparison by Hamming-distance or Jaccard-index [>hamming|jaccard]
       -w|weights                     YAML file with weights

     Generic Options:
       -debug                         Print debugging (from 1 to 5, being 5 max)
       -h|help                        Brief help message
       -log                           Save log file (JSON). If no argument is given then the log is named [pheno-ranker-log.json]
       -man                           Full documentation
       -no-color                      Don't print colors to STDOUT [>color|no-color]
       -v|verbose                     Verbosity on
       -V|version                     Print version


=head1 DESCRIPTION

pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)

=head1 SUMMARY

Pheno-Ranker is a lightweight and easy-to-install tool specifically designed for performing semantic similarity analysis on phenotypic data structured in JSON format, such as Beacon v2 Models or Phenopackets v2.

=head1 INSTALLATION

=head2 Containerized (Recommended Method)

=head3 Method 1: From Docker Hub

Download a docker image (latest version - amd64|x86-64) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:

  docker pull manuelrueda/pheno-ranker:latest
  docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest

See additional instructions below.

=head3 Method 2: With Dockerfile

Please download the C<Dockerfile> from the repo:

  wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile

And then run:

  docker build -t cnag/pheno-ranker:latest .

=head3 Additional instructions for Methods 1 and 2

To run the container (detached) execute:

  docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest

To enter:

  docker exec -ti pheno-ranker bash

The command-line executable can be found at:

  /usr/share/pheno-ranker/bin/pheno-ranker

The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>). 

  docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
 
=head3 Mounting volumes

Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
Find an example below (note that you need to change the paths to match yours):

  docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest

Then I will do something like this:

  # First I create an alias to simplify invocation (from the host)
  alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'

  # Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
  pheno-ranker -r /data/individuals.json -o /data/matrix.txt

=head2 Non containerized

The script runs on command-line Linux and it has been tested on Debian/RedHat/MacOS based distributions (only showing commands for Debian's). Perl 5 is installed by default on Linux, 
but we will install a few CPAN modules with C<cpanminus>.

=head3 From Github

  git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
  cd pheno-ranker

Install system level dependencies:
  
  sudo apt-get install cpanminus libbz2-dev zlib1g-dev libperl-dev libssl-dev

Now you have to choose one of the 3 options below:

B<Option 1:> Install dependencies (they're harmless to your system) as C<sudo>:

  cpanm --notest --sudo --installdeps .
  bin/pheno-ranker --help            

B<Option 2:> Install the dependencies at C<~/perl5>:

  cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
  cpanm --notest --installdeps .
  bin/pheno-ranker --help

B<Option 3:> Install the dependencies in a "virtual environment" (at C<local/>) . We'll be using the module C<Carton> for that:

  mkdir local
  cpanm --notest --local-lib=local/ Carton
  export PATH=$PATH:local/bin; export PERL5LIB=$(pwd)/local/lib/perl5:$PERL5LIB
  carton install
  carton exec -- bin/pheno-ranker -help

=head3 From CPAN

First install system level dependencies:

  sudo apt-get install cpanminus libbz2-dev zlib1g-dev libperl-dev libssl-dev

Now you have to choose one of the 3 options below:

B<Option 1:> System-level installation:

  cpanm --notest --sudo Pheno::Ranker
  pheno-ranker -h

B<Option 2:> Install Pheno-Ranker and the dependencies at C<~/perl5>

  cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
  cpanm --notest Pheno::Ranker
  pheno-ranker --help

B<Option 3:> Install Pheno-Ranker and the dependencies in a "virtual environment" (at C<local/>) . We'll be using the module C<Carton> for that:

  mkdir local
  cpanm --notest --local-lib=local/ Carton
  echo "requires 'Pheno::Ranker';" > cpanfile
  export PATH=$PATH:local/bin; export PERL5LIB=$(pwd)/local/lib/perl5:$PERL5LIB
  carton install
  carton exec -- pheno-ranker -help

=head3 System requirements

  * Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
  * Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
  * >= 4GB of RAM
  * 1 core
  * At least 16GB HDD

=head1 HOW TO RUN PHENO-RANKER

For executing pheno-ranker you will need a PXF/BFF file(s) in JSON|YAML format. The reference cohort must be a JSON array, where each individual's data are consolidated in one object.

There are two modes of operation:

=over 4

=item Cohort mode:
 
B<Intra-cohort:> With C<--r> argument and 1 cohort.

B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.

=item Patient Mode:

With C<-r> reference cohort(s) and C<--t> patient data.

=back

B<Examples:>

 $ ./pheno-ranker -r phenopackets.json  # intra-cohort

 $ ./pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort

 $ ./pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights

 $ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL  # inter-cohort

 $ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # patient mode


=head2 COMMON ERRORS AND SOLUTIONS

 * Error message: Foo
   Solution: Bar

 * Error message: Foo
   Solution: Bar

=head1 CITATION

The author requests that any published work that utilizes C<Pheno-Ranker> includes a citation to the following reference:

Rueda, M et al., (2023). Advancing Semantic Similarity Analysis of Phenotypic Data Stored in GA4GH Standards and Beyond. I<Manuscript in preparation>.

=head1 AUTHOR 

Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.

=head1 COPYRIGHT AND LICENSE

This PERL file is copyrighted. See the LICENSE file included in this distribution.

=cut
