#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2020-07-20 16:43:00 +0300 (Mon, 20 Jul 2020) $
#$Revision: 8230 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.1.0/scripts/cif2cod $
#------------------------------------------------------------------------------
#*
#* Parse a CIF file, prepare a COD database table entry from it.
#*
#* USAGE:
#*    $0 --options input.cif inputs*.cif
#**

use strict;
use warnings;
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Data::CIF2COD qw( cif2cod validate_SQL_types );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::SOptions qw( getOptions get_value );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages
                          report_message );
use COD::ToolsVersion;
use XML::Simple;

my $print_header = 1; # Indicates whether to print out a header with column names
my $print_keywords = 0;
my $include_keywords_with_undefined_values = 0;

my $user_columns; # User specified list of columns to be printed
my $schema; # XML schema of data table for validation

my $use_parser = 'c';
my $input_format = 'cif';
my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;

my %options = (
    'require_only_doi' => 0,
    'reformat_space_group' => 0,
    'use_attached_hydrogens' => 1,
    'use_datablocks_without_coord' => 0,
);
#* OPTIONS:
#*   -C, --cod-number 1000000
#*                     Use the specified number, 1000000 in this example, as
#*                     a COD number for this structure; do not take the number
#*                     from the data block name.
#*
#*   -c, --always-continue
#*                     Continue processing and return successful return status
#*                     even if errors are diagnosed.
#*   -c-, --always-die
#*                     Stop and return error status if errors are diagnosed.
#*
#*   -h, --print-header
#*                     Print header with data column names at the first line
#*                     for CSV output format (default).
#*   -h-, --no-print-header, --dont-print-header
#*                     Do not print column names on the first (header) line
#*                     for CSV output format.
#*
#*   -k, --keywords
#*                     Print values in separate lines prefixed with keywords.
#*
#*   --include-keywords-with-undefined-values
#*                     Print the keyword line with an empty value if the value
#*                     is undefined.
#*   --exclude-keywords-with-undefined-values
#*                     Do not print the keyword line at all if the value is
#*                     undefined (default).
#*
#*   --use-attached-hydrogens
#*                     Include number of implicit hydrogens, specified using
#*                     _atom_site_attached_hydrogens tag, into the formula
#*                     sum (default).
#*   --no-use-attached-hydrogens,
#*   --dont-use-attached-hydrogens,
#*   --ignore-attached-hydrogens
#*                     Ignore number of implicit hydrogens, specified using
#*                     _atom_site_attached_hydrogens tag, in calculation of
#*                     the formula sum.
#*
#*   --reformat-space-group
#*                     Correct the formatting of Hermann-Mauguin symmetry
#*                     space group symbol.
#*   --no-reformat-space-group,
#*   --dont-reformat-space-group,
#*   --leave-space-group
#*                     Do not correct the formatting of Hermann-Mauguin
#*                     symmetry space group symbol (default).
#*
#*   --use-datablocks-without-coordinates,
#*   --use-all-datablocks
#*                     Do not filter out data blocks without coordinates.
#*   --no-use-datablocks-without-coordinates,
#*   --do-not-use-datablocks-without-coordinates,
#*   --dont-use-datablocks-without-coordinates,
#*   --skip-datablocks-without-coordinates
#*                     Filter out data blocks without coordinates (default).
#*
#*   --require-only-doi
#*                     Do not require all bibliographic details (authors,
#*                     journal name, title, year, volume and first page of
#*                     the publication) to be present if publication DOI is
#*                     specified.
#*   --require-full-bibliography
#*                     Require author names, journal name, volume, publication
#*                     title, year and first page to be present (default).
#*
#*   --columns file,flags,Robs
#*   --columns "file flags Robs"
#*                     Print only columns specified in this option.
#*
#*   --validate-by-schema doc/CODDictionary.xml
#*                     Validate the output by supplied XML schema, where the
#*                     SQL data types are given in the following manner:
#*
#*                     <CODDictionary>
#*                       <CODParameter name="file">
#*                         <SQLDataType>mediumint(7) unsigned</SQLDataType>
#*                         ...
#*                       </CODParameter>
#*                     </CODDictionary>
#*
#*   --continue-on-errors
#*                     Do not terminate script if errors are raised.
#*   --die-on-errors,
#*   --no-continue-on-errors,
#*   --do-not-continue-on-errors,
#*   --dont-continue-on-errors,
#*   --exit-on-errors
#*                     Terminate script immediately if errors are raised (default).
#*   --continue-on-warnings
#*                     Do not terminate script if warnings are raised (default).
#*   --die-on-warnings
#*                     Terminate script immediately if warnings are raised.
#*   --continue-on-notes
#*                     Do not terminate script if notes are raised (default).
#*   --die-on-notes
#*                     Terminate script immediately if notes are raised.
#*
#*   --use-perl-parser
#*                     Use the Perl parser for parsing CIFs.
#*   --use-c-parser
#*                     Use the faster C parser for parsing CIFs (default).
#*
#*   --cif-input
#*                     Use CIF format for input (default).
#*   --json-input
#*                     Use JSON format for input.
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '-C,--cod-number' => \$options{cod_number},

    '-h,--print-header'       => sub { $print_header = 1 },
    '-h-,--dont-print-header' => sub { $print_header = 0 },
    '--no-print-header'       => sub { $print_header = 0 },

    '-k,--keywords'   => sub { $print_keywords = 1 },

    '--include-keywords-with-undefined-values' =>
            sub { $include_keywords_with_undefined_values = 1 },
    '--exclude-keywords-with-undefined-values' =>
            sub { $include_keywords_with_undefined_values = 0 },

    '--use-attached-hydrogens' =>
            sub { $options{'use_attached_hydrogens'} = 1 },
    '--dont-use-attached-hydrogens' =>
            sub { $options{'use_attached_hydrogens'} = 0 },
    '--no-use-attached-hydrogens' =>
            sub { $options{'use_attached_hydrogens'} = 0 },
    '--ignore-attached-hydrogens' =>
            sub { $options{'use_attached_hydrogens'} = 0 },

    '--require-only-doi' =>
            sub{ $options{'require_only_doi'} = 1 },
    '--require-full-bibliography' =>
            sub{ $options{'require_only_doi'} = 0 },

    '--reformat-space-group' =>
            sub { $options{'reformat_space_group'} = 1 },
    '--no-reformat-space-group' =>
            sub { $options{'reformat_space_group'} = 0 },
    '--dont-reformat-space-group' =>
            sub { $options{'reformat_space_group'} = 0 },
    '--leave-space-group' =>
            sub { $options{'reformat_space_group'} = 0 },

    '--columns' => \$user_columns,

    '--use-datablocks-without-coordinates' =>
            sub{ $options{'use_datablocks_without_coord'} = 1 },
    '--use-all-datablocks' =>
            sub{ $options{'use_datablocks_without_coord'} = 1 },

    '--do-not-use-datablocks-without-coordinates' =>
            sub{ $options{'use_datablocks_without_coord'} = 0 },
    '--dont-use-datablocks-without-coordinates' =>
            sub{ $options{'use_datablocks_without_coord'} = 0 },
    '--no-use-datablocks-without-coordinates' =>
            sub{ $options{'use_datablocks_without_coord'} = 0 },
    '--skip-datablocks-without-coordinates' =>
            sub{ $options{'use_datablocks_without_coord'} = 0 },

    '--validate-by-schema'
            => sub { $schema = get_field_datatypes( get_value() ) },

    '-c,--always-continue' => sub { $die_on_errors   = 0;
                                    $die_on_warnings = 0;
                                    $die_on_notes    = 0 },
    '-c-,--always-die'     => sub { $die_on_errors   = 1;
                                    $die_on_warnings = 1;
                                    $die_on_notes    = 1 },

    '--continue-on-errors'          => sub { $die_on_errors = 0 },
    '--die-on-errors'               => sub { $die_on_errors = 1 },
    '--no-continue-on-errors'       => sub { $die_on_errors = 1 },
    '--do-not-continue-on-errors'   => sub { $die_on_errors = 1 },
    '--dont-continue-on-errors'     => sub { $die_on_errors = 1 },
    '--exit-on-errors'              => sub { $die_on_errors = 1 },

    '--continue-on-warnings' => sub { $die_on_warnings = 0 },
    '--die-on-warnings'      => sub { $die_on_warnings = 1 },

    '--continue-on-notes'    => sub { $die_on_notes = 0 },
    '--die-on-notes'         => sub { $die_on_notes = 1 },

    '--use-perl-parser'       => sub{ $use_parser = 'perl' },
    '--use-c-parser'          => sub{ $use_parser = 'c' },

    '--cif-input'  => sub { $input_format = 'cif' },
    '--json-input' => sub { $input_format = 'json' },

    '--options'       => sub { options; exit },
    '--help,--usage'  => sub { usage; exit },
    '--version'      => sub { print 'cod-tools version ',
                              $COD::ToolsVersion::Version, "\n";
                              exit }
);

my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

if( $input_format eq 'json' ) {
    $use_parser = 'json';
}

my @print_columns;

if( defined $user_columns ) {
    @print_columns = map { split ' ', $_ } split m/,/, $user_columns;
} else {
    @print_columns = @COD::CIF::Data::CIF2COD::default_data_fields;
}

binmode( STDOUT, ':encoding(UTF-8)' );
binmode( STDERR, ':encoding(UTF-8)' );

if( $print_header && !$print_keywords ) {
    local $, = ';';
    local $\ = "\n";
    print @print_columns;
}

@ARGV = ( '-' ) unless @ARGV;

my $first = 1;

for my $filename (@ARGV) {

    my $parser_options = { 'parser' => $use_parser, 'no_print' => 1 };

    my ( $data, $err_count, $messages ) = parse_cif( $filename, $parser_options );
    process_parser_messages( $messages, $die_on_error_level );

    canonicalize_all_names( $data );

    my @extracted;
    foreach my $dataset (@{$data}) {

        my $dataname = 'data_' . $dataset->{name};

        local $SIG{__WARN__} = sub { process_warnings( {
                                       'message'       => @_,
                                       'program'       => $0,
                                       'filename'      => $filename,
                                       'add_pos'       => $dataname
                                     }, $die_on_error_level ) };

        my $extracted_dataset;
        eval {
            $extracted_dataset = cif2cod( $dataset, \%options );
            if( $schema ) {
                validate_SQL_types( $extracted_dataset, $schema );
            }
        };
        if ($@) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
        push @extracted, $extracted_dataset if defined $extracted_dataset;
    }

    for my $data (@extracted) {
        my %data = %{$data};

        # Print out the collected data fields:
        my @data_fields = @print_columns;
        if( $print_keywords ) {
            print "\n" unless $first;
            $first = 0;
            my $separator = ' ';
            for my $key (@data_fields) {
                my $value = $data{$key};
                if( defined $value ) {
                    # Remove trailing dot from numbers:
                    # TODO: check if this is legit for all values:
                    # some non-numeric values that look like numbers
                    # might get "damaged"
                    $value =~ s/^\s*([0-9]+)[.]\s*$/$1/;
                    print $key . $separator . $value . "\n";
                } elsif ( $include_keywords_with_undefined_values &&
                          $key ne '' ) {
                        print $key . $separator . "\n";
                }
            }
        } else {
            my $separator = '';
            for my $key (@data_fields) {
                my $value = $data{$key};
                if( defined $value ) {
                    # Remove trailing dot from numbers:
                    $value =~ s/^\s*([0-9]+)[.]\s*$/$1/;
                    if( $key ne 'text' ) {
                        $value =~ s/\\/\\\\/g;
                    }
                    $value =~ s/"/\\"/g;
                    $value = '"' . $value . '"';
                } else {
                    $value = 'NULL';
                }
                print $separator, $value;
                $separator = ';';
            }
            print "\n";
        }
    }
}

sub get_field_datatypes
{
    my( $xml_filename ) = @_;
    my $xml = XMLin( $xml_filename );
    my $msg_template = {
        program => $0,
        filename => $xml_filename,
        err_level => 'WARNING',
    };
    if( !exists $xml->{'xsi:schemaLocation'} ||
        $xml->{'xsi:schemaLocation'} !~ m|^http://(?:www.)?crystallography.net/xml/schema/codsql/codsql_v[0-9]+[.][0-9]+[.]xsd\s| ) {
        $msg_template->{message} =
            'XML document is not derived from codsql.xsd';
        report_message( $msg_template );
        return;
    }
    my $fields;
    if( exists $xml->{Table}{data} ) {
        $fields = $xml->{Table}{data}{Field};
    } elsif( $xml->{Table}{name} eq 'data' ) {
        $fields = $xml->{Table}{Field};
    } else {
        $msg_template->{message} =
            'database description does not contain \'data\' table';
        report_message( $msg_template );
        return;
    }
    return { map { $_ => $fields->{$_}{SQLDataType} }
                  keys %{$fields} };
}
