#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2017-10-24 15:04:52 +0300 (An, 24 spal. 2017) $ 
#$Revision: 5642 $
#$URL: svn://www.crystallography.net/cod-tools/tags/v2.3/scripts/cif_correct_tags $
#------------------------------------------------------------------------------
#*
#* Corrects misspelt tags in a CIF file and outputs made changes into the
#* standard I/O streams. By default, only tags from CIF Core and CIF COD
#* dictionaries with extra '^_+' prefixes are corrected. Additionally, a
#* replacement list file can be provided for the correction of misspelt tags.
#*
#* USAGE:
#*    $0 --options input.cif input*.cif
#**

use strict;
use warnings;
use COD::CIF::JSON qw( cif2json );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::DictTags;
use COD::CIF::Tags::COD;
use COD::CIF::Tags::Manage qw( rename_tag exclude_tag );
use COD::CIF::Tags::Print qw( print_cif fold );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages );
use COD::ToolsVersion;

my $keep_tag_order = 0;
my $Id = '$Id: cif_correct_tags 5642 2017-10-24 12:04:52Z antanas $';
my $replacement_list;
my $use_parser = 'c';
my $input_format = 'cif';
my $output_format = 'cif';

#* OPTIONS:
#*   --keep-tag-order
#*                     Keep the original tag order in CIF file (default).
#*   --sort-tags
#*                     Reorder tags in CIF file according to COD.
#*   -r, --replacement-list 'replacement-file.lst'
#*                     Name of the multi-line replacement list file with 
#*                     entries of form '_incorrect_tag _correct_tag' to be 
#*                     used in the correction of misspelt tags.
#*   --use-perl-parser
#*                     Use the Perl-only CIF parser.
#*   --use-c-parser
#*                     Use the speed-optimised C/Perl parser. Default.
#*
#*   --cif-input
#*                     Use CIF format for input (default).
#*   --json-input
#*                     Use JSON format for input.
#*   --cif-output
#*                     Use CIF format for output (default).
#*   --json-output  
#*                     Use JSON format for output.
#*   --cif
#*                     Use CIF format for both input and output (default).
#*   --json
#*                     Use JSON format for both input and output.
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '--keep-tag-order'  => sub { $keep_tag_order = 1; },
    '--sort-tags'       => sub { $keep_tag_order = 0; },
    '-r,--replacement-list' => \$replacement_list,
    '--use-perl-parser' => sub{ $use_parser = 'perl' },
    '--use-c-parser'    => sub{ $use_parser = 'c' },
    
    '--cif-input'   => sub { $input_format = 'cif' },
    '--json-input'  => sub { $input_format = 'json' },

    '--cif-output'  => sub { $output_format = 'cif' },
    '--json-output' => sub { $output_format = 'json' },
    
    '--cif'  => sub { $input_format = $output_format = 'cif' },
    '--json' => sub { $input_format = $output_format = 'json' },
    
    '--options'         => sub { options; exit },
    '--help,--usage'    => sub { usage; exit },
    '--version'         => sub { print 'cod-tools version ',
                                 $COD::ToolsVersion::Version, "\n";
                                 exit }
);

my $die_on_error_level = {
    ERROR   => 1,
    WARNING => 0,
    NOTE    => 0
};

if( $input_format eq 'json' ) {
    $use_parser = 'json';
}

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

sub insert_report_to_comments
{
    my ($dataset, $insert_reports) = @_;
    if ( @$insert_reports > 0 ) {
        my $comments_tag = '_cod_depositor_comments';
        my $values = $dataset->{values};
        my $reports_value = join("\n\n",@$insert_reports);

        my $message =
            "The following automatic conversions were performed:\n" .
            join( "\n", map { '' . $_ }
                  fold( 70, ' +', ' ', $reports_value ));

        if( exists $values->{$comments_tag} ) {
            $values->{$comments_tag}[0] .= "\n\n" . $message;
        } else {
            $values->{$comments_tag}[0] = "\n" . $message;
        }
        my $signature = $Id;
        $signature =~ s/^\$|\$$//g;
        $values->{$comments_tag}[0] .=
            "\n\n" . 'Automatic conversion script' .
            "\n" . $signature;
    }

    return;
}

my %tag_spelling = ();
eval {
    if( $replacement_list ) {
        open my $list, '<', $replacement_list or die 'ERROR, '
            . 'cannot open replacement list file for input -- '
            . lcfirst($!) . "\n";

        %tag_spelling =
            map  { split }
            grep { !/^\#/ }
            grep { '\.' }
        <$list>;

        close $list or die 'ERROR, '
            . 'error while closing replacement list file after reading -- '
            . lcfirst($!) . "\n";
    }
};
if ($@) {
    process_errors( {
      'message'       => $@,
      'program'       => $0,
      'filename'      => $replacement_list,
    }, $die_on_error_level->{'ERROR'} )
};

my @dictionary_tags = ( @COD::CIF::Tags::DictTags::tag_list,
                        @COD::CIF::Tags::COD::tag_list );
my %dictionary_tags = map { $_, $_ } @dictionary_tags;

@ARGV = ('-') unless @ARGV;

for my $filename (@ARGV) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    canonicalize_all_names( $data );

    for my $dataset (@$data) {

        my @insert_reports = ();
        my $tags = $dataset->{tags};
        my $values = $dataset->{values};
        my $loops = $dataset->{inloop};

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'       => @_,
                'program'       => $0,
                'filename'      => $filename,
                'add_pos'       => 'data_' . $dataset->{'name'}
            }, $die_on_error_level )
        };

        for my $tag (@$tags) {
            if( exists $tag_spelling{$tag} ) {
                my $old_tag = $tag;
                my $correct = $tag_spelling{$tag};
                my $report_msg;
                if( !exists $values->{$correct} ) {
                    rename_tag( $dataset, $old_tag, $correct );
                    $report_msg =
                    "data name '$old_tag' replaced with '$correct' " .
                    'as specified in the replacement file ' .
                    "'$replacement_list'.";
                    push (@insert_reports, $report_msg);
                    warn "NOTE, $report_msg" . "\n";
                } else {
                    if ( exists $loops->{$old_tag} ||
                         exists $loops->{$correct} ) {
                        # comparing looped values is out of
                        #  the scope of this script
                        $report_msg = "'$old_tag' data item was left intact -- " .
                            "it should have been renamed to '$correct' as " .
                            'specified in the replacement file ' .
                            "'$replacement_list', but the replacement " .
                            'data item was already present in the data block. ' .
                            "Value comparison was skipped since the '" .
                             (exists $loops->{$old_tag} ? $old_tag : $correct ) .
                             "' data item was found within a loop structure.";
                        warn "NOTE, $report_msg" . "\n";
                    } elsif ( $values->{$old_tag}[0] eq $values->{$correct}[0] ) {
                        exclude_tag( $dataset, $old_tag );
                        $report_msg = "'$old_tag' data item was removed -- " .
                            "it should have been renamed to '$correct' as " .
                            'specified in the replacement file ' .
                            "'$replacement_list', but the replacement " .
                            "data item was already present in the data block " .
                            "and had the same value as the '$old_tag' data item.";
                        push (@insert_reports, $report_msg);
                        warn "NOTE, $report_msg" . "\n";
                    } else {
                        $report_msg = "'$old_tag' data item was left intact -- " .
                            "it should have been renamed to '$correct' as " .
                            'specified in the replacement file ' .
                            "'$replacement_list', but the replacement " .
                            "data item was already present in the data block " .
                            "and had a different value than the '$old_tag' " .
                            'data item.';
                        warn "NOTE, $report_msg" . "\n";
                    }
                }
            }
            if( $tag =~ /^_+(_[^_].*)$/ ) {
                my $old_tag = $tag;
                my $correct = $1;
                if( exists $dictionary_tags{$correct} &&
                    !exists $values->{$correct} ) {
                    rename_tag( $dataset, $tag, $correct );
                    my $report_msg = "data name '$old_tag' replaced with " .
                                     "'$correct'.";
                    push (@insert_reports, $report_msg);
                    warn "NOTE, $report_msg" . "\n";
                }
            }
        }
        insert_report_to_comments ($dataset,\@insert_reports);
    }

    eval {
        for my $dataset (@$data) {
            if( $output_format eq 'cif' ) {
                print_cif( $dataset, {
                    exclude_misspelled_tags => 0,
                    preserve_loop_order => 1,
                    fold_long_fields => 0,
                    dictionary_tags => \%dictionary_tags,
                    dictionary_tag_list => \@dictionary_tags,
                    keep_tag_order => $keep_tag_order,
                } );
            } elsif( $output_format eq 'json' ) {
                print cif2json( $dataset );
            } else {
                die "ERROR, unknown output format '$output_format'\n";
            }
        }
    };
    if ($@) {
        process_errors( {
          'message'       => $@,
          'program'       => $0,
        }, $die_on_error_level->{'ERROR'} )
    };
}
