#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2020-09-01 12:59:15 +0300 (Tue, 01 Sep 2020) $
#$Revision: 8455 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.1.0/scripts/cif_sort_atoms $
#------------------------------------------------------------------------------
#*
#* Sort atoms in a CIF file in given order. Accepts more than one sorting
#* criterion.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use File::Basename qw( basename );
use COD::AtomProperties;
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::Print qw( print_cif );
use COD::ErrorHandler qw( process_errors
                          process_warnings
                          process_parser_messages
                          report_message );
use COD::SOptions qw( getOptions get_value );
use COD::SUsage qw( usage options );
use COD::ToolsVersion;

my @order_functions;
my $direction = 1;

my $use_parser = 'c';

my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;
my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

#* OPTIONS:
#*   -l, --lexicographic
#*                     Sort by lexicographic order (default).
#*   -Z, --atomic-number
#*                     Sort by atomic number.
#*   --ascending-numerical _atom_site_occupancy
#*                     Sort by values of CIF data item in ascending numerical
#*                     order.
#*   --descending-numerical _atom_site_occupancy
#*                     Sort by values of CIF data item in descending numerical
#*                     order.
#*   --ascending-lexical _atom_site_label
#*                     Sort by values of CIF data item in ascending lexical
#*                     order.
#*   --descending-lexical _atom_site_label
#*                     Sort by values of CIF data item in descending lexical
#*                     order.
#*
#*   -r, --reverse
#*                     Reverse the ordering.
#*
#*   --use-perl-parser
#*                     Use Perl parser for CIF parsing.
#*   --use-c-parser
#*                     Use Perl & C parser for CIF parsing (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**

@ARGV = getOptions(
    '-l,--lexicographic' => sub { push @order_functions, \&get_lexicographic_order },
    '-Z,--atomic-number' => sub { push @order_functions, \&get_atomic_order },

    '--ascending-numerical' =>
        sub { push @order_functions,
                   make_comparator_numeric( get_value, 1 ) },
    '--descending-numerical' =>
        sub { push @order_functions,
                   make_comparator_numeric( get_value, -1 ) },
    '--ascending-lexical' =>
        sub { push @order_functions,
                   make_comparator_lexic( get_value, 1 ) },
    '--descending-lexical' =>
        sub { push @order_functions,
                   make_comparator_lexic( get_value, -1 ) },

    '-r,--reverse' => sub { $direction = -1 },

    '--use-perl-parser' => sub{ $use_parser = 'perl' },
    '--use-c-parser'    => sub{ $use_parser = 'c' },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'         => sub { print 'cod-tools version ',
                                 $COD::ToolsVersion::Version, "\n";
                                 exit }
);

@order_functions = ( \&get_lexicographic_order ) if !@order_functions;

@ARGV = ( '-' ) unless @ARGV;

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

for my $filename (@ARGV) {
    my $options = { parser   => $use_parser,
                    no_print => 1 };

    my( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    if( !@{$data} ) {
        report_message( {
           'program'   => $0,
           'filename'  => $filename,
           'err_level' => 'WARNING',
           'message'   => 'file seems to be empty'
        }, 0 );
        next;
    }

    canonicalize_all_names( $data );

    for my $dataset (@$data) {
        my $values = $dataset->{values};
        my $dataname = 'data_' . $dataset->{name};

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        eval {
            my $atom_loop = $dataset->{inloop}{_atom_site_label};
            my @atoms;
            for my $i (0..$#{$dataset->{values}{_atom_site_label}}) {
                my %atom = map { $_ => $dataset->{values}{$_}[$i] }
                               @{$dataset->{loops}[$atom_loop]};
                $atom{index} = $i;
                push @atoms, \%atom;
            }
            my @order = map  { $_->{index} }
                        sort { for my $order (@order_functions) {
                                   my $cmp = $order->( $a, $b );
                                   return $direction * $cmp if $cmp;
                               } }
                             @atoms;
            if( defined $atom_loop ) {
                for my $tag (@{$dataset->{loops}[$atom_loop]}) {
                    for my $key (qw( precisions types values )) {
                        $dataset->{$key}{$tag} =
                            [ @{$dataset->{$key}{$tag}}[@order] ];
                    }
                }
            }

            print_cif( $dataset,
                       {
                            preserve_loop_order => 1,
                            keep_tag_order => 1
                       } );
        };
        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
    }
}

sub get_lexicographic_order
{
    my( $a, $b ) = @_;
    return $a->{_atom_site_label} cmp $b->{_atom_site_label};
}

sub get_atomic_order
{
    my( $a, $b ) = @_;

    my( $a_type, $b_type ) =
        map { /^([A-Za-z]{1,2})/ ? $1 : $_ }
        map { exists $_->{_atom_site_type_symbol} ?
                     $_->{_atom_site_type_symbol} :
                     $_->{_atom_site_label} }
            ( $a, $b );

    if( !defined $a_type || !defined $b_type ) {
        return (defined $b_type) - (defined $a_type);
    }

    my( $a_number, $b_number );
    if( exists $COD::AtomProperties::atoms{$a_type} ) {
        $a_number = $COD::AtomProperties::atoms{$a_type}{atomic_number};
    } else {
        warn "unknown chemical type '$a_type'\n";
    }
    if( exists $COD::AtomProperties::atoms{$b_type} ) {
        $b_number = $COD::AtomProperties::atoms{$b_type}{atomic_number};
    } else {
        warn "unknown chemical type '$b_type'\n";
    }
    if( defined $a_number && defined $b_number ) {
        return $a_number <=> $b_number || $a_type cmp $b_type;
    } else {
        return (defined $b_number) - (defined $a_number) ||
               $a_type cmp $b_type;
    }
}

sub make_comparator_numeric
{
    my( $data_name, $direction ) = @_;
    return sub { $direction * ( $a->{$data_name} =~ /^[.?]$/ ||
                                $b->{$data_name} =~ /^[.?]$/
                                ? $a->{$data_name} cmp $b->{$data_name}
                                : drop_precision( $a->{$data_name} ) <=>
                                  drop_precision( $b->{$data_name} ) ) };
}

sub make_comparator_lexic
{
    my( $data_name, $direction ) = @_;
    return sub { $direction * ( $a->{$data_name} cmp $b->{$data_name} ) };
}

sub drop_precision
{
    my( $value ) = @_;
    $value =~ s/\(.+\)$//;
    return $value;
}
