#!/usr/bin/perl
#=======================================================================
# Copyright (c) 2000-2001 Daniele Giacomini daniele@swlibero.org
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#=======================================================================
# textchk [OPTION]... FILE_TO_BE_ANALYZED [REPORT_FILE [DIAG_FILE]]
#
# Analyze the document searching common mistakes (syntax and style)
# that can be found with regular expressions rules.
#=======================================================================

# We want to use gettext.
use POSIX;
use Locale::gettext;
setlocale (LC_MESSAGES, "");
textdomain ("textchk");

#sub gettext
#{
#    return $_[0];
#}

# Program version.
$VERSION = "2001.08.29";

# Program canonical name.
$program_canonical_name="Textchk";

# File containing error rules and special exceptions.
$current_error_and_exception_file = "./.textchk.rules";
$personal_error_and_exception_file = "$ENV{HOME}/.textchk.rules";
$site_error_and_exception_file = "/etc/textchk.rules";

$special_case_file = "./.textchk.special";

#-----------------------------------------------------------------------
# Variables after this point are not intended to be changed.
#-----------------------------------------------------------------------

# Field separator.
$FIELD_SEPARATOR = "____";

# Program name.
$program_canonical_name = "Textchk";


#-----------------------------------------------------------------------
# Show the right syntax for the use of this program.
sub help_syntax
{
    printf STDOUT gettext
	( "Usage: %s --input-type=TYPE FILE_TO_BE_ANALYZED [REPORT_FILE [DIAG_FILE]]\n"
        . "       %s --help\n"
        . "       %s --version\n"
        . "\n"
        . "Check for common syntax and style mistakes inside a text.\n"
        . "\n"
        . "Options:\n"
        . "--help              display this help and exit.\n"
        . "--version           display version information and exit.\n"
        . "--input-type=TYPE   define the input type:\n"
	. "    standard        input is a text file where every paragraph\n"
	. "                      is on a single line;\n"
	. "    man             input is a man page file;\n"
	. "    html            input is an HTML file;\n"
	. "    texi, texinfo   input is a Texinfo source file."
        . "\n"
        . "Arguments:\n"
        . "\n"
        . "FILE_TO_BE_ANALYZED the input file.\n"
        . "\n"
        . "REPORT_FILE         summary file of reported errors.\n"
        . "\n"
        . "DIAG_FILE           processing information output file.\n"
        . "                      It helps to find troubles with the declared\n"
        . "                      rules.\n"
	. "\n"
	. "Report bugs to <daniele\@swlibero.org>\n"),
	$0, $0, $0;
}

#-----------------------------------------------------------------------
# Show version information.
sub version_info
{
    printf STDOUT gettext
	( "%s %s\n"
	. "\n"
	. "Copyright (C) 2000 Daniele Giacomini <daniele\@swlibero.org>\n"
	. "This is free software; see the source for copying conditions.\n"
	. "There is NO warranty; not even for MERCHANTABILITY or FITNESS\n"
	. "FOR A PARTICULAR PURPOSE.\n"
	. "\n"
	. "Written by Daniele Giacomini <daniele\@swlibero.org>\n" ),
	$program_canonical_name, $VERSION;
}

#-----------------------------------------------------------------------
# Define a temporary file.
sub temporary_file
{

    local ($temp_dir) = "";
    local ($random_number) = 0;
    local ($random_file_name) = "";

    # Check for temporary dir.
    if (-d $ENV{TEMPDIR}
        && -r $ENV{TEMPDIR}
        && -w $ENV{TEMPDIR}
        && -x $ENV{TEMPDIR})
      {
        # This directory is good.
        $temp_dir = $ENV{TEMPDIR};
      }
    elsif (-d "/tmp" && -r "/tmp" && -w "/tmp" && -x "/tmp")
      {
        # This directory is good.
	$temp_dir = "/tmp";
      }
    elsif (-r "." && -w "." && -x ".")
      {
	# This directory is good.
	$temp_dir = ".";
      }
    else
      {
        # We cannot use any temporary file!
        printf STDERR gettext ("%s: cannot create any temporary file!\n"),
		      $0;
        exit 1;
      }

    # If we are here, we have a valid temporary directory.
    # We try to find a good name for the file.
    while (1)
      {
        # Define the random number (six digits).
        $random_number = int ((rand) * 1000000);
        $random_file_name = "$temp_dir/TF$random_number.tmp";

        # Check if it is new.
	if (-e $random_file_name)
	  {
	    # The file exists already.
	    next;
          }
        else
          {
            if (open (TEMP_FILE, "> $random_file_name"))
	      {
	        # It works.
	        close (TEMP_FILE);
	        last;
	      }
	    else
	      {
	        # Don't know what to do.
    		printf STDERR
		    gettext ("%s: cannot create the temporary file %s\n"),
		    $0, $random_file_name;
	        next;
	      }

	    # This point cannot be reached.
    	    printf STDERR
		gettext ("%s: function %s unknown error 1\n"),
		$0, "&temporary_file()";
	  }

	# This point cannot be reached.
    	printf STDERR
	    gettext ("%s: function %s unknown error 2\n"),
	    $0, "&temporary_file()";
      }		

    # Return the file name.
    return "$random_file_name";
}

#-------------------------------------------------------------------
# Man pages must be adapted before the analysis.
# &textchk_man (SOURCE_FILE_NAME, REPORT_FILE, DIAG_FILE)
sub textchk_man
{
    local ($source_file) = $_[0];
    local ($report_file) = $_[1];
    local ($diag_file) = $_[2];
    local ($directory_name) = "";
    local ($file_name) = "";
    local ($temporary_file1) = "";
    local ($temporary_file2) = "";

    # To use man without ambiguity between file names and command
    # names, we need to give a path for the file.
    $directory_name=`dirname $source_file`;
    chomp ($directory_name);
    $file_name=`basename $source_file`;
    chomp ($file_name);
    $source_file="$directory_name/$file_name";
    $temporary_file1 = &temporary_file;
    $temporary_file2 = &temporary_file;

    # We use groff to create a temporary file.
    system ("echo \".ll 10000c\" > $temporary_file1");
    system ("cat $source_file >> $temporary_file1");
    system ("groff -Tlatin1 -mandoc $temporary_file1 | col -bx > $temporary_file2");

    # Finally we start the analysis.
    &textchk_standard ($temporary_file2, $report_file, $diag_file);

    # Temporary file deletion.
    unlink ("$temporary_file1");
    unlink ("$temporary_file2");
}

#-------------------------------------------------------------------
# HTML pages must be adapted before the analysis.
# &textchk_html (SOURCE_FILE_NAME, REPORT_FILE, DIAG_FILE)
sub textchk_html
{
    local ($source_file) = $_[0];
    local ($report_file) = $_[1];
    local ($diag_file) = $_[2];
    local ($directory_name) = "";
    local ($file_name) = "";
    local ($temporary_file) = "";

    # To use man without ambiguity between file names and command
    # names, we need to give a path for the file.
    $directory_name=`dirname $source_file`;
    chomp ($directory_name);
    $file_name=`basename $source_file`;
    chomp ($file_name);
    $source_file="$directory_name/$file_name";
    $temporary_file = &temporary_file;

    # We use lynx to create a temporary file.
    system ("lynx -dump -nolist -width=99999 $source_file > $temporary_file");

    # Finally we start the analysis.
    &textchk_standard ($temporary_file, $report_file, $diag_file);

    # Temporary file deletion.
    unlink ("$temporary_file");
}

#-------------------------------------------------------------------
# Texinfo documents must be adapted before the analysis.
# &textchk_texinfo (SOURCE_FILE_NAME, REPORT_FILE, DIAG_FILE)
sub textchk_texinfo
{
    local ($source_file) = $_[0];
    local ($report_file) = $_[1];
    local ($diag_file) = $_[2];
    local ($directory_name) = "";
    local ($file_name) = "";
    local ($temporary_file) = "";

    # To use man without ambiguity between file names and command
    # names, we need to give a path for the file.
    $directory_name=`dirname $source_file`;
    chomp ($directory_name);
    $file_name=`basename $source_file`;
    chomp ($file_name);
    $source_file="$directory_name/$file_name";
    $temporary_file = &temporary_file;

    # We use makeinfo to create a temporary file.
    system ("makeinfo --fill-column 999999 --no-headers	--no-split --force --output=$temporary_file $source_file");

    # Finally we start the analysis.
    &textchk_standard ($temporary_file, $report_file, $diag_file);

    # Temporary file deletion.
    unlink ("$temporary_file");
}

#-----------------------------------------------------------------------
# &textchk_standard (FILE_TO_BE_CHECKED, REPORT_FILE, DIAG_FILE)
sub textchk_standard
{
    local ($file_to_be_checked) = $_[0];
    local ($report_file) = $_[1];
    local ($diag_file) = $_[2];

    # The rule files are loaded inside these arrays.
    local (@array_kind_of_errors) = ();
    local (@array_errors) = ();
    local (@array_explanations) = ();
    local (@array_exceptions) = ();
    local (@array_particularities) = ();
    local ($index_errors) = 0;
    local ($index_exceptions) = 0;
    local ($index_particularities) = 0;

    # Scalar variables used to read and handle errors, exceptions and
    # particularities.
    local ($record) = "";
    local ($error_re) = "";
    local ($error_kind) = "";
    local ($error_explanation) = "";
    local ($exception) = "";
    local ($particularity) = "";

    # Scalar used to read the file subject to analysis.
    local ($line) = "";

    # Parts of text that is considered a mistake.
    local ($head) = "";
    local ($content) = "";
    local ($tail) = "";
    local ($whole) = "";

    # Flags that tells the presence or persistence of an error.
    local ($match_found) = 0;
    local ($wrong) = 0;

    # Flag that states that a line is all uppercase.
    local ($all_uppercase) = 0;

    # Open the file containing the error and exception rules is loaded.
    open (ERROREXCEPTION,
	  "cat $current_error_and_exception_file $personal_error_and_exception_file $site_error_and_exception_file 2> /dev/null |");

    # Scan the file.
    while ($record = <ERROREXCEPTION>) {

	# Delete the newline code.
	chomp ($record);

	# If the record is empty, or it contains a comment, repeat the
	# loop.
	if ($record =~ m/^\s*$/)
	  {
    	    next;
	  };
	if ($record =~ m/^\s*#.*$/)
	  {
            next;
	  };

	# Analyze depending on the type of record.
	if ($record =~ m/^ERR/
	    || $record =~ m/^DBL/)
	  {
	    # This is an error description.

	    # Extract the content.
	    if ($record =~ m/^ERR$FIELD_SEPARATOR(.*)$FIELD_SEPARATOR(.*)$/)
	      {
		$error_re = $1;
		$error_explanation = $2;
		$error_kind = "ERR";
	      }
	    elsif ($record =~ m/^ERR$FIELD_SEPARATOR(.*)$/)
	      {
		$error_re = $1;
		$error_explanation = "";
		$error_kind = "ERR";
	      }
	    elsif ($record =~ m/^DBL$FIELD_SEPARATOR(.*)$FIELD_SEPARATOR(.*)$/)
	      {
		$error_re = $1;
		$error_explanation = $2;
		$error_kind = "DBL";
	      }
	    elsif ($record =~ m/^DBL$FIELD_SEPARATOR(.*)$/)
	      {
		$error_re = $1;
		$error_explanation = "";
		$error_kind = "DBL";
	      }
	    else
	      {
		# There is something wrong on the record format.
	        printf STDOUT gettext ("Syntax error on record:\n%s\n"),
		              $record;
    		next;
	      }

	    # Put into the arrays.
	    $index_errors = $#array_errors+1;
	    $array_kind_of_errors[$index_errors] = $error_kind;
	    $array_errors[$index_errors] = $error_re;
	    $array_explanations[$index_errors] = $error_explanation;

	    # !!!!!!!!!!!!
	    #print STDOUT ( $index_errors . " " . $array_kind_of_errors[$index_errors] . " " . $array_errors[$index_errors] . " " . $array_explanations[$index_errors] . "\n" );

	    # Prepare exceptions array.
	    $array_exceptions[$index_errors] = ();
	  }
	elsif ($record =~ m/^EXC/)
	  {
	    # This is an exception description.

	    # Extract the content.
	    if ($record =~ m/^EXC$FIELD_SEPARATOR(.*)$FIELD_SEPARATOR(.*)$/)
	      {
		$exception = $1;
		printf STDOUT
                  (gettext ("Explanation field not required inside exception records:\n%s\n"),
		   $record);

	      }
	    elsif ($record =~ m/^EXC$FIELD_SEPARATOR(.*)$/)
	      {
		$exception = $1;
	      }
	    else
	      {
		# There is something wrong on the record format.
		printf STDOUT gettext ("Syntax error on record:\n%s\n"),
			      $record;
    		next;
	      }

	    # Put into the arrays.
	    $index_exceptions = $#{$array_exceptions[$index_errors]} +1 ;
	    $array_exceptions[$index_errors][$index_exceptions] = $exception;
	  }
      }

    # Close the file.
    close (ERROREXCEPTION);
    
    # The file containing the particularities is loaded.
    open (PARTICULARITIES, "cat $special_case_file 2> /dev/null |");

    # Scan the file.
    while ($particularity = <PARTICULARITIES>)
      {
	# Delete the newline code.
	chomp ($particularity);

	# If the record is empty, repeat the loop (no comments are
	# allowed).
        if ($particularity =~ m/^\s*$/)
	  {
            next;
          };

	# Put into the array.
	$index_particularities = $#array_particularities+1;
	$array_particularities[$index_particularities] = $particularity;
      }

    # Close the file.
    close (PARTICULARITIES);

    # Start the document scan.
    open (DOCUMENT, "< $file_to_be_checked");
    open (REPORT, "> $report_file");
    open (DIAG, "> $diag_file");

    while ($line = <DOCUMENT>)
      {
	# Delete the newline code.
	chomp($line);

	# If the line is empty, repeat the loop.
	if ($line =~ m/^\s*$/)
	  {
    	    next;
	  };

	# If the line contains text all uppercase, save the information.
	if ($line =~ m/[a-z]/)
	  {
	    $all_uppercase = 0;
	  }
	else
	  {
	    $all_uppercase = 1;
	  };

	# Reset the flag.
	$wrong = 0;

	# Scan the error array.
	for ($index_errors = 0;
	     $index_errors <= $#array_errors;
	     $index_errors++)
	  {
	    $error_re = $array_errors[$index_errors];
	    $error_kind = $array_kind_of_errors[$index_errors];
	    $error_explanation = $array_explanations[$index_errors];

	    # Do the comparison.
	    # If the line is all uppercase, the comparison is not
	    # case sensible.
	    if ($all_uppercase)
	      {
	        if ($error_kind eq "ERR")
		  {
		    if ($line =~ m/$error_re/i)
		      {
		        $match_found = 1;
		      }
		    else
		      {
		        $match_found = 0;
		      }
		  }
		elsif ($error_kind eq "DBL")
		  {
		    if ($line =~ m/(\b$error_re\b)\s+\b\1\b/i)
		      {
		        $match_found = 1;
		      }
		    else
		      {
		        $match_found = 0;
		      }
		  }
		else
		  {
		    printf STDOUT
                      (gettext ("Strange type of error: %s\n"),
		         $error_kind);
		  }
	      }
	    else
	      {
	        if ($error_kind eq "ERR")
		  {
		    if ($line =~ m/$error_re/)
		      {
		        $match_found = 1;
		      }
		    else
		      {
		        $match_found = 0;
		      }
		  }
		elsif ($error_kind eq "DBL")
		  {
		    if ($line =~ m/(\b$error_re\b)\s+\b\1\b/)
		      {
		        $match_found = 1;
		      }
		    else
		      {
		        $match_found = 0;
		      }
		  }
		else
		  {
		    printf STDOUT
                      (gettext ("Strange type of error: %s\n"),
		         $error_kind);
		  }
	      }
	
	    # If the comparison is positive, start the search.
	    if ($match_found)
	      {
	        if ($error_kind eq "ERR")
		  {
		    # Try to extract three words before and after the problem.
		    $line =~
	    	      m/(\S*\s*\S*\s*\S*\s*)($error_re)(\s*\S*\s*\S*\s*\S*)/;

		    # Activate the flag that indicate the presence of an error.
		    $wrong = 1;

		    # Saves the tree parts.
		    $head = "$1";
		    $content = "$2";
		    $tail = "$3";
		    
		    # If the regexp does not match anymore...
		    if ($content eq "")
		      {
			$head = "";
			$content = "$line";
			$tail = "";
		      }
		    
		  }
	        elsif ($error_kind eq "DBL")
		  {
		    # Try to extract three words before and after the problem.
		    $line =~
	    	      m/(\S*\s*\S*\s*\S*\s*)\b($error_re)\b\s+\b\2\b(\s*\S*\s*\S*\s*\S*)/;

		    # Activate the flag that indicate the presence of an error.
		    $wrong = 1;

		    # Saves the tree parts.
		    $head = "$1";
		    $content = "$2 $2";
		    $tail = "$3";

		    # If the regexp does not match anymore...
		    if ($content eq "")
		      {
			$head = "";
			$content = "$line";
			$tail = "";
		      }

		  }
		else
		  {
		    printf STDOUT
                      (gettext ("Strange type of error: %s\n"),
		         $error_kind);
		  }
		  

		# This is the extracted string.
		$whole = "${head}${content}${tail}" ;

		# Write on the diagnostic file.
		print DIAG "\n";
		print DIAG "??? ${head}>>${content}<<${tail}\n";
		print DIAG "ERR $error_re\n";

	      }
	    else
	      {
		# If this model doesn't correspond, jump to the next loop.
		next;
	    
	      }
	    
	    if ($wrong)
	      {
		# It seems wrong: scan exceptions.
	    
		# $array_exceptions[$index_errors] contains another array,
		# the exceptions specifically made for a particular
		# error; this way, $#{$array_exceptions[$index_errors]} is
		# the index of the last element of this array.
		# If this array is empty, the value
		# $#{$array_exceptions[$index_errors]} should be negative.
		for ($index_exceptions = 0;
		     $index_exceptions <= $#{$array_exceptions[$index_errors]}
		       && $#{$array_exceptions[$index_errors]} >= 0;
		     $index_exceptions++)
		  {
		    $exception =
			$array_exceptions[$index_errors][$index_exceptions];

		    # Annotate inside the diagnose file.
		    print DIAG "EXC $exception\n";

		    # Do the comparison.
		    if ($whole =~ m/$exception/)
		      {
			# As the exception match,
			# this kind of error is not to be considered
			# and other exceptions are not to be checked.
			$wrong = 0;

			# Annotate on the diagnostic file.
			print DIAG "EXC $exception\n";

			# Terminate the for loop.
			last;
		      }
		  } # for
	      }

	    # Verify if the error is still there.
	    if ($wrong)
	      {
		# It seems still wrong: scan particularities.
		for ($index_particularities = 0;
		     $index_particularities <= $#array_particularities;
		     $index_particularities++)
		  {
		    $particularity = $array_particularities[$index_particularities];

		    # Do the comparison; $whole is the extracted string
		    # that contains the possible error.
		    if ($whole eq $particularity)
		      {

			# In this case, the line is correct this way;
			# the flag is reset and another error is analyzed.
			$wrong = 0;

			# Annotate inside the diagnostic file.
			print DIAG "SPC $particularity\n";
		      }
		  }
	      }

	    # Is the error still there?
	    if ($wrong)
	      {
		# The error is annotated inside the report file.
		print REPORT "$whole\n";

		# Print this information also using the standard output,
		# showing what is exactly considered the mistake.
		print STDOUT "$error_explanation\n";
		print STDOUT "    $head>>$content<<$tail\n";

		# Annotate inside the diagnostic file.
		print DIAG "!!! $head>>$content<<$tail\n";
		
		#exit from the for loop.
		last;
	      }
	  } # for
      }

    # Close files.
    close (DOCUMENT);
    close (REPORT);
    close (DIAG);
}

#----------------------------------------------------------------------
# Start of program.
#----------------------------------------------------------------------
local ($file_to_be_checked) = "";
local ($report_file) = "";
local ($diag_file) = "";
local ($n) = 0;
local ($input_type) = "";

# Scan arguments.
for ($n = 0; $n <= $#ARGV; $n++)
  {

    # Analyze argument $n.
    if ($ARGV[$n] !~ m/^-/)
      {
	# Options are terminated as this argument has no minus at the
	# beguinning. This must be a file name.
	if ($file_to_be_checked eq "")
	  {
	    $file_to_be_checked = $ARGV[$n];
	  }
	elsif ($report_file eq "")
	  {
	    $report_file = $ARGV[$n];
	  }
	elsif ($diag_file eq "")
	  {
	    $diag_file = $ARGV[$n];
	  }
	else
	  {
	    # As all files are defined, there is a mistake.
	    printf STDOUT gettext ("%s: no arguments allowed after the third file name: %s\n"),
		          $0, $ARGV[$n];
    	    exit 1;
	  }
      }
    elsif ($ARGV[$n] eq "--help")
      {
	# The user is asking for help.
    	&help_syntax;
    	exit 0;
      }
    elsif ($ARGV[$n] eq "--version")
      {
	# The user wants to know the program version.
	&version_info;
    	exit 0;
      }
    elsif ($ARGV[$n] =~ m/^--input-type=(.*)$/)
      {
	# The user tells the input type.
	# We need to verify that it wasn't already definied.
	if ($input_type eq "")
	  {
	    $input_type = $1;
	  }
	else
	  {
	    printf STDOUT gettext ("%s: only one input file type is allowed: %s\n"),
	                  $0, $ARGV[$n];
    	    exit 1;
	  }
      }
    else
      {
	# Must be an unknown option.
	printf STDOUT gettext ("%s: unknown option: %s\n"),
	              $0, $ARGV[$n];
    	exit 1;
      }
  }

# Are arguments logical?
# Define default values an prepare some values.
if ($input_type ne "standard"
    && $input_type ne "html"
    && $input_type ne "texi"
    && $input_type ne "texinfo"
    && $input_type ne "man")
  {
    printf STDOUT gettext ("%s: unknown input file type: %s\n"),
	                  $0, $input_type;
    printf STDOUT gettext ("%s: using input file type \"standard\" instead\n"),
	                  $0;
    $input_type = "standard";
  }

if ($report_file eq "")
  {
    $report_file = "$file_to_be_checked.err";
    $diag_file = "$file_to_be_checked.diag";
  }
elsif ($diag_file eq "")
  {
    $diag_file = "$report_file.diag";
  }

# Finally, do the work.
if ($input_type eq "standard")
  {
    &textchk_standard ($file_to_be_checked, $report_file, $diag_file);
  }
elsif ($input_type eq "man")
  {
    &textchk_man ($file_to_be_checked, $report_file, $diag_file);
  }
elsif ($input_type eq "html")
  {
    &textchk_html ($file_to_be_checked, $report_file, $diag_file);
  }
elsif ($input_type eq "texi"
       || $input_type eq "texinfo")
  {
    &textchk_texinfo ($file_to_be_checked, $report_file, $diag_file);
  }

#=======================================================================
