#!/usr/bin/perl -w
#
# This script checks that everything that exists is documented,
# and vice versa.
#
# It does some pretty grotty ad-hoc parsing of both Rust sources,
# and the markdown in the reference manual.
#
# At the bottom of the file, in __DATA__, is a table of expected
# exceptions.
#
#
# With --json it outputs information about what is found where,
# as a JSON document.  (The schema is not currently documented.)
# (In this mode it doesn't perform most of the checks.)
#
# Implementation note:
#
# (I experimented with a more formal approach, including having the
# `bizarre` version of d-d-macros collect the fixed strings that a keyword
# was compared with during parsing.  It was difficult to properly
# categorise the keyword and the mechanism was confusing and invasive.)

use strict;
use JSON qw(to_json);

our ($whole, $file);

our $print_progress = 1;

sub progress (@) {
    if ($print_progress) {
	print @_;
    }
}

sub open_file ($) {
    ($file) = @_;
    open F, '<', $file or die "$file: $!";
    progress "  reading $file\n";
}

sub end_file () {
    F->error and die "$file: $!";
}

sub read_file ($) {
    open_file $_[0];
    local $/ = undef;
    $whole = <F> // die "$file: $!";
    end_file();
}

our %grokked;
our %ref_lines;

sub grokked ($$@) {
    my ($info, $note, @out) = @_;

    progress "    grokked $info [$note]\n      @out\n\n";
    foreach my $kw (@out) {
	$grokked{$kw} .= $note;
    }
}

sub grok_match ($$$$) {
    my ($file, $tag, $note, $unreach) = @_;

    read_file($file);
    my @out;
    my $any;

    while ($whole =~ m{
( \ + ) // .* \#\# \ maint/check-keywords-documented \ $tag \ \#\# .* \n
(
  (?: \1 .* \n ) +
)
    }gmx) {
	$any++;
	my $arm = $2;
	$arm =~ s{ \=\> .* $unreach .* \n }{}x
	  or die "$tag arm not => $unreach";
	$arm !~ m{\=\>}
	  or die
	  "----- ARM -----\n$arm\n----- ARM -----\n".
	  "multiple arms pasted together, parsing failed?";
	push @out, $arm =~ m{ SD :: (\w+) }gmx;
    }
    die "// ## maint/check-keywords-documented $tag ## not found"
      unless $any;

    grokked $tag, $note, map { lc } @out;
}

sub grok_not_in_bool () {
    grok_match('macros/boolean.rs', 'NotInBool', 'e', qr{void::unreachable});
}

sub grok_bool_only () {
    grok_match('macros/expand.rs', 'BoolOnly', 'b', qr{append_bool_only});
}

sub grok_enum () {
    read_file('macros/syntax.rs');
    $whole =~ m{
^ pub \ enum \ SubstDetails\b .* \{ \n
(
    (?: \ {4} .* \n | \n )*
)
^ \} $
    }mx or die "pub enum SubstDetails not found";

    my $enum_body = $1;
    my @out = $enum_body =~ m{^ \ {4} (\w+) \b}gmx;

    grokked 'enum variants', 'v', map { lc } @out;
}

sub grok_cases () {
    open_file('macros/paste.rs');
    my (@outk, @outh);
    while (<F>) {
	next unless m/^define_cases\! *\{$/..m/^\}$/;
	push @outk, m{\"(\w+)\"}g;
	push @outh, m{\bAs(\w+)\b}g;
    }
    grokked 'define_cases (kwd)', 'c', @outk;
    grokked 'define_cases (heck)', 'h', @outh;
    end_file();
}

sub grok_dbg_allkw () {
    open_file('macros/dbg_allkw.rs');
    my (@out_kw, @out_bool);
    while (<F>) {
	my $dollar_re;
	my $out;
	if (m/^\s* expand\!/x) {
	    $dollar_re = qr{\$};
	    $out = \@out_kw;
	} elsif (m/^\s* (bool)\!/x) {
	    $dollar_re = qr{};
	    $out = \@out_bool;
	} else {
	    next;
	}
	my $name_re = qr{\#?\w+};
	die "couldn't grok dbg_allkw macro invocation ($dollar_re) $_ ?" unless
   m/^\ * \w+\! \ * \{ \ * c, \ * $dollar_re    ($name_re)    \ * \}$/x ||
   m/^\ * \w+\! \ * \{ \ * c, \ * $dollar_re \{ ($name_re) [^{}]* \} \ * \}$/x;
	next if $1 =~ m{^\#};
	push @$out, $1;
    }
    grokked 'dbg_all_keywords (expand!)', 'd', @out_kw;
    grokked 'dbg_all_keywords (bool!)', 'D', @out_bool;
    end_file();
}

sub grok_document () {
    open_file 'doc/reference.md';
    my $in;
    my %out;
    my @case_cols;
    my $lno = 0;
    my $last_heading_lno;
    while (<F>) {
	$lno++;

	if (m{\#\# maint/check-keywords-documented (\w+) \#\#}) {
	    $in = $1;
	    next;
	}
	if (m{^\#\# }) {
	    $in = undef;
	    $last_heading_lno = $lno;
	    next;
	}
	next unless defined $in;

	my $out = \@{ $out{$in} };
	my $found = sub {
	    my ($div_prefix_char, $lno, @l) = @_;
	    @l = grep { $_ ne 'CASE_CHANGE' } @l;
	    push @$out, @l;
	    foreach my $kw (@l) {
		push @{ $ref_lines{"$div_prefix_char:$kw"} }, $lno;
	    }
	};

	if ($in eq 'cases') {
	    next unless m{^\|};
	    my @cols = split m{\|};
	    if (!@case_cols) {
		@case_cols = @cols;
		next;
	    }
	    next if m{^\|-----};

	    foreach my $i (0..$#cols) {
		my @l = $cols[$i] =~ m{\`(\w+)\`}g;
		if ($case_cols[$i] =~ m{\`heck\`}) {
		    push @{ $out{heck} }, @l;
		} elsif ($case_cols[$i] =~ m{\`CASE_CHANGE\`}) {
		    $found->('x', $last_heading_lno, @l);
		}
	    }
	} elsif ($in eq 'conditions') {
	    next unless m{^\#\#\# };
	    $found->('c', $lno, m{\`(\w+)[^\`]*\`}g);
	} elsif ($in eq 'expansions') {
	    next unless m{^\#\#\# };
	    # remove inner { }, repeatedly
	    while (s{ ( \{ [^{}]* ) \{ [^{}]* \} ( .* \} ) }{$1 $2}x) { }
	    $found->('x', $lno, m{\`\$(\w+)\`}g);
	    $found->('x', $lno, m{\`\$\{(\w+)[^\}]+\}\`}g);
	} else {
	    die "bad maint/check-keywords-documented $in !";
	}
    }
    end_file();

    my $grokked = sub {
	my ($note, $in) = @_;
	grokked $in, $note, @{ $out{$in} };
    };
    $grokked->('E', 'expansions');
    $grokked->('C', 'cases');
    $grokked->('H', 'heck');
    $grokked->('B', 'conditions');
}

our @discrepant;
our %note_descriptions;

sub correspondences () {
    my %expected;
    while (<DATA>) {
	s{^\s+}{};
	s{\s+$}{};
	next if m{^\#};
	next unless m{\S};

	if (m{^\: \s+ (\S) \s+ (\S.*)}x) {
	    $note_descriptions{$1} = $2;
	    next;
	}

	my ($exp, @l) = split /\s+/;
	die unless @l;
	foreach my $kw (@l) {
	    die "duplicate exception for $kw " if $expected{$kw};
	    $expected{$kw} = $exp;
	}
    }

    foreach my $kw (sort keys %grokked) {
	$_ = $grokked{$kw};

	# Expected combinations, which are not troubling:

	next if m{^vbDB$}; # ordinary bool-only keyword
	next if m{^vedE$}; # ordinary expansion-only keyword
	next if m{^vdDEB$}; # ordinary keyword, both bool and expansion

	next if m{^hH$}; # heck keyword (only)
	next if m{^cCC?$}; # case changing (possibly also example)
	next if m{^chCH$}; # case changing and heck keyword

	# This one is unusual, is it in the expected exception table?

	my $exp = $expected{$kw};
	if ($exp) {
	    delete $expected{$kw};
	    next if $_ eq $exp;
	}
	push @discrepant, { Kw => $kw, Got => $_, Exp => $exp };
    }

    foreach my $kw (keys %expected) {
	push @discrepant, { Kw => $kw, Exp => $expected{$kw} };
    }
}

sub print_divider () {
    print "------------------------------------------------------------\n";
}

sub print_discrepancies() {
    print "discrepancies\n";
    foreach my $disc (@discrepant) {
	printf "    keyword %-20s  found %s", $disc->{Kw}, $disc->{Got};
	if ($disc->{Exp}) {
	    printf " expected %s", $disc->{Exp};
	}
	print "\n";
    }
    print_divider();
    flush STDOUT;

    print STDERR <<END;
|
** Discrepancies detected between code and documentation **
|
END

    foreach my $disc (@discrepant) {
	printf STDERR "Keyword (or enum variant) `%s`\n", $disc->{Kw};
	my $print_notes = sub {
	    my ($heading, $notes, $if_none) = @_;
	    if (!$notes) {
		printf STDERR "%s\n", $if_none;
		return;
	    }
	    printf STDERR "%s\n", $heading;
	    if ($notes) {
		foreach my $note (split //, $notes) {
		    printf STDERR "    %s\n",
		      $note_descriptions{$note} // "UNDESCRIBED `$note`";
		}
		foreach my $note (keys %note_descriptions) {
		    next if $notes =~ m/$note/;
		    # elide "not:" for case changing stuff if there's no
		    # hint that this is going to be relevant
		    next if $note =~ m/[hc]/i and $notes !~ m/[hc]/i;
		    printf STDERR "    not: %s\n", $note_descriptions{$note};
		}
	    } else {
		printf STDERR "    None.\n";
	    }
	};
	$print_notes->(
            "Information and properties found in docs and source code:",
		       $disc->{Got},
		       'Keyword does not exist.'
		      );
	$print_notes->(
		       "Entry in exceptions table -- expected to find:",
		       $disc->{Exp},
		       "No entry in exceptions table.\n",
		      );
	printf STDERR "To suppress, add exceptions with note chars `%s`.\n",
	    $disc->{Got}
	    if $disc->{Got};
	printf STDERR "|\n";
    }
    printf STDERR <<END;
Docs and code should be fixed to match up,
or the exception table (__DATA__) adjusted.
END
}

sub grok_all() {
    grok_enum();
    grok_not_in_bool();
    grok_cases();
    grok_bool_only();
    grok_dbg_allkw();
    grok_document();
}

sub mode_check() {
    print_divider();
    print "reading source and documentation files\n";

    grok_all();

    correspondences();

    if (!@discrepant) {
	print "ok.\n";
	print_divider();
	exit 0;
    }

    print_discrepancies();

    exit 4;
}

sub mode_json() {
    $print_progress = 0;
    grok_all();
    print to_json({
        grokked => \%grokked,
        ref_lines => \%ref_lines,
    }) or die $!;
}

our $mode = 'check';

while (@ARGV) {
    $_ = shift @ARGV;
    if (m/^--json$/) {
	$mode = 'json';
    } else {
	die "unsupported option $_";
    }
}

my $mode_fn = ${*::}{"mode_$mode"} or die;
$mode_fn->();

__DATA__
# Expected exceptions table
# Each line is  <notechars>  <kw> [<kw> ...]
# (The lines starting `:` are used for text in the discrepancy reports.)
#
# Note characters have the following meanings:
#   Found in source code:
:     v    enum SubstDetails variant, in syntax.rs
:     e    expansion-only variant, ie NotInBool, according to boolean.rs
:     c    case change keyword, according to from define_cases! in paste.rs
:     h    heck name for a case change, according to define_cases! in paste.rs
:     b    BoolOnly variant, according to expand.rs
#   Found in reference documentation:
:     E    Documented as expansion keyword
:     C    Documented as case change keyword
:     H    Documented as heck name of a case change
:     B    Documented as condition (BoolOnly keyword)
#   Found in dbg_all_keywords:
:     d    $dbg_all_keywords dumps the expansion
:     D    $dbg_all_keywords dumps as a boolean

# we use placeholders in the docs, and handle case changing
# with its own combined SubstDetails variant
ve	changecase

# These are combined SD variants
dDEB	fvis fdefvis tvis
v	vis
dDEB	fmeta vmeta tmeta
v	xmeta

# Doc has both ${for fields ...} and ${for variants ...}, so E is repeated
veEE	for

# User-defined keywords
v	userdefined
