#! /usr/bin/perl
# debbugs-spam is part of debbugs, and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2012 by Don Armstrong <don@donarmstrong.com>.


use warnings;
use strict;

use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

=head1 NAME

debbugs-spam -- Scan log files for spam and populate nnn.log.spam

=head1 SYNOPSIS

debbugs-spam [options] bugnumber [[bugnumber2]..]

 Options:
  --spool-dir debbugs spool directory
  --debug, -d debugging level (Default 0)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--spool-dir>

Debbugs spool directory; defaults to the value configured in the
debbugs configuration file.

=item B<--debug, -d>

Debug verbosity.

=item B<--help, -h>

Display brief useage information.

=item B<--man, -m>

Display this manual.

=back

=head1 SUBCOMMANDS

=over

=item B<auto-scan>

Automatically scan messages using spamassassin and mark messages as
spam which hit the threshold, and those that are highly negative as
ham.

=item B<score>

Output the score of all of the messages in a bug

=over

=item B<--skip-seen> Skip messages which have previously been classified

=back

=item B<mark-spam>

Mark messages as spam if there is a regex match to subject or message
id

=item B<mark-ham>

Mark messages as ham if there is a regex match to subject or message
id

=item B<learn>

Learn from messages which are ham/spam

=back


=head1 EXAMPLES

Start spamd:

  /usr/sbin/spamd --socketpath=/home/debbugs/spamd_socket \
      --nouser-config --cf='include /home/debbugs/.spamassassin/user_prefs' \
      --cf='allow_user_rules 1' --allow-tell;

Then score bugs:

  debbugs-spam --spamc-opts '-U' --spamc-opts '/home/debbugs/spamd_socket' \
      score 859123;

=cut


use vars qw($DEBUG);

use Debbugs::Log qw(record_regex);
use Debbugs::Log::Spam;
use Debbugs::Config qw(:config);
use Debbugs::Command qw(:all);
use IPC::Open3 qw(open3);
use Carp;

my %options =
    (debug   => 0,
     help    => 0,
     man     => 0,
     verbose => 0,
     quiet   => 0,
     quick   => 0,
     spamc   => 'spamc',
     spamc_opts => [],
    );

handle_main_arguments(\%options,
                      'quick|q',
                      'service|s',
                      'sysconfdir|c',
                      'spamc=s' => 0,
                      'spamc_opts|spamc-opts=s@' => 0,
                      'spool_dir|spool-dir=s',
                      'debug|d+','help|h|?','man|m');

my %subcommands =
    ('auto-scan' => {function => \&auto_spamscan,
                     arguments => {'ham_threshold|ham-threshold=s' => 0,
                                  },
                     defaults => {ham_threshold => -5},
                    },
     'score' => {function => \&score_bug,
                 arguments => {'skip_seen|skip-seen!' => 0
                              },
                },
     'mark-spam' => {function => \&mark_spam,
                    },
     'mark-ham' => {function => \&mark_ham,
                   },
     'learn' => {function => \&learn,
                },
     'help' => {function => sub {pod2usage({verbose => 2});}}
    );

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

my @USAGE_ERRORS;
$options{verbose} = $options{verbose} - $options{quiet};

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

my ($subcommand) = shift @ARGV;
if (not defined $subcommand) {
    $subcommand = 'help';
    print STDERR "You must provide a subcommand; displaying usage.\n";
    pod2usage();
} elsif (not exists $subcommands{$subcommand}) {
    print STDERR "$subcommand is not a valid subcommand; displaying usage.\n";
    pod2usage();
}

if (exists $options{spool_dir} and defined $options{spool_dir}) {
    $config{spool_dir} = $options{spool_dir};
}
if ($subcommand ne 'help') {
    chdir($config{spool_dir}) or die "chdir $config{spool_dir} failed: $!";
}
my $opts =
    handle_subcommand_arguments(\@ARGV,
                                $subcommands{$subcommand}{arguments},
                                $subcommands{$subcommand}{defaults},
                               );
$subcommands{$subcommand}{function}->(\%options,$opts,\%config,\@ARGV);


sub mark_ham {
    mark_it('ham',@_);
}

sub mark_spam {
    mark_it('spam',@_);
}

sub mark_it {
    my ($spam_ham,$options,$opts,$config,$argv) = @_;
    my $regex = shift @{$argv};
    for my $bug_num (@{$argv}) {
        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
            die "Unable to open bug log spam for $bug_num";
        foreachmsg(sub {
                       my ($bn,$rec,$mid) = @_;
                       my $body = $rec->{text};
                       my ($subject) = $body =~ /^Subject: *(.+)$/mi;
                       my $is_match = 0;
                       if ($subject =~ /$regex/) {
                           $is_match = 1;
                       }
                       if ($mid =~ /$regex/) {
                           $is_match = 1;
                       }
                       if ($is_match) {
                           print STDERR "it's a match" if $DEBUG;
                           if ($spam_ham eq 'spam') {
                               $spam->add_spam($mid);
                           } else {
                               $spam->add_ham($mid);
                           }
                       }
                   },
                   $bug_num
                  );
        $spam->save();
    }
}

sub learn {
    my ($options,$opts,$config,$argv) = @_;
    for my $bug_num (@{$argv}) {
        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
            die "Unable to open bug log spam for $bug_num";
        foreachmsg(sub {
                       my ($bn,$rec,$mid) = @_;
                       my $score;
                       if ($spam->is_spam($mid)) {
                           $score //=
                               spam_score($rec,$options->{spamc},
                                          [@{$options->{spamc_opts}},
                                           '-L','spam'
                                          ]
                                         );
                           print STDERR "learning spam" if $DEBUG;
                       } elsif ($spam->is_ham($mid)) {
                           $score //=
                               spam_score($rec,$options->{spamc},
                                          [@{$options->{spamc_opts}},
                                           '-L','ham'
                                          ]
                                         );
                           print STDERR "learning ham" if $DEBUG;
                       } else {
                           print STDERR "not learning" if $DEBUG;
                       }
                       print STDERR " from $mid" if $DEBUG;
                   },
                   $bug_num
                  );
    }
}


sub score_bug {
    my ($options,$opts,$config,$argv) = @_;
    for my $bug_num (@{$argv}) {
        my @bug_score =
            spam_score_bug($bug_num,
                           $options->{spamc},
                           $options->{spamc_opts},
                           $opts->{skip_seen},
                          );
        print "$_->{score} $_->{message_id} $_->{subject}\n"
            foreach @bug_score;
    }
}

sub auto_spamscan {
    my ($options,$opts,$config,$argv) = @_;

    for my $bug_num (@{$argv}) {
        my $spam = Debbugs::Log::Spam->new(bug_num => $bug_num) or
            die "Unable to open bug log spam for $bug_num";
        foreachmsg(sub {
                       my ($bn,$rec,$mid) = @_;
                       if ($spam->is_spam($mid)) {
                           print STDERR "already spam\n" if $DEBUG;
                           return;
                       }
                       if ($spam->is_ham($mid)) {
                           print STDERR "already ham\n" if $DEBUG;
                           return;
                       }
                       my ($score,$is_spam,$report,$threshold) =
                           spam_score($rec,
                                      $options->{spamc},
                                      $options->{spamc_opts},
                                     );
                       if ($is_spam) {
                           print STDERR "it's spam ($score)\n" if $DEBUG;
                           $spam->add_spam($mid);
                       } elsif ($score < $opts->{ham_threshold}) {
                           print STDERR "it's really ham ($score)\n" if $DEBUG;
                           $spam->add_ham($mid);
                       }
                       else {
                           print STDERR "it's ham ($score)\n" if $DEBUG;
                       }
                   },
                   $bug_num,
                  );
        $spam->save();
    }
}

sub spam_score_bug {
    my ($bug,$spamc,$spamc_opts,$skip_seen) = @_;

    my $spam;
    if ($skip_seen) {
        $spam = Debbugs::Log::Spam->new(bug_num => $bug) or
            die "Unable to open bug log spam for $bug";
    }
    my @records;
    foreachmsg(sub {
                   my ($bn,$rec,$mid) = @_;
                   my $score;
                   if ($skip_seen) {
                       if ($spam->is_spam($mid)) {
                           $score = 999;
                       } elsif ($spam->is_ham($mid)) {
                           $score = -999;
                       }
                   }
                   $score //=
                       spam_score($rec,$spamc,$spamc_opts);
                   my ($subject) = $rec->{text} =~ /^Subject: *(.+)$/mi;
                   push @records,
                      {message_id => $mid,
                       score => $score,
                       subject => $subject,
                      };
               },
               $bug
              );
    return @records;
}

sub spam_score {
    my ($record,$spamc,$spamc_opts) = @_;
    my ($score,$threshold,$report);
    my $is_spam = 0;
    eval {
        $report = '';
        $score = 0;
        $threshold = 5;
        my ($spamc_in,$spamc_out);
        my $old_sig = $SIG{"PIPE"};
        $SIG{"PIPE"} = sub {
            die "SIGPIPE in child for some reason";
        };
        my $childpid =
            open3($spamc_in,$spamc_out,0,
                  $spamc,'-E','--headers',@{$spamc_opts}) or
                      die "Unable to fork spamc: $!";
        if (not $childpid) {
            die "Unable to fork spamc";
        }
        print {$spamc_in} $record->{text};
        close($spamc_in) or die "Unable to close spamc_in: $!";
        waitpid($childpid,0);
        my $exit_code = $? >> 8;
        if ($exit_code) {
            $is_spam = 1;
        }
        while (<$spamc_out>) {
            if (/^X-Spam/) {
                $report .= $_;
                if (/^X-Spam-Status: (Yes|No), score=(-?[\d\.]+) required=(-?[\d\.]+)/) {
                    $threshold = $3;
                    $score = $2;
                }
            }
            if (/^\s*$/) {
                last;
            }
        }
        if ($DEBUG) {
            print STDERR "[$exit_code] [$score/$threshold]\n$report\n";
        }
        close($spamc_out);
        $SIG{"PIPE"} = $old_sig;
    };
    if ($@) {
        carp "processing of message failed [$@]\n";
        return undef;
    }
    return wantarray?($score,$is_spam,$report):$score;
}

sub foreachmsg {
    my ($sub,$bug_num) = @_;
    my $log = Debbugs::Log->new(bug_num => $bug_num) or
        die "Unable to open bug log for $bug_num";
    my %seen_msgids;
    while (my $record = $log->read_record()) {
        next if $record->{type} eq 'html';
        next if $record->{type} eq 'autocheck';
        my ($msg_id) = record_regex($record,
                                    qr/^Message-Id:\s+<(.+)>/mi);
        next unless defined $msg_id;
        print STDERR "examining $msg_id: " if $DEBUG;
        if ($msg_id =~ /$config{email_domain}$/) {
            print STDERR "skipping\n" if $DEBUG;
            next;
        }
        if ($seen_msgids{$msg_id}) {
            print STDERR "already seen\n" if $DEBUG;
            next;
        }
        $seen_msgids{$msg_id}=1;
        $sub->($bug_num,$record,$msg_id);
        print STDERR "\n" if $DEBUG;
    }
}


__END__

# Local Variables:
# cperl-indent-level: 4
# indent-tabs-mode: nil
# End:
