///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// $Id: SpamFilter.h,v 1.25 2004/01/26 20:19:27 bburton Exp $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//

#ifndef _SpamFilter_h
#define _SpamFilter_h

#include "File.h"
#include "FrequencyDB.h"

class LockFile;
class TokenSelector;

/// Spam scoring front end built around a FrequencyDB.
///
/// The inline members in this header only store tuning values or forward
/// to the database; the scoring logic itself is declared here and defined
/// out of line.
class SpamFilter
{
public:
  SpamFilter();
  ~SpamFilter();

  /// Selects which group of scoring settings setScoreMode() applies.
  enum ScoreMode_t {
    SCORE_NORMAL,
    SCORE_ORIGINAL,
    SCORE_ALT1
  };

  /// Applies the good bias, minimum word count and spam threshold that
  /// belong to mode, then records mode as the current score mode.
  ///
  /// SCORE_ALT1 and SCORE_ORIGINAL use fixed thresholds (0.6 and 0.9);
  /// SCORE_NORMAL uses the value stored by setDefaultThreshold().  Any
  /// value previously stored with setGoodBias() is overwritten.
  void setScoreMode(ScoreMode_t mode)
  {
    switch (mode) {
    case SCORE_ALT1:
      m_goodBias = 1;
      m_minWordCount = 1;
      m_spamThreshold = 0.6;
      break;

    case SCORE_ORIGINAL:
      m_goodBias = 2;
      m_minWordCount = m_defaultMinWordCount;
      m_spamThreshold = 0.9;
      break;

    default:
      // Any value other than the two handled above must be SCORE_NORMAL.
      assert(mode == SCORE_NORMAL);
      m_goodBias = 2;
      m_minWordCount = m_defaultMinWordCount;
      m_spamThreshold = m_defaultThreshold;
      break;
    }

    m_scoreMode = mode;
  }

  /// Result of scoring one message: the numeric score, the mode it was
  /// computed under and the spam/good verdict.
  class Score
  {
  public:
    /// Stores the three values verbatim; no validation is performed.
    Score(double _score,
          ScoreMode_t _mode,
          bool is_spam)
      : m_score(_score),
        m_mode(_mode),
        m_isSpam(is_spam)
    {
    }

    /// Returns the numeric score.
    double getValue() const
    {
      return m_score;
    }

    /// Returns the score mode this score was computed under.
    ScoreMode_t getMode() const
    {
      return m_mode;
    }

    /// Returns the spam verdict supplied at construction.
    bool isSpam() const
    {
      return m_isSpam;
    }

    /// Returns the negation of isSpam().
    bool isGood() const
    {
      return !m_isSpam;
    }

    /// Returns true when the score lies in the extreme range for the
    /// expected classification.
    ///
    /// When should_be_spam is true the score must be at or above the upper
    /// cutoff; otherwise it must be at or below the lower cutoff.  The
    /// cutoffs are 0.85 / 0.15 for SCORE_ALT1 and 0.995 / 0.005 for the
    /// other modes.
    bool isConfident(bool should_be_spam) const
    {
      switch (m_mode) {
      case SCORE_ALT1:
        return should_be_spam ? (m_score >= 0.85) : (m_score <= 0.15);

      default:
        assert(m_mode == SCORE_ORIGINAL || m_mode == SCORE_NORMAL);
        return should_be_spam ? (m_score >= 0.995) : (m_score <= 0.005);
      }
    }

  private:
    double m_score;
    ScoreMode_t m_mode;
    bool m_isSpam;
  };

  /// Returns the mode most recently recorded by setScoreMode().
  ScoreMode_t getScoreMode() const
  {
    return m_scoreMode;
  }

  // Database open/close operations; defined out of line.
  void open(const File &shared_db_file,
            const File &private_db_file,
            bool read_only);

  void open(const File &db_file,
            bool read_only);

  void close(bool abandon_writes = false);

  /// Forwards to FrequencyDB::flush() on the filter's database.
  void flush()
  {
    m_db.flush();
  }

  /// Returns a pointer to the filter's own database member.  The pointer
  /// refers to a member of this object, so it must not outlive the filter.
  FrequencyDB *getDB()
  {
    return &m_db;
  }

  /// Returns true when the database does not contain msg.  The spam/good
  /// flag reported by containsMessage() is ignored.
  bool isNewMessage(const Message &msg)
  {
    bool is_spam;
    return !m_db.containsMessage(msg, is_spam);
  }

  // Message classification updates; defined out of line.
  void ensureGoodMessage(const Message &msg,
                         bool force_update);

  void ensureSpamMessage(const Message &msg,
                         bool force_update);

  /// Forwards to FrequencyDB::removeMessage() on the filter's database.
  void removeMessage(const Message &msg)
  {
    m_db.removeMessage(msg);
  }

  // Message scoring entry points; defined out of line.
  Score scoreMessage(Message &msg);

  Score scoreMessage(Message &msg,
                     ScoreMode_t mode);

  /// Returns current_score unchanged when it was computed under mode;
  /// otherwise scores msg again with scoreMessage(msg, mode).
  Score scoreMessageIfWrongMode(Score current_score,
                                Message &msg,
                                ScoreMode_t mode)
  {
    return (current_score.getMode() == mode) ? current_score : scoreMessage(msg, mode);
  }

  /// Scores msg with scoreMessage(msg) and returns its spam verdict.
  bool isSpam(Message &msg)
  {
    return scoreMessage(msg).isSpam();
  }

  /// Stores the good bias.  A later setScoreMode() call replaces it.
  void setGoodBias(int value)
  {
    m_goodBias = value;
  }

  void setTermsForScore(int value)
  {
    m_termsForScore = value;
  }

  void setMaxWordRepeats(int value)
  {
    m_maxWordRepeats = value;
  }

  /// Returns the minimum word count currently in effect.
  int getMinWordCount() const
  {
    return m_minWordCount;
  }

  /// Stores value both as the minimum word count currently in effect and
  /// as the default that setScoreMode() restores for SCORE_NORMAL and
  /// SCORE_ORIGINAL.
  void setMinWordCount(int value)
  {
    m_defaultMinWordCount = value;
    m_minWordCount = value;
  }

  void setNewWordScore(double value)
  {
    m_newWordScore = value;
  }

  void setExtendTopTerms(bool value)
  {
    m_extendTopTerms = value;
  }

  // Defined out of line.
  double scoreTerm(int good_count,
                   int spam_count,
                   int good_message_count,
                   int spam_message_count);

  /// Returns the spam threshold selected by the current score mode.
  double getSpamThreshold() const
  {
    return m_spamThreshold;
  }

  /// Stores the threshold used by SCORE_NORMAL and re-applies the current
  /// score mode so the new value takes effect.  Re-applying the mode also
  /// resets the good bias and minimum word count to that mode's values.
  void setDefaultThreshold(double value)
  {
    m_defaultThreshold = value;
    setScoreMode(m_scoreMode);
  }

  void setMinDistanceForScore(double value)
  {
    m_minDistanceForScore = value;
  }

  void setMinArraySize(int value)
  {
    m_minArraySize = value;
  }

  void setWaterCounts(bool value)
  {
    m_waterCounts = value;
  }

  /// Returns the flag stored by setWaterCounts().
  bool getWaterCounts() const
  {
    return m_waterCounts;
  }

  /// Appends selector to the list of token selectors.  Only the pointer is
  /// stored here.
  void addTokenSelector(TokenSelector *selector)
  {
    m_tokenSelectors.push_back(selector);
  }

  // Defined out of line.
  void clearTokenSelectors();

private:
  // Internal scoring helpers; all defined out of line.
  void scoreTokens(const Message &msg);
  double computeRatio(double count,
                      double total_count);
  void computeScoreProducts(Message &msg,
                            TokenSelector *selector,
                            double &spamness,
                            double &goodness,
                            double &num_terms);
  void scoreToken(Token *tok,
                  int good_message_count,
                  int spam_message_count);
  void getSortedTokens(const Message &msg,
                       TokenSelector *selector,
                       int max_tokens,
                       vector<Token *> &tokens);
  double scoreMessage(Message &msg,
                      TokenSelector *selector);

  double normalScoreMessage(Message &msg, TokenSelector *selector);
  double alt1ScoreMessage(Message &msg, TokenSelector *selector);
  double originalScoreMessage(Message &msg, TokenSelector *selector);
  void lock(const File &db_file,
            bool read_only);

private:
  /// Not implemented; declared private so the class cannot be copied.
  SpamFilter(const SpamFilter &);

  /// Not implemented; declared private so the class cannot be assigned.
  SpamFilter& operator=(const SpamFilter &);

private:
  FrequencyDB m_db;
  NewPtr<LockFile> m_lock;

  // settings related to scoring mode
  int m_goodBias;
  int m_minWordCount;
  int m_defaultMinWordCount;
  double m_spamThreshold;
  ScoreMode_t m_scoreMode;
  vector<TokenSelector *> m_tokenSelectors;

  // other settings
  int m_termsForScore;
  int m_maxWordRepeats;
  bool m_extendTopTerms;
  double m_newWordScore;
  double m_minDistanceForScore;
  int m_minArraySize;
  bool m_waterCounts;
  double m_defaultThreshold;
};

#endif // _SpamFilter_h
