#! /usr/bin/perl -w

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# th-dictd-to-th path/to/dictd/index [digits [skip]]
# ------------
# Description:
# ------------
#  This script reads a dictd index file and its corresponding .dict
#  (or .dict.dz) file and generates a Thwab file tree together with the
#  related headword index file.
#  An optional second argument sets the number of digits (default 6).
#  An optional third argument is the number of headwords to be skipped.
#  For example:
#  		th-dictd-to-th path/to/dictd/some.index 6 0
#
# Then chdir to th-dict and edit the +/0 file to fill in the comments,
# copyrights, dictionary name, etc.
# Remove the editor backup file "+/0~", if any.
#
# Then, from anywhere inside the th-dict subtree, call the "th-pack" script.
#
# Copyright © 2006, Muayyad Saleh Alsadi<alsadi@gmail.com>
use utf8;
use encoding 'utf8';
use open ':utf8'; # input and output default layer will be UTF-8
use POSIX;
# ---------------------------------------------------------------------
# Conversion state, command-line handling and creation of the empty
# Thwab tree (th-dict/ with its "+" control directory).
# ---------------------------------------------------------------------
my $prefix="th-dict";   # output tree, created in the current directory
my $digits=6;           # zero-padded width of chapter and item names
my $next_ch=1;          # number the next new chapter will receive
my $ch_n=$next_ch;
my $itm_n=0;
my %chaps;              # chapter letter => chapter number
my %items;              # chapter letter => index of its latest item
my $ch="";
my $itm="";

my $i_fn;               # dictd index file given on the command line
my $skip=0;             # number of leading index lines to ignore

die "Need at least one argument" unless @ARGV;
$i_fn   = $ARGV[0];
$digits = $ARGV[1] if @ARGV > 1;
$skip   = $ARGV[2] if @ARGV > 2;

# $digits is spliced into a sprintf format and $skip is compared
# numerically, so reject anything that is not a plain integer.
unless ($digits =~ /\A[0-9]+\z/ and $digits > 0) {
  die "Number of digits must be a positive integer, got [$digits]\n";
}
unless ($skip =~ /\A[0-9]+\z/) {
  die "Number of headwords to skip must be a non-negative integer, got [$skip]\n";
}

# The script chdir()s into the output tree below; a relative index path
# would no longer resolve after that, so make it absolute first.
require File::Spec;
$i_fn = File::Spec->rel2abs($i_fn);

if (-d $prefix) {print "type 'rm -R $prefix'\n"; exit -1}
mkdir($prefix)
  or die "Could not create Thwab directory [$prefix]: $!";
mkdir("$prefix/+")
  or die "Could not create Thwab control + directory: $!";
chdir($prefix)
  or die "Could not enter Thwab directory [$prefix]: $!";
open(I, '>', '+/0')
  or die "Could not create Thwab info +/0 file: $!";
# IX stays open: the headword index is appended to while entries are added.
open(IX, '>', '+/3')
  or die "Could not create Thwab key index +/3 file: $!";

# Template info file; the user is expected to edit it afterwards.
print I "thwab = dict-SOMETHING-XX-YY\n",
	"version = 1\n",
	"charset = UTF-8\n",
	"format = plain\n",
	"digits = $digits\n",
	"title = Dictionary Title\n",
	"subtitle = Dictionary Subtitle\n",
	"lang = en\n",
	"classification = Lexical:Dictionaries\n",
	"key = Headword\n",
	"author = Author:SHORT:FULL:::2006:MY\@EMAIL.COM:www.MY-WEBSITE.org:Some Country,Translator:SHORT:FULL:::2006:MY\@EMAIL.COM:www.MY-WEBSITE.org:Some Country,Translator:SHORT:FULL:::2006:MY\@EMAIL.COM:www.MY-WEBSITE.org:Some Country\n",
	"computerized = alsadi:Muayyad Saleh Al-Sadi:::2006:alsadi\@gmail.com:www.cltb.net\n",
	"_\nCOMMENTS HERE\n\n",
	"In Thwab viewer, use \"Search for Key (Headword)\"\ntool to look-up.\n\nCopyright © COPYRIGHTS HERE\n\n",
	"Converted to Thwab iTar format using \"th-dictd-to-th\" script\n",
	"Written by Muayyad Saleh Al-Sadi<alsadi\@gmail.com>\n";
close(I)
  or die "Could not finish writing Thwab info +/0 file: $!";
# add_to_th($id, $str)
# --------------------
# Store one dictionary entry in the Thwab tree and record it in the
# headword index.
#
#   $id  - headword of the entry
#   $str - body text of the entry
#
# The entry is filed under a "chapter" chosen from the first word
# character of $id, upper-cased; anything that is not alphabetic goes to
# chapter "0".  Chapters are numbered in order of first appearance
# (%chaps, $next_ch) and items inside a chapter are numbered from 0
# (%items); both numbers are zero-padded to $digits.  The member file
# holds the headword, an empty line and the body.  A line
# "headword<TAB>chapter/item" is appended to the IX handle.
#
# Returns the member file name, relative to the root of the tree.
# Dies if the chapter directory or the member file cannot be written.
sub add_to_th($$){
  my ($id, $str) = @_;

  # Pick the chapter key.
  my $chapter = "0";
  if ($id =~ /\W*(\w)/) {
    my $first = uc $1;
    $chapter = $first unless $first =~ /[^\p{alpha}]/;
  }

  # Allocate the chapter number / next item number.
  my $is_new = !exists $chaps{$chapter};
  if ($is_new) {
    $chaps{$chapter} = $next_ch;
    $next_ch = $next_ch + 1;
    $items{$chapter} = 0;
  } else {
    $items{$chapter} = $items{$chapter} + 1;
  }

  my $dn = sprintf "%0${digits}d", $chaps{$chapter};
  if ($is_new) {
    (-d $dn or mkdir($dn))
      or die "Could not create Thwab chapter directory [$dn]: $!";
    print "new [$chapter] chapter as [$dn]\n";
  }

  my $fn = $dn . sprintf("/%0${digits}d", $items{$chapter});
  open(my $member, '>', $fn)
    or die "Could not create Thwab member file [$fn]: $!";
  printf {$member} "%s\n\n%s\n", $id, $str;
  # Buffered write errors only show up at close time.
  close($member)
    or die "Could not write Thwab member file [$fn]: $!";

  # The index is one "key<TAB>file" record per line, so the key must not
  # contain newlines or tabs, and is trimmed at both ends.  (A single
  # s/^\s*(.*)\s*$/$1/ would leave trailing blanks: the greedy .* eats them.)
  $id =~ s/\n/_/g;
  $id =~ s/\t/  /g;
  $id =~ s/\A\s+//;
  $id =~ s/\s+\z//;
  printf IX "%s\t%s\n", $id, $fn;
  return $fn;
}
# Scratch-file paths (written with "../", i.e. beside the output tree the
# script has chdir'ed into), the command used to dump the dictionary
# content, and the count of index lines skipped so far.
my ($tmp_i, $tmp_c) = ("../th-dict.i", "../th-dict.c");
my ($cat, $skiped)  = ("cat", 0);
# Starts out as the index path; the suffix is replaced further down.
my $c_fn = $i_fn;
# Value of each digit of the base64 alphabet:
#   A-Z => 0..25, a-z => 26..51, 0-9 => 52..61, '+' => 62, '/' => 63
my %b64_index;
@b64_index{ 'A'..'Z', 'a'..'z', '0'..'9', '+', '/' } = (0 .. 63);

# b64dec($b64)
# ------------
# Decode a number written in base-64 positional notation (most
# significant digit first), which is how the offset and length columns of
# the index are passed in by the caller.
#
#   $b64 - string of base64 digits; the empty string decodes to 0
#
# Returns the decoded non-negative integer.
# Dies if $b64 contains a character outside the base64 alphabet, instead
# of silently counting it as 0 and producing a bogus offset or length.
sub b64dec($)
{
  my $b64 = $_[0];
  my $value = 0;
  for my $digit (split //, $b64) {
    exists $b64_index{$digit}
      or die "Invalid base64 digit [$digit] in [$b64]\n";
    $value = $value * 64 + $b64_index{$digit};
  }
  return $value;
}
# ---------------------------------------------------------------------
# Main conversion.
#  1. Locate the content file next to the index (.dict, else .dict.dz)
#     and dump it, decompressed if needed, into a scratch file so that it
#     can be seek()ed.
#  2. For every "headword<TAB>offset<TAB>length" index line, cut the
#     entry out of the content and hand it to add_to_th(); headwords that
#     point at an offset already converted only get an extra index line.
#  3. Report whether the chosen number of digits fits the largest chapter.
# ---------------------------------------------------------------------
$c_fn=~s/\.index$/.dict/;
print "checking for [$c_fn]\n";
if (not -f $c_fn) {
  $c_fn=$c_fn.".dz";
  if (not -f $c_fn) {die "Could not found dictionary contents file"}
  $cat='zcat';
}
print "decompressing content ...\n";
{
  # The path goes through the shell inside single quotes, so any single
  # quote in it has to be escaped.
  (my $quoted_c_fn = $c_fn) =~ s/'/'\\''/g;
  my $status = system "$cat '$quoted_c_fn' >'$tmp_c'";
  if ($status != 0) {
    my $problem = "[$cat] failed on [$c_fn] (wait status $status)\n";
    # gzip-style tools exit with 2 for mere warnings; anything else is fatal.
    if (($status >> 8) == 2) { warn $problem } else { die $problem }
  }
}
# Offsets and lengths in the index count BYTES, so the content must be
# read through a raw handle: under the file's default :utf8 layer read()
# would count characters and run into the following entries.
open(my $dict_fh, '<:raw', $tmp_c)
  or die "Could not open tmp content file [$tmp_c]: $!";
print "parsing index [$i_fn]\n";
open(my $index_fh, '<', $i_fn)
  or die "Could not open dictionary index file [$i_fn]: $!";
my %offests;   # content offset => member file already written for it
while (my $line = <$index_fh>) {
  chomp $line;
  if ($skiped<$skip) {$skiped=$skiped+1; next;}
  next unless $line =~ /([^\t]*)\t([^\t]*)\t([^\t]*)/;
  my ($t, $boff, $blen) = ($1, $2, $3);
  my $off = b64dec($boff);
  my $len = b64dec($blen);
  if (exists $offests{$off}) {
    # Another headword for an entry that is already in the tree.
    print "offset for [$t] found before\n";
    printf IX "%s\t%s\n",$t,$offests{$off};
    next;
  }
  if ($t=~/^00-database/) {print "skiping [$t]..\n";next}
  my $c = "";
  seek($dict_fh, $off, SEEK_SET)
    or die "Could not seek to offset $off for [$t]: $!";
  my $got = read($dict_fh, $c, $len);
  defined $got
    or die "Could not read entry [$t] from [$tmp_c]: $!";
  warn "entry [$t] is truncated ($got of $len bytes)\n" if $got < $len;
  # Turn the raw bytes into characters; the output handles encode UTF-8.
  utf8::decode($c)
    or warn "entry [$t] is not valid UTF-8\n";
  $c =~ s/^(\S.*\n)+//m; # remove title as it's added by add_to_th
  $offests{$off} = add_to_th($t,$c);
}
close $index_fh;
close $dict_fh;
unlink($tmp_c)
  or warn "Could not remove tmp content file [$tmp_c]: $!\n";
close(IX)
  or die "Could not finish writing Thwab key index +/3 file: $!";

# Number of digits needed for the highest item number of any chapter
# (never computed from less than 2).
my $j=2;
for my $last_item (values %items) {
  $j = $last_item if $last_item > $j;
}
my $k=length(sprintf("%d",$j));
if ($digits != $k) {
  printf "digits should be [%d]\nYou may like to rerun it  passing %d as argument\n",$k,$k
}
