#! /usr/bin/perl -w

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# th-dictd-to path/to/dictd/index
# ------------
# Description:
# ------------
#  This script reads a dictd index file and its corresponding .dict
#  (or .dict.dz) file and generates a Thwab file tree plus the related
#  headword index file.  An optional second argument gives the number of
#  leading index entries (in offset order) to skip:
#
#  th-dict-to-th path/to/dictd/some.index 6
#
# Then chdir into th-dict and edit the +/0 file to put comments and
# copyrights, set the dictionary name, etc.
# Remove the editor backup file "+/0~", if any.
#
# Then, from anywhere inside the th-dict subtree, call the "th-pack" script.
#
# Copyright © 2006, Muayyad Saleh Alsadi<alsadi@gmail.com>

use utf8;
use encoding 'utf8';
use open ':utf8'; # input and output default layer will be UTF-8
use POSIX;
# Name of the output directory that receives the generated Thwab tree.
my $prefix="th-dict";
# Scratch files created in the current directory: the decoded index and
# the (decompressed) dictionary contents.
my $tmp_i="th-dict.i";
my $tmp_c="th-dict.c";
# Command used to dump the contents file into $tmp_c.
my $cat="cat";
# Zero-padded width of chapter directory names and member file names.
my $digits=6;
# First argument: path of the dictd index file.
my $i_fn=$ARGV[0];
# Optional second argument: how many leading entries of the offset-sorted
# index to skip.  Defaults to 0 so a missing argument does not leave an
# undefined value in the numeric comparison that consumes it.
my $skip=defined $ARGV[1] ? $ARGV[1] : 0;
# Number of entries skipped so far.
my $skiped=0;
# Contents file name; starts out as a copy of the index file name.
my $c_fn=$i_fn;
# Value of each base64 digit as used by dictd index files:
# 'A'..'Z' => 0..25, 'a'..'z' => 26..51, '0'..'9' => 52..61,
# '+' => 62, '/' => 63.
# This must be a hash: the digit character is the key.
my %b64_index;
{
  my $value=0;
  foreach my $digit ('A'..'Z','a'..'z','0'..'9','+','/') {
    $b64_index{$digit}=$value++;
  }
}
# b64dec(STRING)
# Decode a number written in base64 digits, most significant digit
# first, by looking each character up in %b64_index.
# Returns the decoded integer; an empty string decodes to 0.
sub b64dec($)
{
  my ($encoded)=@_;
  my $value=0;
  # Fold the digits left to right: shift the accumulator by one base-64
  # place, then add the value of the current digit.
  foreach my $digit (split //, $encoded) {
    $value=$value*64+$b64_index{$digit};
  }
  return $value;
}
# ---------------------------------------------------------------------
# Main flow
#  1. locate the contents file that belongs to the index file
#  2. decode the index into "$tmp_i" as "offset<TAB>length<TAB>headword"
#  3. dump the contents into "$tmp_c"
#  4. walk the index sorted by offset and write one member file per
#     distinct offset under "$prefix", grouped into one chapter
#     directory per initial letter, while writing the headword index.
# ---------------------------------------------------------------------
defined $i_fn
  or die "usage: $0 path/to/dictd/some.index [entries-to-skip]\n";

# Derive the contents file name from the index file name.  The dot is
# escaped so only a literal ".index" suffix is replaced.
$c_fn=~s/\.index$/.dict/;
print "checking for [$c_fn]\n";
if (not -f $c_fn) {
  # Fall back to the compressed contents file, dumped with zcat.
  $c_fn=$c_fn.".dz";
  if (not -f $c_fn) {die "Could not found dictionary contents file"}
  $cat='zcat';
}

# Pass 1: rewrite the index with decimal offset and length first, so a
# numeric sort orders the entries by their position in the contents.
print "parsing index [$i_fn]\n";
open my $idx_in, '<', $i_fn or die "Could not open dictionary index file";
open my $idx_out, '>', $tmp_i or die "Could not create tmp index file [$tmp_i]";
while (my $line=<$idx_in>) {
  chomp $line;
  if ($line=~/([^\t]*)\t([^\t]*)\t([^\t]*)/) {
    my ($headword,$boff,$blen)=($1,$2,$3);
    my $off=b64dec($boff);
    my $len=b64dec($blen);
    print {$idx_out} "$off\t$len\t$headword\n";
  }
}
close $idx_in;
close $idx_out or die "Could not write tmp index file [$tmp_i]";

print "decompressing content ...\n";
system("$cat '$c_fn' >'$tmp_c'") == 0
  or die "Could not dump dictionary contents [$c_fn] into [$tmp_c]";
# Offsets and lengths in the index are byte counts, so the contents must
# be read as raw bytes.  The explicit :raw layer overrides the default
# UTF-8 layer, which would make the length count characters instead.
open my $content, '<:raw', $tmp_c
  or die "Could not open tmp content file [$tmp_c]";

print "sorting index ...\n";
open my $sorted, "sort -n '$tmp_i'|"
  or die "Could not open sorted tmp index file [$tmp_i]";

my %chaps;        # chapter key (upper-cased initial or "0") => chapter number
my %items;        # chapter key => number of the last member written
my $next_ch=1;    # number handed to the next new chapter
my $old_off=-1;   # offset of the previously written member
my $fn="";        # path of the previously written member

mkdir $prefix or die "Could not create Thwab directory";
mkdir $prefix."/+" or die "Could not create Thwab control + directory";
chdir $prefix or die "Could not enter Thwab directory [$prefix]";
open my $info, '>', "+/0" or die "Could not create Thwab info +/0 file";
open my $key_index, '>', "+/3"
  or die "Could not create Thwab key index +/3 file";
# NOTE(review): "digits" is written from $digits so it matches the width
# used for directory and file names below; presumably that is what the
# field describes -- confirm against the Thwab format.
print {$info} "thwab = 2250\n",
  "version = 1\n",
  "charset = UTF-8\n",
  "format = plain\n",
  "digits = $digits\n",
  "title = Dictionary Title\n",
  "subtitle = Dictionary Subtitle\n",
  "lang = en\n",
  "classification = Lexical:Dictionaries\n",
  "key = headword\n",
  "author = Author:short:full\n_\n";

# Pass 2: one member file per distinct offset.
while (my $line=<$sorted>) {
  # Skip the requested number of leading entries.
  if (defined $skip and $skiped<$skip) {$skiped=$skiped+1; next;}
  chomp $line;
  next unless $line=~/([^\t]*)\t([^\t]*)\t([^\t]*)/;
  my ($off,$len,$headword)=($1,$2,$3);

  # Another headword for the member just written: only add an index
  # line that points at the same file.
  if ($old_off == $off) {
    print {$key_index} "$headword\t$fn\n";
    next;
  }

  # Fetch exactly $len bytes starting at byte $off.
  my $body='';
  seek $content, $off, SEEK_SET
    or die "Could not seek to offset $off in tmp content file [$tmp_c]";
  my $got=read $content, $body, $len;
  defined $got or die "Could not read tmp content file [$tmp_c]";
  warn "short read for [$headword]: wanted $len bytes, got $got\n"
    if $got != $len;

  # Chapter key: first word character of the headword, upper-cased;
  # anything that is not alphabetic goes to chapter "0".
  my $ch="0";
  if ($headword=~/\W*(\w)/) {
    $ch=uc $1;
    if ($ch=~/[^\p{alpha}]/) {$ch="0"}
  }

  my $is_new_chapter=not exists $chaps{$ch};
  if ($is_new_chapter) {
    $chaps{$ch}=$next_ch;
    $next_ch=$next_ch+1;
    $items{$ch}=0;
  } else {
    $items{$ch}=$items{$ch}+1;
  }
  my $dn=sprintf "%0${digits}d", $chaps{$ch};
  if ($is_new_chapter) {
    mkdir $dn or die "Could not create Thwab chapter directory [$dn]";
    print "new [$ch] chapter as [$dn]\n";
  }
  $fn=sprintf "%s/%0${digits}d", $dn, $items{$ch};

  # The body is a byte string, so write it back without an encoding layer.
  open my $member, '>:raw', $fn
    or die "Could not create Thwab memver file [$fn]";
  print {$member} $body;
  close $member or die "Could not write Thwab member file [$fn]";

  print {$key_index} "$headword\t$fn\n";
  $old_off=$off;
}
close $sorted;
close $content;
close $info or die "Could not write Thwab info +/0 file";
close $key_index or die "Could not write Thwab key index +/3 file";
