#!/usr/bin/env python
# -*- coding: latin-1 -*-
#
# Time-stamp: <2004-06-27 19:42:08 graham>
#
#  COPYRIGHT
# Togaware 2004 All rights are reserved.
#
# Authors: Graham Williams
#
# TODO
#	Do full directory listing of CTAN server rather than multiple hits
#		Use ftp://ftp.dante.de/tex-archive/FILES.byname
#	Supply name of ctan host on command line - note ftp and http differ
#	Command line option -v for verbose.
#

"""Identify any new docs associated with a package on CTAN."""

########################################################################
# IMPORTS
#
import getopt,os,re,shutil,sys,xml.dom,time
import urllib2
import shutil
from xml.dom.ext import PrettyPrint

from tcutils import commify, list_packages, loadXML

########################################################################
# SYSTEM VARIABLES
#
# Revision number, taken from the expanded CVS Revision keyword.
__version__ = "$Revision: 1.8 $".split()[1]
# Date and time fields of the expanded CVS Date keyword (two-item list).
__verdate__ = "$Date: 2004/06/27 09:42:24 $".split()[1:3]
# Repository path of this file, taken from the CVS Source keyword.
__source__  = (
    "$Source: /cvsroot/texcatalogue/texcatalogue/src/newdocs.py,v $".split()[1]
)
# Program name: the RCSfile keyword value with the ".py,v" suffix removed.
__program__ = re.compile('^.RCSfile: (.*).py,v .').sub(
    "\\1", "$RCSfile: newdocs.py,v $")

########################################################################
#
# CONSTANTS
#
# When True, find_docs prints every candidate path it checks and each hit.
verbose = False
#
# CTAN mirror to use.  Mirrors considered earlier and why they were dropped:
#
#   http://mirror.aarnet.edu.au/pub/CTAN/
#       aarnet is out of date - request them to kickstart their mirror
#   ftp://ctan.unsw.edu.au/tex-archive/
#       Don't use UNSW - after a number of connections it starts
#       refusing connection
#
# Dante is a bit far away, but okay for FILES.byname (5MB)
#
CTAN_host = "ftp://ftp.dante.de/tex-archive/"

########################################################################
#
# CHECK FOR VARIOUS DOCS 
#

# def check_ctan_doc(url):
#     """Check whether the URL specified exists"""
#     try:
#         site = urllib2.urlopen(url)
#     except Exception, e:
#         #
#         # Really need to check the Exceptions, but need some documentation!
#         # Want to capture errors other than "not found" and exit!
#         # Other errors include Connection refused.
#         #
#         if "HTTP Error 404: Not Found" in str(e): # http to aarnet
#             return False
#         elif ": No such file or directory." in str(e): # ftp unsw
#             return False
#         else:
#             print e
#             sys.exit(1)
#     return True

def check_ctan_doc(path, ctan_files):
    """Return True if `path` is a member of `ctan_files`, otherwise False.

    `ctan_files` is any container supporting the `in` operator; the caller
    passes the collection of file paths read from the CTAN listing.
    """
    if path in ctan_files:
        return True
    return False

def xml_doc_contains(xml_doc_elements, fname):
    """Report whether any element already links to the CTAN file `fname`.

    xml_doc_elements -- iterable of DOM elements providing
                        getAttribute("href").
    fname            -- CTAN path without a leading character, e.g.
                        "macros/latex/foo/README".

    An element matches when its href has the form "ctan:" + one character
    (normally "/") + fname.  Returns True on the first match, else False.
    """
    for element in xml_doc_elements:
        # Split on the first colon only: an href with no colon at all
        # (e.g. just "ctan") must not raise, and a colon inside the path
        # must not truncate the part that gets compared.
        parts = element.getAttribute("href").split(":", 1)
        if len(parts) != 2:
            continue
        scheme, target = parts
        # target[1:] drops the single character that follows "ctan:".
        if scheme == "ctan" and target[1:] == fname:
            return True
    return False

def add_new_doc(xml_file, href):
    """Add DOCUMENTATION element to just before CTAN element.

    xml_file -- DOM document; it is modified in place.
    href     -- value for the new element's 'href' attribute.

    The new <documentation> element is inserted immediately before the
    first <ctan> element in the document.  If the document has no <ctan>
    element, an error message is written to standard error and the
    program exits with status 1 (SystemExit is raised).
    """
    ctan = xml_file.getElementsByTagName('ctan')
    if not ctan:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("ERROR: No ctan element. Can't add new documentation.")
    newdoc = xml_file.createElement("documentation")
    newdoc.setAttribute('href', href)
    # Insert through the ctan element's own parent: insertBefore requires
    # the reference node to be a direct child of the node it is called on.
    target = ctan[0]
    target.parentNode.insertBefore(newdoc, target)
        
def find_docs(packages, path = "./entries"):
    # print "Reading FILES.byname...",
    # We could read from a fresh download here, but it might be slow.
    # site = urllib2.urlopen("ftp://ftp.dante.de/tex-archive/FILES.byname")
    # So download the file separately and just read it locally.
    site = open("FILES.byname")
    ctan_files = site.readlines()
    ctan_files = map(lambda x: re.sub('^.*\|.*\| ', '', x.strip()), ctan_files)
    # Textures has a (R) character so remove it.
    ctan_files = map(lambda x: re.sub('\256', '', x), ctan_files)
    # print "done"
    print "CTAN contains %s files." % commify(len(ctan_files))
    progress = ""
    for p in packages:
        #
        # Progress meter!
        #
        if p[0] <> progress:
            print p[0],
            sys.stdout.flush()
            progress = p[0]
        #
        # Read in the XML file.
        #
        doc = loadXML(p)
        if not doc: continue
        #
        # Top level, first (and should be only) 'entry' element.
        #
        entry = doc.getElementsByTagName('entry')[0]
        #
        # Second level elements, as NodeLists
        #
        #
        # Other miscellaneous elements, as NodeLists
        #
        documentation = doc.getElementsByTagName('documentation')
        ctan = doc.getElementsByTagName('ctan')
        #
        # If CTAN, check if documentation exists on CTAN, and that
        # it is in the entry.  Create new element if it is not present
        # in entry.
        #
        # TODO: Move this into a function?
        #
        doc_changed = False
        if ctan:
            location = ctan[0].getAttribute('path')
            if location[0] != '/': location = '/' + location
            #
            # Check for various files on CTAN
            #
            for ftype in ("README", "README.", "readme", "readme.",
                          ".pdf", ".ps", ".html", ".dvi"):
                if ftype[0] == ".":
                    doc_fname = p + ftype
                elif ftype[-1] == ".":
                    doc_fname = ftype + p
                else:
                    doc_fname = ftype
                # ctan_path = CTAN_host + location + "/" + doc_fname
                # if check_ctan_doc(ctan_path):
                ctan_path = location[1:] + "/" + doc_fname
                if verbose: print ctan_path 
                if check_ctan_doc(ctan_path, ctan_files):
                    if verbose: print "%s found for %s" % (ftype, p)
                    sys.stdout.flush()
                    if not xml_doc_contains(documentation, ctan_path):
                        print "%s: Add new ctan %s documentation." %\
                              (p, ftype)
                        doc_changed = True
                        add_new_doc(doc, 'ctan:/' + ctan_path)
        if doc_changed:
            xmlfname = path + "/" + p[0] + "/" + p + ".xml"
            shutil.copy(xmlfname, xmlfname + ".bak")
            xmlfile = open(xmlfname, "w")
            # This loses <!DOCTYPE entry SYSTEM "../catalogue.dtd">
            # Add back in using doctypes.py
            PrettyPrint(doc, xmlfile)
            xmlfile.close()
            # sys.exit(1) # To exit after the first update - for testing
            
###################################################################
# MAIN PROGRAM
#
def _usage():
    """Write this module's docstring to standard output as usage help."""
    print(__doc__)

def _main():
    """Report the catalogue size, then look for new docs for every entry."""
    catalogue = list_packages()
    print("The catalogue contains %s entries." % commify(len(catalogue)))
    # For a quick test run, substitute a short list here, for example
    # ['a2ac', 'a0poster'], ['a2ac', 'a0poster', 'a4', 'a2ping'] or
    # ['pict2e'].
    find_docs(catalogue)

########################################################################
#
# INTERACTIVE
#
# Test to see if we are running as a main program, rather than being
# imported as a library. If it's a main program, then let's do some
# work. Otherwise let's quietly proceed.
#
if __name__ == "__main__":
    _main()
