#!/usr/bin/env python3
"""This script fetches from
http://freedict.svn.sf.net/svnroot/freedict/trunk/freedict-database.xml
all the available databases. It accepts the following options:

    -dc    generate debian/control and debian/copyright
           This may take some time, since the copyright information is parsed
           out of the TEI XML source file(s) (if licensecheck fails, which it
                   does often).
    -f    fetch new source from SourceForge and save it into the file
          ../freedict_VER.orig.tar.gz (where VER is date in format yyyy.mm.dd)
          -x when -f is given, -x can be used to exclude some dictionaries
          from the import. The dictionary names must be separated by spaces
          and passed as one single (quoted or escaped) argument.
          For instance -x 'xxx-xxx yyy-yyy'
    -x    fetch new xml freedict database
    -na <database>   output name and author of <database> (e. g. lat-deu)
    -a    perform all actions (except fetching new xml database)

IMPORTANT: you must run this script from the root of the freedict source,
otherwise the operations will fail.

Dependencies:
    tar, unzip, svn

NOTE: this program requires Python 3; legacy code kept for supporting older
versions will be removed.

NOTE[2]: This code has been written with the strategy pattern in mind.
"""

import collections, re
import os, sys, subprocess
import datetime, codecs
import xml.etree.ElementTree as ET
import urllib.request

# Location of the FreeDict database description; get_xml_content() downloads
# it from here when a fresh copy is needed.
XML_URL = 'http://freedict.svn.sf.net/svnroot/freedict/trunk/freedict-database.xml'
# Path of an ISO 639-3 language code table below debian/.
# NOTE(review): not referenced in the visible part of this file;
# dictionarycod2longdescription() scans debian/ for "iso-*" files instead.
# Confirm whether this constant is still needed.
LANGCODE_TABLE = "debian/iso-639-3_20130123.tab"

def dictionarycod2longdescription(string):
    """
    Translate an ISO 639 dictionary code into its long form.

    For instance ``lat-deu`` becomes ``Latin-German``. The language names are
    read from the tab-separated table in the ``debian`` directory (relative to
    the current working directory) whose file name starts with ``iso-``; if
    several files match, the last one reported by ``os.listdir`` is used.

    Arguments:
        string: dictionary code of the form ``<source>-<target>``.

    Returns:
        Both language names joined by a hyphen.

    Raises:
        FileNotFoundError: ``debian`` contains no ``iso-*`` table.
        KeyError: a language code is not listed in the table.
        IndexError: ``string`` does not contain a hyphen.
    """
    tables = [item for item in os.listdir('debian') if item.startswith('iso-')]
    if not tables:
        raise FileNotFoundError(
                "no ISO 639 table (iso-*) found in the debian directory")
    with codecs.open(os.path.join('debian', tables[-1]), 'r', 'utf-8') as table:
        rows = table.read().split('\n')

    langtbl = {}
    for row in rows[1:]: # the first row is the column header
        columns = row.split('\t')
        if(len(columns) < 7):
            # blank line (e. g. after the final newline) or truncated row
            continue
        langtbl[columns[0]] = columns[6]
        if(columns[1] != ''): # some have two language code, add it:
            langtbl[columns[1]] = columns[6]
    codes = string.split('-')
    return langtbl[codes[0]] + '-' + langtbl[codes[1]]

def get_xml_content(fetch_new=False):
    """
    Return the content of the FreeDict database XML as a string.

    The cached copy debian/freedict-database.xml is used when it exists and is
    not older than the date encoded in the name of the newest
    ../freedict_VER.orig.tar.gz. Otherwise (or when fetch_new is True) the
    database is downloaded from XML_URL and the cached copy is rewritten.
    If no such tar archive exists, there is nothing to compare against and an
    existing cached copy is used as is.

    Arguments:
        fetch_new: force a download, even if the cached copy is recent.

    Returns:
        The XML document, decoded as UTF-8.

    Raises:
        IndexError, ValueError: the newest tar archive name does not follow
            the freedict_yyyy.mm.dd.orig.tar.gz scheme.
    """
    xml_path = 'debian/freedict-database.xml'
    if(not os.path.exists(xml_path)):
        # nothing cached, the only source is the web
        fetch_new = True
    elif(not fetch_new):
        # modification time of last xml:
        modified = datetime.datetime.fromtimestamp(os.path.getmtime(xml_path))
        # latest freedict version:
        names = sorted(e for e in os.listdir('..')
                if e.startswith('freedict') and e.endswith('.orig.tar.gz'))
        if(names):
            version = names[-1].split('_')[1]
            year, month, day = [int(e) for e in version.split('.')[:3]]
            fetch_new = modified < datetime.datetime(year, month, day)

    if(fetch_new):
        with urllib.request.urlopen(XML_URL) as response:
            data = response.read().decode('utf-8')
        # save back, so that the next run can use the cached copy
        with codecs.open(xml_path, 'w', 'utf-8') as f:
            f.write(data)
    else: # read from file
        with codecs.open(xml_path, 'r', 'utf-8') as f:
            data = f.read()
    return data

def find_license(dict):
    """Find out license of dictionary.

    First run licensecheck, afterwards use a self-brewed license checker which
    scans the TEI file up to its <body> element. Currently only GPL is
    detected, else FIXME is output. It tries to output something like GPL,
    GPL-2, GPL-3, GPL-2+, GPL-3+.

    Arguments:
        dict: dictionary code (e. g. lat-deu); the file <dict>/<dict>.tei is
            inspected, except for eng-hun and hun-eng, which share
            hun-eng/hun-eng.tei.header.

    Returns:
        The license identifier as a string or 'FIXME'.

    Raises:
        OSError: the TEI file cannot be opened.
    """
    if(dict == 'eng-hun' or dict == 'hun-eng'):
        fn = 'hun-eng' + os.sep + 'hun-eng.tei.header'
    else:
        fn = dict + os.sep + dict + '.tei'

    # first attempt: licensecheck; a missing binary is treated like any other
    # failure of licensecheck, the self-brewed checker below takes over
    try:
        proc = subprocess.Popen(['licensecheck', '-m', fn],
                stdout=subprocess.PIPE)
        output = proc.communicate()[0].decode('utf-8', 'replace')
    except OSError:
        output = ''
    fields = output.split('\t')
    if(len(fields) > 1):
        # match "GPL (v2 or later)"; "or later" is a mandatory part of the
        # pattern, hence a match always yields the "+" variant
        sre = re.search(r'gpl \(v(\d).*(or later).*', fields[1].lower())
        if(sre):
            return 'GPL-%s+' % sre.group(1)

    # second attempt: look for license statements in front of <body>
    license = 'FIXME'
    lastline = ''
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            line = line.lower()
            body_start = line.find('<body')
            if(body_start >= 0): # dictionary entries start, ignore the rest
                line = line[ : body_start]
            # statements are often wrapped, so consider the previous line, too
            longline = lastline + line
            if('gpl' in line or 'gnu general public lic' in line):
                # each line mentioning the GPL restarts the detection
                license = 'GPL'
                res = re.search(r'(version|ver\.) (\d)', longline)
                if(res):
                    license += '-' + res.group(2)
            if(license.startswith('GPL') and not license.endswith('+')):
                if(re.search('.*later.*version', longline)
                        or 'or later' in longline
                        or 'and any later' in longline):
                    license += '+'
            lastline = line
            if(body_start >= 0):
                break
    return license

class generate_control_copyright():
    """Strategy which generates debian/control and debian/copyright.

    The dictionary data is collected from the children of the given XML root
    when the object is created; write_data() writes both files. All paths are
    relative to the current working directory.
    """
    def __init__(self, root):
        # maps dictionary name -> (headwords, edition, status, maintainerName)
        self.__dictionaries = {}
        self.root = root
        self.parse_data()

    def parse_data(self):
        """Iterate over XML tree to collect dictionary data.

        Raises KeyError if a child lacks one of the attributes name,
        headwords, edition, status or maintainerName."""
        for child in self.root:
            attr = child.attrib # shortcut
            self.__dictionaries[attr['name']] = (attr['headwords'],
                    attr['edition'], attr['status'], attr['maintainerName'])

    def write_data(self):
        """Sort the dictionaries and write debian/control and
        debian/copyright."""
        self.sort_dictionaries()
        self.write_control()
        self.write_copyright()

    def _wrap_description(self, text):
        """Break text at spaces into pieces of at most 80 characters, each
        starting with a space; the last piece ends with an empty line.
        Returns the list of pieces."""
        pieces = []
        tmp = ' '
        for piece in text.split(' '):
            if(len(tmp + piece) <= 80):
                tmp += piece + ' '
            else:
                pieces.append(tmp[:-1] + '\n')
                tmp = ' ' + piece + ' '
        pieces.append(tmp + '\n\n')
        return pieces

    def write_control(self):
        """Generate debian/control from debian/control.HEAD and the gathered
        dictionary data."""
        with codecs.open('debian/control.HEAD', 'r', 'utf-8') as f:
            HEAD = f.read() + '\n'
        string = [HEAD]
        # the meta-package depends on every dictionary package
        string.append('Package: dict-freedict-all\n'
                'Architecture: all\n'
                'Depends: ${misc:Depends}, ' + 'dict-freedict-'
                + ', dict-freedict-'.join(self.__dictionaries.keys()))
        string += ["\nDescription: meta-package to install or dictionary databases from the FreeDict project\n",
                " This package can be used to install all available bilingual dictionaries from\n"
                " the FreeDict project at once.",
                "\n\n"]

        for dict, content in self.__dictionaries.items():
            # look up the long name once, it is needed twice below
            longname = dictionarycod2longdescription(dict)
            string.append('Package: dict-freedict-%s\n' % dict)
            string.append('Architecture: all\n'
                    'Depends: ${misc:Depends}\n'
                    'Suggests: dictd | dicod, dict | opendict | kdict | gnome-dictionary\n'
                    '        | goldendict\n'
                    'Provides: dictd-dictionary\n')
            if(content[2].lower() == 'unknown'):
                status = ''
            else:
                status = ' (FreeDict status: %s)' % content[2]
            string.append('Description: %s dictionary for the dict server/client'
                    % longname)
            # the leading line break ends the short description line above
            longstr = ('\n This is the %s dictionary from the FreeDict project in version %s%s. It contains %s headwords. It can be used for the dict server in conjunction with a dict client.'
                    % (longname, content[1], status, content[0]))
            # bring the description to 80 chars each line
            string += self._wrap_description(longstr)
        with codecs.open('debian/control', 'w', 'utf-8') as f:
            f.write(''.join(string))

    def write_copyright(self):
        """Generate debian/copyright from the files HEAD and TAIL in
        debian/copyright.snippets and the gathered dictionary data. A file in
        that directory named like a dictionary replaces the generated
        paragraph of that dictionary."""
        cprght_snippets = 'debian' + os.sep + 'copyright.snippets' + os.sep
        with codecs.open(cprght_snippets + 'HEAD', 'r', 'utf-8') as f:
            HEAD = f.read() + '\n'
        string = [HEAD]
        for dict in self.__dictionaries:
            # does an exception exist?
            if(os.path.exists(cprght_snippets + dict)):
                with codecs.open(cprght_snippets + dict, 'r', 'utf-8') as f:
                    string.append('\n' + f.read())
            else:
                string.append('\nFiles: %s/*\n' % dict)
                string.append('Copyright: 2000-2014 FreeDict contributors\n')
                string.append('License: ' + find_license(dict) + '\n')
        # read TAIL as UTF-8 like all other snippets, not in the locale's
        # encoding
        with codecs.open(cprght_snippets + 'TAIL', 'r', 'utf-8') as f:
            string.append('\n\n' + f.read())
        with codecs.open('debian/copyright', 'w', 'utf-8') as f:
            f.write(''.join(string))

    def sort_dictionaries(self):
        """
        Overwrite the self.__dictionaries-dictionary with a sorted
        collections.OrderedDict. We cannot expect to find ordered data in the
        XML, so we should sort on our own, afterwards.
        """
        self.__dictionaries = collections.OrderedDict(
                (key, self.__dictionaries[key])
                for key in sorted(self.__dictionaries))


class fetch_source():
    """Strategy which downloads all upstream source releases and packs them
    into ../freedict_VER.orig.tar.gz (VER is the current date, yyyy.mm.dd).

    Dictionaries can be excluded with "-x 'aaa-bbb ccc-ddd'" given as third
    and fourth command line argument. The external programs tar, unzip, svn
    and rm are called without a shell, so that file names taken from the
    downloaded XML cannot be interpreted as shell syntax.
    """
    def __init__(self, root):
        self.date = self.gen_date()
        self.dirname = 'freedict-%s.orig' % self.date
        self.root = root
        self.exclude_dictionaries = []
        # there's the -x option given
        if(len(sys.argv) == 4 and sys.argv[2] == '-x'):
            self.exclude_dictionaries = sys.argv[3].split(' ')

    def gen_date(self):
        """Return date in format "yyyy.mm.dd"."""
        return datetime.datetime.now().strftime('%Y.%m.%d')

    def prepare_environment(self):
        """
        Perform all actions which are needed before downloading the
        source: create the download directory and change into it.
        """
        os.mkdir(self.dirname)
        os.chdir(self.dirname)

    def clean_up(self):
        """
        Compress the original source, move it to the right destination and
        remove download directory. Exits with status 127 if tar fails.
        """
        tarname = self.dirname.replace('-', '_') + '.tar.gz'
        os.chdir('..')
        if(subprocess.call(['tar', 'czf', tarname, self.dirname]) != 0):
            print("Error while creating the tar archive.")
            sys.exit(127)
        print('Moving tar archive upward to "..".')
        os.rename(tarname, '..' + os.sep + tarname)
        subprocess.call(['rm', '-r', self.dirname])

    def _import_release(self, src_url):
        """Download the archive at src_url into the current directory, unpack
        it and delete the archive. Exits with status 127 if the archive
        format is unknown."""
        fn = self.get_dict_fn_from_url(src_url)
        print("Fetching", src_url)
        with urllib.request.urlopen(src_url) as response:
            data = response.read()
        with open(fn, 'wb') as archive:
            archive.write(data)
        print("Extracting", fn)
        if(fn.endswith('.zip')):
            subprocess.call(['unzip', '-qq', fn])
        elif(fn.endswith(('tar.gz', 'tar.bz2'))):
            subprocess.call(['tar', 'xf', fn])
        elif(fn.endswith('.bz2')):
            # the archive has no top-level directory; create one named after
            # the first seven characters of the file name (e. g. "lat-deu")
            os.mkdir(fn[:7])
            subprocess.call(['tar', 'xf', fn, '-C', fn[:7]])
        else:
            print('E: unknown format of "%s".' % fn)
            sys.exit(127) # this is an error, do not report success
        os.remove(fn)

    def write_data(self):
        """Download all upstream source packages."""
        self.prepare_environment()
        imported = 0
        for dictionary in self.root:
            name = dictionary.attrib['name']
            if(name in self.exclude_dictionaries):
                print("Skip %s (specified via commmand line)" % name)
                continue
            for release in dictionary:
                # only source releases with a download URL are of interest
                if(release.attrib.get('platform') != 'src'):
                    continue
                src_url = release.attrib.get('URL')
                if(src_url is None):
                    continue
                self._import_release(src_url)
                imported += 1
        print("Imported", str(imported), "dictionaries.")
        # fetch tools directory
        print("Fetching tools directory")
        ret = subprocess.call(['svn', '-q', 'export',
                'http://svn.code.sf.net/p/freedict/code/trunk/tools'])
        if(ret != 0):
            print("Error while fetching the tools directory.")
            sys.exit(127)
        self.clean_up()

    def get_dict_fn_from_url(self, url):
        """The SF URL's end on '?download' or '/download', just before those
        strings is the file name, so remove that and return the file name.
        Exits with status 127 if the URL has another format."""
        if(url.endswith('?download')):
            return url.split('/')[-1][:-len('?download')]
        elif(url.endswith('/download')):
            return url.split('/')[-2]
        else:
            print("Unknown URL Format:", url)
            sys.exit(127)

class name_and_author():
    """Strategy which prints long name and maintainer of one dictionary."""
    def __init__(self, tree):
        self.tree = tree

    def write_data(self):
        """Look up the dictionary named by sys.argv[2] in the XML tree and
        print() "<long name>;<maintainer>". Dictionaries which are up for
        grabs are reported as 'up for adoption'. If no dictionary was given
        or it is not in the tree, an error is printed and the program exits
        with status 127."""
        if(len(sys.argv) < 3):
            print("Error: you must specify a dictionary.")
            sys.exit(127)
        wanted = sys.argv[2]

        found = None
        for node in self.tree:
            attributes = node.attrib
            # only the first matching entry counts
            if(attributes['name'] == wanted and found is None):
                maintainer = attributes['maintainerName']
                if(maintainer == '[up for grabs]'):
                    maintainer = 'up for adoption'
                found = (dictionarycod2longdescription(attributes['name']),
                        maintainer)

        if(found is None):
            print("Error: database not found.")
            sys.exit(127)
        print(found[0] + ';' + found[1])

def clean_up_tree(root):
    """Iterate over XML tree and delete those <dictionary/>-nodes which have no
    release, i. e. no child elements. The tree is modified in place and the
    name of each removed node is printed.

    Arguments:
        root: root element of the parsed FreeDict database."""
    # iterate over a copy: removing from root while iterating over it would
    # skip the sibling following each removed node
    for child in list(root):
        # len() counts the child elements (getchildren() is gone since 3.9)
        if(len(child) == 0):
            print('Removing '+child.attrib['name']+' (is empty).')
            root.remove(child)

def main():
    """Parse the command line and run the requested actions.

    sys.argv[1] selects the action(s): -f, -dc, -a, -na or -x (see the module
    documentation). Without an option or with an unknown one, the module
    documentation is printed and the program exits with status 127. -x only
    refreshes debian/freedict-database.xml and exits with status 0."""
    # are we in the correct directory?
    cwd = os.getcwd()
    if(cwd.endswith('debian')):
        os.chdir('..')
    elif('freedict' not in cwd):
        print("You must run this script from the FreeDict packaging root.")
        sys.exit(127)

    # cmd args
    if(len(sys.argv) == 1):
        print(__doc__)
        sys.exit(127)
    option = sys.argv[1]

    if(option == '-x'):
        # unconditionally fetch the database from the web
        get_xml_content(fetch_new=True)
        sys.exit(0)

    # each option maps to the strategy classes to run, in this order
    strategies = {
            '-f': [fetch_source],
            '-dc': [generate_control_copyright],
            '-a': [generate_control_copyright, fetch_source],
            '-na': [name_and_author],
        }
    objects = strategies.get(option)
    if(objects is None):
        print(__doc__)
        sys.exit(127)

    # usual operation
    root = ET.fromstring(get_xml_content())
    clean_up_tree(root)
    for obj in objects:
        inst = obj(root)
        inst.write_data()

# only run when executed as a script, so that importing this module (e. g.
# for testing) has no side effects
if __name__ == '__main__':
    main()
