#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2007-2008 Harri Pitkänen (hatapitk@iki.fi)
# Searches for differences between the XML word list published by the
# Research Institute for the Languages of Finland (Kotus) and the
# vocabulary database of Joukahainen.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import sys
import locale
import codecs
import xml.parsers.expat
import subprocess
import _pg
import os
import datetime
import voikkoutils
import voikkoinfl

# Directory of the Voikko data files, read from the 'voikko_data_dir'
# preference; the .aff inflection tables are loaded from here.
VOIKKO_DATA = voikkoutils.get_preference('voikko_data_dir')
# Shell pipeline that is fed one word per line: voikkospell runs with a
# Finnish UTF-8 locale, grep keeps only the lines containing "W: "
# (presumably the words voikkospell does not recognise -- TODO confirm),
# sort orders them and sponge collects everything before passing it on.
SPELLCOMMAND = 'LC_CTYPE="fi_FI.UTF-8" voikkospell | grep "W: " | sort | sponge'

# State shared by the expat callbacks while the Kotus list is parsed:
# the element and word being read, and two gradation flags.
current_element = None
current_word = None
current_gradation_is_optional = False
current_word_allow_no_gradation = False

# Results of the parse, both keyed by word: the list of inflection class
# strings and the set of gradation classes collected for the word.
kwordlist = {}
gradclasses = {}

def start_element(name, attrs):
	"""Expat start-tag handler: remember which element is being read.

	name is the element name and attrs its attribute dictionary.  The
	handler also records whether the element is an 'av' element whose
	'astevaihtelu' attribute is 'valinnainen', i.e. whether the
	gradation is marked optional.  Any other start tag clears that flag.
	"""
	global current_element
	global current_gradation_is_optional
	current_element = name
	# dict.get() replaces the deprecated has_key() test: a missing
	# attribute yields None, which never equals 'valinnainen'.
	current_gradation_is_optional = (
		name == 'av' and attrs.get('astevaihtelu') == 'valinnainen')
def end_element(name):
	"""Expat end-tag handler: leave the element that has just been closed.

	current_element is cleared on every end tag so that character data
	following a closing tag (for example line breaks or indentation
	between </s> and the next tag) is not attributed to the element
	that was just closed.  The current word is forgotten once its 'st'
	entry ends.
	"""
	global current_element
	global current_word
	current_element = None
	if name == 'st':
		current_word = None
def char_data(data):
	"""Expat character data handler: collect words and their classes.

	What happens depends on the element that is currently open:
	  's'   data is a word; entries for it are created in kwordlist and
	        gradclasses if they do not exist yet
	  'tn'  data is appended to the inflection classes of the current word
	  'av'  data is added to the gradation classes of the current word
	Text inside any other element is ignored.

	The gradation set of a word starts out as {'-'}.  The '-' is removed
	again when the entry turns out to have a non-optional gradation
	class, unless an earlier entry of the same word had no gradation.
	"""
	global current_word
	global current_word_allow_no_gradation
	if current_element == 's':
		current_word = data
		kwordlist.setdefault(current_word, [])
		known = gradclasses.get(current_word)
		if known is None:
			gradclasses[current_word] = set(['-'])
			current_word_allow_no_gradation = False
		elif '-' in known:
			# An earlier entry of this word had no gradation: keep '-'
			current_word_allow_no_gradation = True
		else:
			# Provisional '-' for this entry.  The flag has to be reset
			# here as well, otherwise a value left over from another
			# word would keep the 'av' branch from discarding the '-'.
			known.add('-')
			current_word_allow_no_gradation = False
	elif current_element == 'tn':
		kwordlist[current_word].append(data)
	elif current_element == 'av':
		classes = gradclasses[current_word]
		if current_gradation_is_optional:
			classes.add('-')
		elif not current_word_allow_no_gradation:
			classes.discard('-')
		classes.add(data)

def print_line(word, problem, in_joukahainen = True):
	"""Print one row of the difference table to standard output.

	word and problem are encoded as UTF-8 for the output.  The first
	cell links to the Joukahainen query page of the word when
	in_joukahainen is true and contains a dash otherwise; the other two
	cells hold the word and the description of the problem.
	"""
	if in_joukahainen:
		first_cell = '<a href="/query/wlist?word=%s">Jouk.</a>' % word.encode('UTF-8')
	else:
		first_cell = '-'
	print('<tr><td>' + first_cell + '</td><td>')
	print('%s</td><td>%s' % (word.encode('UTF-8'), problem.encode('UTF-8')))
	print('</td></tr>')

def _get_inflection_gradation(db, wid):
	"""Look up the inflection and gradation class of a Joukahainen word.

	The string attribute with aid 1 of the word id wid is read through
	db and split at '-'.  Returns the tuple (infclass_main, grad_type),
	where grad_type is '-' if the value has no gradation part.  Returns
	None if the word does not have exactly one such attribute row or
	if the value consists of more than two parts.
	"""
	rows = db.query(("SELECT value FROM string_attribute_value " +
		"WHERE wid = %i AND aid = 1") % wid)
	if rows.ntuples() != 1:
		return None
	value = unicode(rows.getresult()[0][0], 'UTF-8')
	parts = value.split('-')
	if len(parts) > 2:
		return None
	if len(parts) == 2:
		return (parts[0], parts[1])
	# No '-' in the value: the whole value is the inflection class
	return (value, '-')

# Inflection type definitions, read once from the Voikko data directory:
# subst.aff is used for word classes 1 and 2 and verb.aff for word class 3
# (see _get_inftype).
noun_types = voikkoinfl.readInflectionTypes(VOIKKO_DATA + "/subst.aff")
verb_types = voikkoinfl.readInflectionTypes(VOIKKO_DATA + "/verb.aff")
def _get_inftype(classid, infclass_main):
	"""Return the inflection type object that matches a Joukahainen word.

	classid is the word class identifier in Joukahainen: 1 and 2 select
	the types in noun_types, 3 those in verb_types.  The first type whose
	joukahainenClasses contains infclass_main is returned.  The result is
	None for any other class identifier or if no type matches.
	"""
	if classid in (1, 2):
		candidates = noun_types
	elif classid == 3:
		candidates = verb_types
	else:
		return None

	for candidate in candidates:
		if infclass_main in candidate.joukahainenClasses:
			return candidate
	return None

def kotus_classes(db, wid, classid):
	"""Return the list of Kotus inflection classes for a Joukahainen word.

	wid is the word id and classid the word class identifier.  An empty
	list is returned if the word has no usable inflection information
	or if no inflection type matches it.
	"""
	inflection = _get_inflection_gradation(db, wid)
	if inflection is None:
		return []

	# Only the main inflection class is needed, the gradation is ignored
	matched_type = _get_inftype(classid, inflection[0])
	if matched_type is None:
		return []
	return matched_type.kotusClasses

def kotus_gradclasses(db, word):
	"""Return the list of Kotus gradation classes possible for a word.

	db is the database connection.  Despite its name, word is the
	Joukahainen word id of the entry (callers pass the wid); the
	parameter name is kept so that the signature does not change.

	Returns ['P'] for the inflection class 'poikkeava', ['-'] for words
	without gradation, the candidate class letters for the gradation
	types av1..av6, and an empty list if the inflection information is
	missing or the gradation type is not recognised.
	"""
	# The lookup has to use the argument.  The body used to read a name
	# 'wid' that is not defined here and only resolved through the loop
	# variable of the main program.
	infclass_parts = _get_inflection_gradation(db, word)
	if infclass_parts is None:
		return []
	(infclass_main, grad_type) = infclass_parts

	if infclass_main == u'poikkeava':
		return ['P']

	# d->t in some classes is not seen as gradation in Kotus classification
	if grad_type == u'av2' and infclass_main in (u'voida', u'kanavoida', u'haravoida'):
		return ['-']

	common = ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K']
	by_grad_type = {
		u'-': ['-'],
		u'av1': common + ['M'],
		u'av2': common,
		u'av3': ['L'],
		u'av4': ['L'],
		u'av5': ['D'],
		u'av6': ['D'],
	}
	return by_grad_type.get(grad_type, [])


def compare_classes(kotus, joukahainen):
	"""Return True if every class in kotus is also found in joukahainen."""
	return all(kotus_class in joukahainen for kotus_class in kotus)

def compare_gradclasses(kotus, joukahainen):
	"""Return True if two sets of gradation classes are compatible.

	kotus and joukahainen are sets of gradation class identifiers, in
	which '-' stands for "no gradation".  The sets are compatible if
	either of them is empty or if joukahainen contains 'P'.  Otherwise
	every class in kotus must be in joukahainen, joukahainen may contain
	'-' only if kotus does, and both sets must agree on whether any
	class other than '-' is present.
	"""
	if not kotus or not joukahainen:
		return True
	if 'P' in joukahainen:
		return True
	if any(kotus_class not in joukahainen for kotus_class in kotus):
		return False
	if '-' in joukahainen and '-' not in kotus:
		return False
	no_gradation = set(['-'])
	kotus_has_gradation = len(kotus - no_gradation) != 0
	joukahainen_has_gradation = len(joukahainen - no_gradation) != 0
	return kotus_has_gradation == joukahainen_has_gradation

def print_unknown_line(word):
	"""Build the HTML line for a Kotus word that is not in Joukahainen.

	Despite the name nothing is printed: the line is returned as a UTF-8
	encoded string ending in "<br>".  The inflection classes recorded in
	kwordlist follow the word in parentheses; the gradation classes from
	gradclasses are appended as well unless the set is just {'-'}.  A
	word without inflection classes is returned bare.
	"""
	pieces = [word]
	inflections = kwordlist[word]
	if len(inflections) != 0:
		pieces.append(u" (%s)" % u", ".join(inflections))
		gradations = gradclasses[word]
		if gradations != set(['-']):
			pieces.append(u"(%s)" % u", ".join(gradations))
	return u"".join(pieces).encode('UTF-8') + "<br>"

# Start of main program

# Set defaults
KOTUS_FILE = u'kotus-sanalista_v1.xml'
IGN_FILE = u'kotus-diff-ignore.txt'

# Read Kotus word list; the file is closed even if reading fails
kotusfile = codecs.open(KOTUS_FILE, 'r', 'UTF-8')
try:
	kotusdata = kotusfile.read()
finally:
	kotusfile.close()

# Initialise the XML parser
parser = xml.parsers.expat.ParserCreate()
# Expat may hand the text of a single element to the character data
# handler in several pieces (for example around line endings).  char_data()
# treats every call inside 's' as a complete word, so let the parser
# buffer the text and deliver it in as few calls as possible.
parser.buffer_text = True
parser.StartElementHandler = start_element
parser.EndElementHandler = end_element
parser.CharacterDataHandler = char_data

# Parse the Kotus word list; this fills kwordlist and gradclasses
parser.Parse(kotusdata.encode('UTF-8'))

# Read the list of words to ignore: one word per line, everything from the
# first '#' to the end of the line is a comment.  The words are kept in a set
# because membership is tested once for every row read from the database.
ignlist = set()
ignfile = codecs.open(IGN_FILE, 'r', 'UTF-8')
try:
	for line in ignfile:
		line = line.split(u'#', 1)[0].strip()
		if line != '': ignlist.add(line)
finally:
	# Release the file handle even if reading or decoding fails
	ignfile.close()

# Connect to the database
db = _pg.connect(dbname='joukahainen')

# Get the list of relevant words from Joukahainen, ordered by word.
# Each result row is (word, wid, class, value of string attribute 1, flag).
# The flag is 't' if the word has int attribute 38, string attribute 18 or 28,
# or one of the flag attributes 15, 17, 19, 33, 34, 35, 36, and 'f' otherwise;
# the main loop does not report such words as missing from Kotus.
# Words that have any of the flag attributes in the NOT IN list are left
# out altogether.
# NOTE(review): the meaning of the numeric attribute ids is defined by the
# Joukahainen database and cannot be verified from this file.
jwordlist = db.query("""
SELECT w.word, w.wid, w.class, s.value,
  CASE WHEN
   (w.wid IN ((SELECT f2.wid FROM int_attribute_value f2 WHERE f2.aid = 38) UNION
              (SELECT f3.wid FROM string_attribute_value f3 WHERE f3.aid in (18, 28)) UNION
              (SELECT f4.wid FROM flag_attribute_value f4 WHERE f4.aid in
                      (15, 17, 19, 33, 34, 35, 36))))
  THEN 't' else 'f' END
FROM word w, string_attribute_value s
WHERE w.wid = s.wid AND s.aid = 1
AND w.wid NOT IN (SELECT f.wid FROM flag_attribute_value f
  WHERE f.aid in (2, 5, 11, 12, 13, 14, 24, 26, 29, 37, 40))
ORDER BY w.word""").getresult()

# Page header: document head, a title carrying the time of the run and the
# opening of the difference table with its column headings
for header_line in (
	'<html>',
	'<head>',
	'<title>Joukahaisen ja Kotuksen sanalistan eroja</title>',
	'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">',
	'<style type="text/css"> table, td, th { border: 1px solid black; border-collapse: collapse;} </style>',
	'</head>',
	'<body>',
	'<h1>Kotus-diff %s</h1>' % datetime.datetime.now().isoformat(),
	'<table>',
	'<tr><th>Linkit</th><th>Sana</th><th>Eroavuudet</th></tr>',
):
	print(header_line)

def _report_differences(db, word, wid, ok_to_ignore_missing, inf_classes, grad_classes):
	"""Print the table rows for one Joukahainen word that differs from Kotus.

	word is the word itself, wid the id of its first database entry and
	ok_to_ignore_missing the 't'/'f' column of that entry.  inf_classes
	and grad_classes are the Kotus inflection and gradation classes
	collected from all entries of the word.  Nothing is printed if no
	difference is found.
	"""
	if word not in kwordlist:
		if u'-' + word in kwordlist:
			flag_count = db.query(("SELECT count(*) FROM flag_attribute_value WHERE " +
				"wid=%i AND aid = 29") % wid).getresult()[0][0]
			if flag_count == 0:
				print_line(word, u'Kotus: vain yhdyssanan jälkiosana')
		elif ok_to_ignore_missing == 'f':
			print_line(word, 'Vain Joukahaisessa')
		return

	kotus_inflections = kwordlist[word]
	# Kotus entries without an inflection class cannot be compared
	if len(kotus_inflections) == 0:
		return
	if len(inf_classes) > 0 and not compare_classes(kotus_inflections, inf_classes):
		print_line(word, 'Kotuksen taivutus %s, Joukahaisen taivutus %s'
			% (','.join(kotus_inflections), ','.join(inf_classes)))
	if not compare_gradclasses(gradclasses[word], set(grad_classes)):
		print_line(word, 'Kotuksen astevaihtelu %s, Joukahaisen astevaihtelu %s'
			% (','.join(gradclasses[word]), ','.join(grad_classes)))

# The rows are ordered by word, so all entries of one word are adjacent.
# Classes are collected over these entries and the word is reported when
# the next word begins.
prev_word = None
prev_wid = None
prev_isOKtoIgnoreMissing = False
inf_classes = []
grad_classes = []
for (word, wid, classid, infl, isOKtoIgnoreMissing) in jwordlist:
	word = unicode(word, 'UTF-8')
	if word in ignlist: continue
	if word != prev_word:
		if prev_word is not None:
			# The word id passed here belongs to the word being reported,
			# not to the row that has just been read
			_report_differences(db, prev_word, prev_wid, prev_isOKtoIgnoreMissing,
				inf_classes, grad_classes)
		inf_classes = []
		grad_classes = []
		prev_word = word
		prev_wid = wid
		prev_isOKtoIgnoreMissing = isOKtoIgnoreMissing
	# Every row contributes its classes, the very first one included
	inf_classes = inf_classes + kotus_classes(db, wid, classid)
	grad_classes = grad_classes + kotus_gradclasses(db, wid)
# The last word has no successor that would trigger its report
if prev_word is not None:
	_report_differences(db, prev_word, prev_wid, prev_isOKtoIgnoreMissing,
		inf_classes, grad_classes)
print('</table>')

# Words that should be added
# All words of Joukahainen are collected into a set, because membership in
# it is tested for every word the spell checker reports.
jwordlist = db.query("SELECT w.word FROM word w ORDER BY w.word").getresult()
knownwords = set(unicode(jword[0], 'UTF-8') for jword in jwordlist)
jwordlist = None

# Feed every Kotus word that contains no space to the spell checker
# pipeline and collect the lines it writes
speller = subprocess.Popen(SPELLCOMMAND, shell = True, stdin = subprocess.PIPE,
                           stdout = subprocess.PIPE, close_fds = True)
for word in kwordlist:
	if u' ' in word: continue
	speller.stdin.write(word.encode('UTF-8') + '\n')
# communicate() closes stdin and waits for the pipeline to finish; err is
# always None because stderr is not redirected
(out, err) = speller.communicate()
unknownlist = out.split('\n')

# List the words reported by the spell checker that are neither in
# Joukahainen nor on the ignore list
print('<h2>Ei Joukahaisessa</h2>')
print('<p>')
count = 0
for unknown in unknownlist:
	# Drop the three-character prefix in front of the word
	word = unicode(unknown, 'UTF-8')[3:]
	if word in knownwords or word in ignlist or len(word) < 2:
		continue
	# A word with a leading hyphen is skipped if the rest of it is known
	if word[0] == u'-' and word[1:] in knownwords:
		continue
	print(print_unknown_line(word))
	count += 1
print((u'(Yhteensä %i sanaa)' % count).encode('UTF-8'))
print('</p>')

# Page footer
print('</body>')
print('</html>')
