# -*- coding: iso-8859-1 -*-

"""
Syllabification of text and other language processing services.

"""

class SyllableBreak:

    """
    A potential syllable boundary located in a string.

    Instances are plain records with the attributes before, type and
    extraUncertainty, set from the constructor arguments.

    """

    def __init__(self, before, type, extraUncertainty):
        """
        Store the position and classification of a break.

        @param before: index of the character that the break precedes
        @param type: one of 'certain' (the break cannot be modified),
            'default' (the break exists by default but could be 
            removed) or 'not_default' (a potential break that does 
            not exist by default)
        @param extraUncertainty: true for special diphthongs in
            non-initial syllables, false otherwise. May only be true
            together with type 'default'

        """
        # Both checks are plain asserts, so they vanish under -O.
        assert type in ('certain', 'default', 'not_default')
        assert not extraUncertainty or type == 'default'

        self.before, self.type, self.extraUncertainty = (
            before, type, extraUncertainty)

class SearchTreeBranch:

    """
    Empty attribute container for one branch of a search tree.

    The class defines no attributes of its own; Hyphenation's 
    doHyphenations() attaches 'text' and 'penalty' to each instance 
    it creates.

    """
        
class Hyphenation:

    """
    Finds potential syllable breaks in a string of Finnish text.
    
    Potential breaks are classified according to type. A list of 
    alternative syllabifications sorted by probability can be produced.
    This class also provides other language-related services, such as 
    determining whether a syllable is long or short.
    
    """
    
    def __init__(self, vowels, consonants, diphNormal, diphSpecial,
                 customLower, customUpper, inputPartSeparator,
                 outputPartSeparator, outputSyllSeparator, syllShapes,
                 maxAlternatives, penalty):
        """
        Store the language configuration; no processing happens here.

        @param vowels: vowel characters, in lowercase
        @type vowels: string
        @param consonants: consonant characters, in lowercase
        @type consonants: string
        @param diphNormal: diphthongs that are handled the same way
            at all positions
        @type diphNormal: list of strings of length 2, lowercase
        @param diphSpecial: diphthongs that are by default diphthongs
            only in the first syllable
        @type diphSpecial: list of strings of length 2, lowercase
        @param customLower: lowercase forms of characters for which
            the case conversion of the default locale may not work
            (presumably non-ASCII letters such as the Finnish a and o
            with diaeresis; the original example was lost to an
            encoding error)
        @type customLower: string
        @param customUpper: the same characters in the same order as
            in customLower, but in uppercase
        @type customUpper: string
        @param inputPartSeparator: this character, e.g. '-', is
            assumed to separate the parts of a compound word. It is
            treated similarly to whitespace in syllabification.
        @type inputPartSeparator: string of length 1
        @param outputPartSeparator: this character, e.g. '=', replaces
            inputPartSeparator in hyphenated strings produced by this
            class
        @type outputPartSeparator: string of length 1
        @param outputSyllSeparator: this character, e.g. '-', is
            the hyphen (syllable separator) in hyphenated strings
            produced by this class
        @type outputSyllSeparator: string of length 1
        @param syllShapes: a three-dimensional array of symbols that
            represent syllable shapes, indexed as in syllableShape():
            first index = nucleus (0 diphthong, 1 long vowel, 
            2 neither), second = coda (0 present, 1 absent), 
            third = onset (0 present, 1 absent).
        @type syllShapes: 3D array of strings of length 1
        @param maxAlternatives: how many alternative syllabifications
            can be produced at most
        @type maxAlternatives: number
        @param penalty: an object with fields max, default,
            special, notDefault
        @type penalty: config.ids.Penalties

        """
        # Character classes and diphthong tables
        self.__vowels, self.__consonants = vowels, consonants
        self.__diphNormal, self.__diphSpecial = diphNormal, diphSpecial

        # Parallel strings used by customLower()
        self.__customLower, self.__customUpper = customLower, customUpper

        # Separators read from the input and written to the output
        self.__inputPartSeparator = inputPartSeparator
        self.__outputPartSeparator, self.__outputSyllSeparator = (
            outputPartSeparator, outputSyllSeparator)

        # Shape symbols and search limits
        self.__syllShapes = syllShapes
        self.__maxAlternatives, self.__penalty = maxAlternatives, penalty
    
    def customLower(self, input):
        """
        Return the given string converted to lowercase.

        A character found in the customUpper string supplied at
        construction is replaced by the character at the same index
        of customLower. Every other character goes through the 
        default lower() conversion. Characters the default conversion
        would get wrong (presumably the Finnish a and o with 
        diaeresis) can thus be handled even when a Finnish locale is
        not available.

        @param input: the string to convert
        @return: the lowercase string

        """
        upperChars = self.__customUpper
        lowerChars = self.__customLower

        def convert(ch):
            # find() gives -1 for characters without a custom mapping
            pos = upperChars.find(ch)
            return lowerChars[pos] if pos != -1 else ch.lower()

        return ''.join(convert(ch) for ch in input)
    
    def syllabification(self, input):
        """
        Find potential syllable breaks in a string of text.

        The text is first lowercased with customLower() and every
        inputPartSeparator is replaced by a space, so the parts of a
        compound are scanned like separate words. Each pair of
        adjacent characters is then examined from left to right.

        NOTE(review): break positions index the converted text; they
        match the caller's string only if the conversions above keep
        its length unchanged.

        @param input: may contain other characters than whitespace and
            those returned by recognizedChars(), but they have no
            special meaning
        @return: list of SyllableBreak objects, in scan order

        """
        text = self.customLower(input)
        text = text.replace(self.__inputPartSeparator, ' ')
        vowels = self.__vowels
        consonants = self.__consonants

        found = []

        def mark(position, kind, uncertain):
            found.append(SyllableBreak(position, kind, uncertain))

        seenVowel = False      # a vowel has occurred in this word
        pastFirstSyll = False  # ...and a consonant has followed it
        secondHalf = False     # current char ends a long vowel/diphthong
        armNext = False        # raise secondHalf on the next round

        for pos in range(len(text) - 1):
            pair = text[pos:pos + 2]
            here, following = pair[0], pair[1]

            # Whitespace starts a new word.
            if here.isspace():
                seenVowel = pastFirstSyll = False

            hereIsVowel = here in vowels
            hereIsConsonant = here in consonants
            followingIsVowel = following in vowels

            if hereIsVowel:
                seenVowel = True
            elif hereIsConsonant and seenVowel:
                pastFirstSyll = True

            # The flag armed in the previous round is valid for this
            # round only.
            secondHalf, armNext = armNext, False

            # Consonant + vowel after the first syllable of the word
            if hereIsConsonant and followingIsVowel and pastFirstSyll:
                mark(pos, 'certain', False)

            if not (hereIsVowel and followingIsVowel):
                continue

            if secondHalf:
                # A vowel that already closes a long vowel or a
                # diphthong cannot open another one with the next
                # character.
                mark(pos + 1, 'default', False)
            elif here == following:
                # Long vowel: no break inside it
                armNext = True
            elif pair in self.__diphNormal:
                mark(pos + 1, 'not_default', False)
                armNext = True
            elif pair in self.__diphSpecial:
                if pastFirstSyll:
                    mark(pos + 1, 'default', True)
                else:
                    mark(pos + 1, 'not_default', False)
                    armNext = True
            else:
                mark(pos + 1, 'default', False)

        return found
    
    def allHyphenations(self, input):
        """
        Generate a sorted list of alternative syllabifications.

        The potential breaks come from syllabification(). Starting
        from them, doHyphenations() builds the default hyphenation
        and alternatives that deviate from it. In the returned
        strings inputPartSeparator is replaced by
        outputPartSeparator; the case of the input is kept.

        @param input: a string of text
        @return: a list of tuples (penalty, hyphenated string) in
            ascending tuple order, i.e. by rising penalty. The 
            default hyphenation carries penalty 0.

        """
        breaks = self.syllabification(input)
        text = input.replace(self.__inputPartSeparator,
                             self.__outputPartSeparator)

        collected = []
        self.doHyphenations(text, 0, breaks, collected, '', 0, 0)
        return sorted(collected)
    
    def doHyphenations(self, input, penalty, breaks, hyphenations,
                       current, prevBreak, breakIndex):
        """
        Recursively enumerate hyphenations, one break decision a call.

        Each call decides whether the break at breakIndex is realised
        and recurses for the following break. A finished string is
        appended to hyphenations once all breaks are decided. A path
        is abandoned when its penalty exceeds penalty.max or when 
        maxAlternatives results have already been collected.

        @param input: the text being hyphenated
        @param penalty: penalty accumulated on the path so far
        @param breaks: list of SyllableBreak objects for input
        @param hyphenations: output list; tuples (penalty, hyphenated
            string) are appended to it
        @param current: the hyphenated text built so far
        @param prevBreak: index in input where the text not yet
            copied to current begins
        @param breakIndex: index in breaks of the break to decide

        """
        if (penalty > self.__penalty.max
            or len(hyphenations) >= self.__maxAlternatives):
            return

        if breakIndex == len(breaks):
            # No breaks left: copy the tail and record the result.
            hyphenations.append((penalty, current + input[prevBreak:]))
            return

        brk = breaks[breakIndex]
        upToBreak = current + input[prevBreak:brk.before]

        split = SearchTreeBranch()
        split.text = upToBreak + self.__outputSyllSeparator
        joined = SearchTreeBranch()
        joined.text = upToBreak

        if brk.type == 'certain':
            split.penalty = 0
            order = [split]
        elif brk.type == 'default':
            split.penalty = 0
            joined.penalty = (self.__penalty.special
                              if brk.extraUncertainty
                              else self.__penalty.default)
            # The default path is taken first to make sure it gets done
            order = [split, joined]
        else:
            split.penalty = self.__penalty.notDefault
            joined.penalty = 0
            order = [joined, split]

        nextIndex = breakIndex + 1
        for option in order:
            self.doHyphenations(input, penalty + option.penalty, breaks,
                                hyphenations, option.text, brk.before,
                                nextIndex)
                            
    def recognizedChars(self):
        """
        Return the input characters this object recognizes.

        The result is the vowels, then the consonants, then the
        input part separator. Whitespace is not included, although
        any whitespace can be used to separate words in the input.

        @return: a string containing lowercase versions of recognized
            characters

        """
        letters = self.__vowels + self.__consonants
        return letters + self.__inputPartSeparator

    def letterChars(self):
        """
        Return the vowels followed by the consonants.

        Unlike recognizedChars(), the result leaves out the input
        part separator.

        @return: a string of the configured letter characters

        """
        vowels, consonants = self.__vowels, self.__consonants
        return vowels + consonants
        
    def endsInConsonant(self, syllable):
        """
        Tell whether the last character of a syllable is a consonant.

        The comparison is made on the lowercase form of the text.

        @param syllable: an object whose 'phones' attribute holds the
            text of the syllable
        @return: True if phones is non-empty and its last character
            is one of the configured consonants, False otherwise

        """
        phones = syllable.phones
        return (len(phones) > 0
                and self.customLower(phones)[-1] in self.__consonants)

    def hasDiphthong(self, syllable):
        """
        Tell whether a syllable contains any configured diphthong.

        Both the normal and the special diphthongs count, wherever
        they occur in the lowercase form of the text.

        @param syllable: an object whose 'phones' attribute holds the
            text of the syllable
        @return: True if a diphthong occurs in phones, else False

        """
        text = self.customLower(syllable.phones)
        candidates = self.__diphNormal + self.__diphSpecial
        return any(pair in text for pair in candidates)

    def hasLongVowel(self, syllable):
        """
        Tell whether a syllable contains a long vowel.

        A long vowel is a configured vowel character immediately
        followed by the same character, looked for in the lowercase
        form of the text.

        @param syllable: an object whose 'phones' attribute holds the
            text of the syllable
        @return: True if a doubled vowel occurs in phones, else False

        """
        text = self.customLower(syllable.phones)
        return any((vowel + vowel) in text for vowel in self.__vowels)

    def isLong(self, syllable):
        """
        Tell whether a syllable is long.

        A syllable is long if it ends in a consonant, contains a
        diphthong or contains a long vowel; the checks are made in
        that order.

        @param syllable: an object whose 'phones' attribute holds the
            text of the syllable
        @return: True for a long syllable, False for a short one

        """
        if self.endsInConsonant(syllable):
            return True
        if self.hasDiphthong(syllable):
            return True
        return self.hasLongVowel(syllable)
        
    def getOnset(self, str):
        """
        Return the characters that precede the first vowel.

        Characters are compared in lowercase but returned unchanged.
        If the text has no vowel, all of it is returned.

        @param str: the text of a syllable
        @return: the leading non-vowel characters as a string

        """
        leading = []
        for ch in str:
            if self.customLower(ch) in self.__vowels:
                break
            leading.append(ch)
        return ''.join(leading)

    def getNucleus(self, str):
        """
        Return the first unbroken run of vowels in the text.

        Characters before the first vowel are skipped, and collecting
        stops at the first non-vowel after it. Characters are 
        compared in lowercase but returned unchanged.

        @param str: the text of a syllable
        @return: the vowel run as a string, '' if there is no vowel

        """
        run = []
        for ch in str:
            if self.customLower(ch) in self.__vowels:
                run.append(ch)
            elif run:
                # First non-vowel after the run: the nucleus is done.
                break
        return ''.join(run)

    def syllableShape(self, syllable):
        """
        Look up the shape symbol of a syllable in syllShapes.

        The three indexes are chosen as follows:
          - nucleus: 0 if the syllable has a diphthong, otherwise
            1 if it has a long vowel, otherwise 2
          - coda: 0 if text remains after the onset and the nucleus,
            1 if not
          - onset: 0 if the onset is non-empty, 1 if it is empty

        @param syllable: an object whose 'phones' attribute holds the
            text of the syllable
        @return: the entry syllShapes[nucleus][coda][onset]

        """
        phones = syllable.phones

        if self.hasDiphthong(syllable):
            nucleusIndex = 0
        elif self.hasLongVowel(syllable):
            nucleusIndex = 1
        else:
            nucleusIndex = 2

        onsetText = self.getOnset(phones)
        hasCoda = onsetText + self.getNucleus(phones) != phones
        codaIndex = 0 if hasCoda else 1
        onsetIndex = 0 if len(onsetText) > 0 else 1

        return self.__syllShapes[nucleusIndex][codaIndex][onsetIndex]
        
# Running this module as a script is a no-op: the guard body is empty.
if __name__ == "__main__":
    pass