#!/usr/bin/env python
#coding: utf-8

# nlp.py - natural language processor class for SAPI Book Reader
# Copyright (C) Bohdan R. Rau 2013 <ethanak@polip.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, write to:
# 	The Free Software Foundation, Inc.,
# 	51 Franklin Street, Fifth Floor
# 	Boston, MA  02110-1301, USA.

import re,os

# --- sentence splitting -----------------------------------------------------
# Every splitter exposes two named groups: 'sent' (the first sentence together
# with its trailing whitespace) and 'rest' (everything after it).

# Fallback splitter: a sentence is the shortest prefix ending in '.', '?' or
# '!', optionally followed by non-word, non-space characters (closing quotes,
# brackets, ...) and then whitespace or the end of the text.
_ME_SIMPLE_SPLITTER=re.compile(ur'^(?P<sent>(.*?)[.?!][^\w\s]*(\s+|$))(?P<rest>.*)$',re.DOTALL | re.UNICODE)
# Pattern template for the dictionary-based splitter; %s is filled with a
# '|'-separated alternation of abbreviations.  An abbreviation preceded by
# whitespace (or the start of text) and followed by '.', whitespace and a word
# character is consumed as ordinary text instead of ending the sentence.
_ME_DICT_SPLITTER_P=ur'^(?P<sent>(?:(?:(?:^|\s)(?:%s)\.(?=\s+\w))|.)*?[.?!][^\w\s]*(?:\s+|$))(?P<rest>.*)$'
# English variant of the template: additionally, "no." followed by whitespace
# and a digit is not treated as a sentence end.
_ME_DICT_SPLITTER_EN=ur'^(?P<sent>(?:(?:(?:^|\s)(?:no\.(?=\s+[0-9])|(?:%s)\.(?=\s+\w)))|.)*?[.?!][^\w\s]*(?:\s+|$))(?P<rest>.*)$'
# Directory of this module; the per-language '<lang>.abr' abbreviation files
# are looked up here.
_ME_DATA_PATH=os.path.dirname(__file__)
# A run of whitespace.
_ME_BLANK=re.compile(ur'\s+')
# Splits text at the first newline: group 1 is the first line (paragraph),
# group 2 is the remainder.
_ME_PARAG=re.compile(ur'^(.*?)\n(.*)$',re.DOTALL)

# --- sentence clean-up (see sentenser.cleanpunct) ---------------------------

# Leading run of non-word characters and/or underscores (group 2 is the text
# that follows it).
_ME_SENTCLEANER_START=re.compile(ur'^(\W|_)+(.*)$',re.DOTALL | re.UNICODE)
# Text that is deleted: ",," and "''" (two-character quote substitutes) and a
# run of closing brackets at the very end of the sentence.
_ME_SENTCLEANER_QUOTE=re.compile(u",,|''|([)}\]]+$)",re.DOTALL | re.UNICODE)
# Text that is replaced by ", ": any closing bracket, or a space followed by
# either a dash run plus space or an opening bracket.
_ME_SENTCLEANER_DTC=re.compile(ur'[)}\]]| (-+ |[({\[])',re.DOTALL | re.UNICODE)
# A dash run (plus optional whitespace) at the end of the sentence; replaced
# by an ellipsis character.
_ME_SENTCLEANER_DTE=re.compile(ur'-+\s*$',re.DOTALL | re.UNICODE)
# A sentence that is exactly "no." or ends with the separate word "no."
# (case-insensitive); used for English text only.
_ME_SENTCLEANER_EN1=re.compile(r'^(.*\s)?no\.$',re.DOTALL | re.UNICODE | re.IGNORECASE)

# unicode.translate() table: typographic single quotes and the backtick become
# a plain apostrophe, double quotes and guillemets are removed, em and en
# dashes become a hyphen.
_ME_SENTCLEANER_TRANS={
    0x2018: u"'",
    0x2019: u"'",
    0x201b: u"'",
    ord(u'`'):u"'",
    0xab: u'',
    0xbb: u'',
    0x201c: u'',
    0x201d: u'',
    0x201e: u'',
    0x201f: u'',
    0x2039: u'',
    0x203a: u'',
    ord(u'"'):u'',
    ord(u'—'):u'-',
    ord(u'–'):u'-',
}

# Bit flags describing the previously processed paragraph.
_ME_PARA_LONG = 4       # an unreadable (e.g. blank) paragraph was skipped
_ME_PARA_DIALSTART = 2  # paragraph opens with dialogue
_ME_PARA_DIALEND = 1    # paragraph closes with dialogue
_ME_PARA_NONE = 8       # initial state: no paragraph processed yet



# Exported constants: pause kinds reported in the 'pause' key of the
# dictionaries returned by sentenser.get_sentence().

ME_PAUSE_NONE = 0
ME_PAUSE_SENTENCE = 1
ME_PAUSE_PARA = 2
ME_PAUSE_PREDIAL = 3
ME_PAUSE_POSTDIAL = 4
ME_PAUSE_LONG = 5

# One default value per ME_PAUSE_* kind, in the same order.
# NOTE(review): presumably pause lengths in milliseconds -- confirm with the
# code that consumes this tuple.
ME_PAUSE_DEFAULTS = (0, 450, 750, 950, 1050, 1400)


class sentenser(object):
    """Split a text into paragraphs and sentences, one sentence per call.

    Usage: create an instance, call feed() with the text, then call
    get_sentence() repeatedly until it returns None.  Each sentence comes
    with its offset in the text and the kind of pause that should precede it.
    """

    def __init__(self):
        self._rgx=re.compile(r'^((.*?)[.?!][^\w\s]*(\s+|$))(.*)$',re.DOTALL | re.UNICODE)
        # readable text: contains at least one word character
        self._rda=re.compile(r'.*\w',re.DOTALL | re.UNICODE)
        self._rpc=re.compile(r'[\s_]+',re.DOTALL | re.UNICODE)
        # dialogue paragraph introduced by a leading dash
        self._rpa=re.compile(u'^[-—–]',re.DOTALL | re.UNICODE)
        # paragraph opening with a quoted phrase that ends in punctuation
        self._rleft=re.compile(u'^["\u201c-\u201f\u2039](.*[,;:.?!]["\u201c-\u201f\u203a])',re.DOTALL | re.UNICODE)
        # paragraph ending with punctuation followed by a closing quote
        self._rrite=re.compile(u'^.*[,;:.?!]\\s*["\u201c-\u201f\u203a]$',re.DOTALL | re.UNICODE)
        # Parser state.  It is (re)initialised by feed(); the values below
        # only make get_sentence() and cleanpunct() safe to call before any
        # text has been fed (get_sentence() then simply returns None).
        self._lang=None
        self._offset=0
        self._txt=u''
        self._paragraph=u''
        self._splitter=None
        self._this_said=0
        self._para_start=0
        self._pos=0
        self._para_pos=0
        self._exhausted=True

    def _create_splitter(self,lang):
        """Build the sentence splitter for *lang* and store it in self._splitter.

        Abbreviations are read from '<lang>.abr' in the module directory, one
        per line, UTF-8 encoded; lines starting with '#' and empty lines are
        ignored.  In an abbreviation '.' is literal, '_' stands for optional
        whitespace and a run of blanks for mandatory whitespace.  When the
        file cannot be read, contains no abbreviations (for languages other
        than 'en') or produces an invalid pattern, the simple splitter is
        used instead.
        """
        path=os.path.join(_ME_DATA_PATH,lang+'.abr')
        try:
            # Binary mode: every line is decoded explicitly below.
            with open(path,'rb') as abr:
                lines=abr.readlines()
        except (IOError, OSError):
            self._splitter=_ME_SIMPLE_SPLITTER
            return
        abbrevs=[]
        for line in lines:
            # 'utf-8-sig' drops a byte order mark that would otherwise end up
            # inside the first abbreviation.
            abbr=line.decode('utf-8-sig').strip().rstrip('.').lower()
            if abbr == '' or abbr.startswith('#'):
                continue
            abbr=abbr.replace('.',r'\.').replace('_',r'\s*')
            # The doubled backslash makes the replacement template yield a
            # literal "\s+" without relying on unknown-escape passthrough.
            abbr=_ME_BLANK.sub(r'\\s+',abbr)
            abbrevs.append(abbr)
        if not abbrevs and lang != 'en':
            self._splitter=_ME_SIMPLE_SPLITTER
            return
        # '(?!)' never matches; an empty alternation would instead treat any
        # stand-alone '.' as an abbreviation.
        alternatives='|'.join(abbrevs) or '(?!)'
        template=_ME_DICT_SPLITTER_EN if lang == 'en' else _ME_DICT_SPLITTER_P
        try:
            self._splitter=re.compile(template % alternatives,
                                      re.DOTALL | re.UNICODE | re.IGNORECASE)
        except re.error:
            # A malformed entry in the abbreviation file must not break
            # reading altogether.
            self._splitter=_ME_SIMPLE_SPLITTER

    def feed(self,lang,offset,txt):
        """Start processing a new text.

        lang   -- language code; selects the abbreviation file and enables
                  the English-specific clean-up when equal to 'en'
        offset -- position reported for the first character of txt
        txt    -- text to split; byte strings are decoded as UTF-8
        """
        if isinstance(txt,bytes):
            txt=txt.decode('utf-8')
        self._lang=lang
        self._offset=offset
        self._txt=txt
        self._paragraph=''
        self._create_splitter(lang)
        self._last_said=_ME_PARA_NONE
        self._this_said=0
        self._para_start=offset
        self._pos=offset
        self._para_pos=0
        self._exhausted=False

    def _pause_before(self,kind):
        """Return the ME_PAUSE_* value preceding a paragraph of type *kind*."""
        prev=self._last_said
        if prev & _ME_PARA_NONE:
            return ME_PAUSE_NONE
        if prev & _ME_PARA_LONG:
            return ME_PAUSE_LONG
        starts_dialog=(kind & _ME_PARA_DIALSTART) != 0
        after_dialog=(prev & _ME_PARA_DIALEND) != 0
        if starts_dialog and not after_dialog:
            return ME_PAUSE_PREDIAL
        if after_dialog and not starts_dialog:
            return ME_PAUSE_POSTDIAL
        return ME_PAUSE_PARA

    def _next_paragraph(self):
        """Load the next readable paragraph; return False when none is left."""
        while self._txt != '':
            found=_ME_PARAG.match(self._txt)
            if found:
                raw=found.group(1)
                self._txt=found.group(2)
            else:
                raw=self._txt
                self._txt=''
            start=self._pos
            # +1 accounts for the newline that terminated the paragraph
            self._pos += len(raw)+1
            para=raw.strip()
            if not self.is_readable(para):
                # a skipped (e.g. blank) paragraph lengthens the next pause
                self._last_said=_ME_PARA_LONG
                continue
            kind=self._para_type(para)
            self._this_said=self._pause_before(kind)
            self._last_said=kind
            # The paragraph is stored stripped, so the removed leading
            # whitespace has to be added to keep reported offsets aligned
            # with the fed text.
            self._para_start=start+len(raw)-len(raw.lstrip())
            self._para_pos=0
            self._paragraph=para
            return True
        return False

    def get_sentence(self):
        """Return the next readable sentence, or None when the text is done.

        The result is a dictionary with the keys:
        'pause' -- ME_PAUSE_* kind of the pause preceding the sentence
        'pos'   -- offset of the sentence (the feed() offset included)
        'count' -- length of the sentence including trailing whitespace
        'txt'   -- the sentence after cleanpunct()
        'wave', 'freq' -- always None here
        """
        if self._exhausted:
            return None
        while True:
            if self._paragraph == '' and not self._next_paragraph():
                self._exhausted=True
                return None
            found=self._splitter.match(self._paragraph)
            if found:
                raw=found.group('sent')
                self._paragraph=found.group('rest')
            else:
                raw=self._paragraph
                self._paragraph=''
            pos=self._para_start+self._para_pos
            cnt=len(raw)
            self._para_pos += cnt
            txt=raw.strip()
            if not self.is_readable(txt):
                continue
            txt=self.cleanpunct(txt)
            pau=self._this_said
            # later sentences of the same paragraph get the sentence pause
            self._this_said=ME_PAUSE_SENTENCE
            return {'pause':pau,'pos':pos,'count':cnt,'txt':txt, 'wave':None, 'freq':None}

    def cleanpunct(self, sentence):
        """Return the unicode *sentence* with its punctuation normalised.

        Leading non-word characters are dropped, typographic quotes and
        dashes are translated, brackets and spaced dashes become commas and
        a trailing dash run becomes an ellipsis.  For English a final "no."
        is turned into "no!".
        """
        lead=_ME_SENTCLEANER_START.match(sentence)
        if lead:
            sentence=lead.group(2)

        sentence=sentence.translate(_ME_SENTCLEANER_TRANS)
        sentence=_ME_SENTCLEANER_QUOTE.sub("",sentence)
        sentence=_ME_SENTCLEANER_DTC.sub(", ",sentence)
        sentence=_ME_SENTCLEANER_DTE.sub(u"…",sentence)
        if self._lang == 'en' and _ME_SENTCLEANER_EN1.match(sentence):
            sentence=sentence[:-1] + '!'
        return sentence

    def is_dialog(self,txt):
        """Return True when *txt* starts with a hyphen, em dash or en dash."""
        if isinstance(txt,bytes):
            txt=txt.decode('utf-8')
        return self._rpa.match(txt) is not None

    def is_readable(self,txt):
        """Return True when *txt* contains at least one word character."""
        if isinstance(txt,bytes):
            txt=txt.decode('utf-8')
        return self._rda.match(txt) is not None

    def _para_type(self,txt):
        """Return the _ME_PARA_DIALSTART / _ME_PARA_DIALEND flags for *txt*."""
        if self._rpa.match(txt):
            return _ME_PARA_DIALSTART | _ME_PARA_DIALEND
        flags=0
        if self._rleft.match(txt):
            flags |= _ME_PARA_DIALSTART
        if self._rrite.match(txt):
            flags |= _ME_PARA_DIALEND
        return flags
    

if __name__ == '__main__':
    # Small self-test: split a sample English text and print every sentence
    # record that the splitter produces.
    demo=sentenser()
    demo.feed('en',0,'''
           There is--
           Snow no. 23. "But no." No one mr. Fuck can do it.
           ''')
    # get_sentence() returns None once the text is exhausted
    for record in iter(demo.get_sentence,None):
        print(record)