Package translate :: Package lang :: Module ngram
[hide private]
[frames] | [no frames]

Source Code for Module translate.lang.ngram

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright (c) 2006 Thomas Mangin 
  5  # Copyright (c) 2009 Zuza Software Foundation 
  6  # 
  7  # This program is distributed under Gnu General Public License 
  8  # (cf. the file COPYING in distribution). Alternatively, you can use 
  9  # the program under the conditions of the Artistic License (as Perl). 
 10  # 
 11  # This program is free software; you can redistribute it and/or modify 
 12  # it under the terms of the GNU General Public License as published by 
 13  # the Free Software Foundation; either version 2 of the License, or 
 14  # (at your option) any later version. 
 15  # 
 16  # This program is distributed in the hope that it will be useful, 
 17  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 18  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 19  # GNU General Public License for more details. 
 20  # 
 21  # You should have received a copy of the GNU General Public License 
 22  # along with this program; if not, write to the Free Software 
 23  # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 24  # 
 25  # Original file from http://thomas.mangin.me.uk/data/source/ngram.py 
 26   
 27  import re 
 28   
 29  nb_ngrams = 400 
 30   
31 -class _NGram:
32 - def __init__(self, arg={}):
33 if isinstance(arg, basestring): 34 self.addText(arg) 35 self.normalise() 36 elif isinstance(arg, dict): 37 self.ngrams = arg 38 self.normalise() 39 else: 40 self.ngrams = dict()
41
42 - def addText(self, text):
43 if isinstance(text, str): 44 text = text.decode('utf-8') 45 46 ngrams = dict() 47 48 text = text.replace('\n', ' ') 49 text = re.sub('\s+', ' ', text) 50 words = text.split(' ') 51 52 for word in words: 53 word = '_'+word+'_' 54 size = len(word) 55 for i in xrange(size): 56 for s in (1, 2, 3, 4): 57 sub = word[i:i+s] 58 if not ngrams.has_key(sub): 59 ngrams[sub] = 0 60 ngrams[sub] += 1 61 62 if i+s >= size: 63 break 64 self.ngrams = ngrams 65 return self
66
67 - def sorted(self):
68 sorted = [(self.ngrams[k], k) for k in self.ngrams.keys()] 69 sorted.sort() 70 sorted.reverse() 71 sorted = sorted[:nb_ngrams] 72 return sorted
73
74 - def normalise(self):
75 count = 0 76 ngrams = {} 77 for v, k in self.sorted(): 78 ngrams[k] = count 79 count += 1 80 81 self.ngrams = ngrams 82 return self
83
84 - def addValues(self, key, value):
85 self.ngrams[key] = value 86 return self
87
88 - def compare(self, ngram):
89 d = 0 90 ngrams = ngram.ngrams 91 for k in self.ngrams.keys(): 92 if ngrams.has_key(k): 93 d += abs(ngrams[k] - self.ngrams[k]) 94 else: 95 d += nb_ngrams 96 return d
97 98 99 import os 100 import glob 101
class NGram:
    """A collection of per-language n-gram profiles loaded from model files.

    Each ``*<ext>`` file found in ``folder`` is parsed into an _NGram
    profile keyed by the file's base name (the language code).
    """

    def __init__(self, folder, ext='.lm'):
        """Load every language model file in ``folder``.

        Model lines are expected as "<gram><whitespace><value>"; a
        malformed line falls back to its (reversed) position as the rank.
        Files that are not valid UTF-8 are skipped.

        :raises ValueError: if no ``*<ext>`` files are found.
        """
        self.ngrams = dict()
        pattern = os.path.join(folder, '*' + ext)
        ext_len = len(ext)
        count = 0

        for fname in glob.glob(os.path.normcase(pattern)):
            count += 1
            lang = os.path.split(fname)[-1][:-ext_len]
            ngrams = {}
            # binary mode + explicit decode matches the original py2
            # str.decode('utf-8') behaviour; ``with`` closes the handle
            # (the original leaked it)
            with open(fname, 'rb') as model:
                lines = model.readlines()

            try:
                i = len(lines)
                for line in lines:
                    line = line.decode('utf-8')
                    parts = line[:-1].split()
                    if len(parts) != 2:
                        try:
                            ngrams[parts[0]] = i
                        except IndexError:
                            pass  # Line probably only contained spaces, if anything
                    else:
                        ngrams[parts[0]] = int(parts[1])
                    i -= 1
            except UnicodeDecodeError:
                # undecodable model file: ignore this language entirely
                continue

            if ngrams:
                self.ngrams[lang] = _NGram(ngrams)

        if not count:
            raise ValueError("no language files found")

    def classify(self, text):
        """Return the language code whose profile is closest to ``text``.

        Returns '' when even the best match is farther than 80% of the
        theoretical maximum distance (nb_ngrams squared).
        """
        profile = _NGram(text)

        langs = list(self.ngrams)
        best = langs.pop()
        best_distance = self.ngrams[best].compare(profile)

        for lang in langs:
            distance = self.ngrams[lang].compare(profile)
            if distance < best_distance:
                best_distance = distance
                best = lang

        if best_distance > 0.8 * (nb_ngrams ** 2):
            best = ''
        return best
154
class Generate:
    """Build n-gram language models from raw text corpora.

    Reads every ``*<ext>`` file in ``folder`` into a per-language
    _NGram profile, ready to be written out with save().
    """

    def __init__(self, folder, ext='.txt'):
        """Create one profile per ``*<ext>`` corpus file in ``folder``.

        NOTE(review): _NGram.addText() replaces the profile rather than
        accumulating, so only the corpus file's last line actually
        contributes — confirm whether that is intended upstream.
        """
        self.ngrams = dict()
        pattern = os.path.join(folder, '*' + ext)
        ext_len = len(ext)

        for fname in glob.glob(os.path.normcase(pattern)):
            lang = os.path.split(fname)[-1][:-ext_len]
            profile = _NGram()

            # ``with`` guarantees the handle is closed even if addText
            # raises (the original close() could be skipped)
            with open(fname, 'r') as corpus:
                for line in corpus:
                    profile.addText(line)

            profile.normalise()
            self.ngrams[lang] = profile

    def save(self, folder, ext='.lm'):
        """Write each profile to ``folder/<lang><ext>``, one gram and its
        value per line (tab-space separated), most frequent first."""
        for lang in self.ngrams:
            fname = os.path.join(folder, lang + ext)
            with open(fname, 'w') as out:
                for value, gram in self.ngrams[lang].sorted():
                    out.write("%s\t %d\n" % (gram, value))
if __name__ == '__main__':
    import sys

    # Should you want to generate your own .lm files:
    #conf = Generate('/tmp')
    #conf.save('/tmp')

    # Read one line from stdin and print the detected language code.
    # print() with a single argument is valid in both Python 2 and 3
    # (the original `print x` statement is a Python-3 syntax error).
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    classifier = NGram(get_abs_data_filename('langmodels'))
    print(classifier.classify(text))