1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import re
28
# Distance penalty added for every ngram that is absent from the profile
# being compared against (see the ``d += nb_ngrams`` branch further down).
# NOTE(review): the name suggests it may also bound the profile size
# elsewhere in the module -- confirm against the full file.
29 nb_ngrams = 400
30
33 if isinstance(arg, basestring):
34 self.addText(arg)
35 self.normalise()
36 elif isinstance(arg, dict):
37 self.ngrams = arg
38 self.normalise()
39 else:
40 self.ngrams = dict()
41
def addText(self, text):
    """Count the 1- to 4-character ngrams of every word in *text*.

    Byte strings are decoded as UTF-8 first.  Runs of whitespace are
    collapsed to a single space, the text is split on spaces, and each
    word is wrapped in ``_`` markers before its substrings are counted.

    The resulting counts *replace* ``self.ngrams``; they are not merged
    with counts from earlier calls.

    Returns ``self`` so calls can be chained.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this decodes
    # exactly the inputs the old ``isinstance(text, str)`` check did,
    # while leaving already-decoded text untouched on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # ``\s`` already matches newlines, so a single substitution covers
    # the former separate ``replace('\n', ' ')`` step.
    words = re.sub(r'\s+', ' ', text).split(' ')

    ngrams = {}
    for word in words:
        word = '_' + word + '_'
        size = len(word)
        for start in range(size):
            # Only substrings of length 1..4 that still fit in the word.
            for length in range(1, min(4, size - start) + 1):
                sub = word[start:start + length]
                ngrams[sub] = ngrams.get(sub, 0) + 1

    self.ngrams = ngrams
    return self
66
73
75 count = 0
76 ngrams = {}
77 for v, k in self.sorted():
78 ngrams[k] = count
79 count += 1
80
81 self.ngrams = ngrams
82 return self
83
85 self.ngrams[key] = value
86 return self
87
89 d = 0
90 ngrams = ngram.ngrams
91 for k in self.ngrams.keys():
92 if ngrams.has_key(k):
93 d += abs(ngrams[k] - self.ngrams[k])
94 else:
95 d += nb_ngrams
96 return d
97
98
99 import os
100 import glob
101
104 self.ngrams = dict()
105 folder = os.path.join(folder, '*'+ext)
106 size = len(ext)
107 count = 0
108
109 for fname in glob.glob(os.path.normcase(folder)):
110 count += 1
111 lang = os.path.split(fname)[-1][:-size]
112 ngrams = {}
113 lines = open(fname, 'r').readlines()
114
115 try:
116 i = len(lines)
117 for line in lines:
118 line = line.decode('utf-8')
119 parts = line[:-1].split()
120 if len(parts) != 2:
121 try:
122 ngrams[parts[0]] = i
123 except IndexError:
124 pass
125 else:
126 ngrams[parts[0]] = int(parts[1])
127 i -= 1
128 except UnicodeDecodeError, e:
129 continue
130
131 if ngrams:
132 self.ngrams[lang] = _NGram(ngrams)
133
134 if not count:
135 raise ValueError("no language files found")
136
154
def __init__(self, folder, ext='.txt'):
    """Build one ngram profile per ``*<ext>`` file found in *folder*.

    The language name is the file name with *ext* stripped.  Each file's
    text is counted with ``_NGram.addText`` and then passed through
    ``_NGram.normalise``; the result is stored in ``self.ngrams[lang]``.
    """
    self.ngrams = dict()
    pattern = os.path.join(folder, '*' + ext)
    size = len(ext)

    for fname in glob.glob(os.path.normcase(pattern)):
        basename = os.path.split(fname)[-1]
        # Explicit end index: ``basename[:-0]`` would be empty when
        # ``ext`` is ''.
        lang = basename[:len(basename) - size]

        with open(fname, 'r') as source:
            text = source.read()

        n = _NGram()
        # addText() replaces the stored counts on every call, so feeding
        # it line by line kept only the last line.  Count the whole file
        # in one call instead.  An empty file keeps the empty profile.
        if text:
            n.addText(text)

        n.normalise()
        self.ngrams[lang] = n
172
def save(self, folder, ext='.lm'):
    """Write every language profile to ``<folder>/<lang><ext>``.

    Each file gets one ``ngram<TAB> value`` line per pair yielded by the
    profile's ``sorted()`` method, in that order, encoded as UTF-8.
    """
    for lang, profile in self.ngrams.items():
        fname = os.path.join(folder, lang + ext)
        # Binary mode with explicit UTF-8 encoding: writing unicode
        # ngrams to a text-mode file on Python 2 fails on the first
        # non-ASCII character, and on Python 3 it depends on the locale.
        with open(fname, 'wb') as out:
            for v, k in profile.sorted():
                line = "%s\t %d\n" % (k, v)
                if not isinstance(line, bytes):
                    line = line.encode('utf-8')
                out.write(line)
180
if __name__ == '__main__':
    # Command-line use: classify one line read from standard input
    # against the models in the 'langmodels' data directory and print
    # the result.
    import sys

    from translate.misc.file_discovery import get_abs_data_filename

    text = sys.stdin.readline()
    classifier = NGram(get_abs_data_filename('langmodels'))
    # The parenthesised form prints the same on Python 2 and is also
    # valid syntax on Python 3.
    print(classifier.classify(text))
192