1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """
22 This module contains functions for identifying languages based on language
23 models.
24 """
25
26 from os import extsep, path
27
28 from translate.misc.file_discovery import get_abs_data_filename
29 from translate.storage.base import TranslationStore
30
31 from ngram import NGram
32
33
# Default model directory; __init__ falls back to it when model_dir is None.
MODEL_DIR = get_abs_data_filename('langmodels')
"""The directory containing the ngram language model files."""
# Default configuration file name; __init__ falls back to it when conf_file
# is None and joins it onto the model directory.
CONF_FILE = 'fpdb.conf'
"""
The name of the file that contains language name-code pairs
(relative to C{MODEL_DIR}).
"""
42
43 - def __init__(self, model_dir=None, conf_file=None):
44 if model_dir is None:
45 model_dir = self.MODEL_DIR
46 if not path.isdir(model_dir):
47 raise ValueError('Directory does not exist: %s' % (model_dir))
48
49 if conf_file is None:
50 conf_file = self.CONF_FILE
51 conf_file = path.abspath(path.join(model_dir, conf_file))
52 if not path.isfile(conf_file):
53 raise ValueError('File does not exist: %s' % (conf_file))
54
55 self._load_config(conf_file)
56 self.ngram = NGram(model_dir)
57
59 """Load the mapping of language names to language codes as given in the
60 configuration file."""
61 lines = open(conf_file).read().splitlines()
62 self._lang_codes = {}
63 for line in lines:
64 parts = line.split()
65 if not parts or line.startswith('#'):
66 continue
67 lname, lcode = parts[0], parts[1]
68
69 lname = path.split(lname)[-1]
70 if extsep in lname:
71 lname = lname[:lname.rindex(extsep)]
72
73
74 if lcode.endswith('-utf8'):
75 lcode = lcode[:-len('-utf8')]
76 if lcode.endswith('-') or lcode.endswith('_'):
77 lcode = lcode[:-1]
78
79 self._lang_codes[lname] = lcode
80
def identify_lang(self, text):
    """Identify the language of the text in the given string.

    @param text: The text to classify.
    @returns: C{None} for empty text; otherwise the code mapped to the
        classifier's result, or the classifier's result itself when no
        code is known for it.
    """
    if text:
        guess = self.ngram.classify(text)
        # Fall back to the raw classifier result when it has no known code.
        return self._lang_codes.get(guess, guess)
    return None
89
def identify_source_lang(self, instore):
    """Identify the source language of the given translation store or
    units.

    Only the first 50 units are sampled, and of those only translatable
    units with a non-empty source contribute text.

    @type instore: C{TranslationStore} or list or tuple of
        C{TranslationUnit}s.
    @param instore: The translation store to extract source text from.
    @returns: The identified language's code or C{None} if the language
        could not be identified."""
    if isinstance(instore, (TranslationStore, list, tuple)):
        sources = []
        for unit in instore[:50]:
            if unit.istranslatable() and unit.source:
                sources.append(unit.source)
        sample = u' '.join(sources)
        if sample:
            return self.identify_lang(sample)
    # Unsupported container type, or no usable source text was found.
    return None
106
def identify_target_lang(self, instore):
    """Identify the target language of the given translation store or
    units.

    Only the first 200 units are sampled, and of those only translatable
    units with a non-empty target contribute text.

    @type instore: C{TranslationStore} or list or tuple of
        C{TranslationUnit}s.
    @param instore: The translation store to extract target text from.
    @returns: The identified language's code or C{None} if the language
        could not be identified."""
    if isinstance(instore, (TranslationStore, list, tuple)):
        targets = []
        for unit in instore[:200]:
            if unit.istranslatable() and unit.target:
                targets.append(unit.target)
        sample = u' '.join(targets)
        if sample:
            return self.identify_lang(sample)
    # Unsupported container type, or no usable target text was found.
    return None
123
if __name__ == "__main__":
    # Command-line check: identify the language of the text passed as the
    # first argument, using the models under ../share/langmodels relative to
    # this script.
    import locale
    import sys
    from os import path

    if len(sys.argv) < 2:
        # Report the missing argument instead of failing with an IndexError.
        sys.stderr.write("Usage: %s TEXT\n" % sys.argv[0])
        sys.exit(1)

    script_dir = path.abspath(path.dirname(sys.argv[0]))
    identifier = LanguageIdentifier(path.join(script_dir, '..', 'share', 'langmodels'))

    text = sys.argv[1]
    if isinstance(text, bytes):
        # Byte-string arguments are decoded with the locale's preferred
        # encoding; arguments that are already text are used as they are.
        text = text.decode(locale.getpreferredencoding())
    print("Language detected: %s" % identifier.identify_lang(text))
132