Package translate :: Package lang :: Module identify
[hide private]
[frames | no frames]

Source Code for Module translate.lang.identify

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """ 
 22  This module contains functions for identifying languages based on language 
 23  models. 
 24  """ 
 25   
 26  from os import extsep, path 
 27   
 28  from translate.misc.file_discovery import get_abs_data_filename 
 29  from translate.storage.base import TranslationStore 
 30   
 31  from ngram import NGram 
 32   
 33   
class LanguageIdentifier(object):
    """Identify the language of text with the help of ngram language models.

    A configuration file maps language (model) names to language codes.
    Results returned by the C{NGram} classifier are translated to those codes
    whenever a mapping for them exists.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to C{MODEL_DIR}).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Load the name-code mapping and create the ngram classifier.

        @param model_dir: Directory holding the language models. Defaults to
            C{MODEL_DIR} when C{None}.
        @param conf_file: Name of the configuration file, joined onto
            C{model_dir}. Defaults to C{CONF_FILE} when C{None}.
        @raise ValueError: If the model directory or the configuration file
            does not exist.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            # One-element tuple so that a tuple argument cannot break the
            # string formatting.
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in the
        configuration file.

        Every data line must contain at least two whitespace-separated fields:
        the language (model) name followed by the language code. Empty lines,
        comment lines (first field starting with C{#}) and lines with fewer
        than two fields are skipped. The result is stored in
        C{self._lang_codes}.

        @param conf_file: Path of the configuration file to read.
        """
        # try/finally guarantees the handle is closed even if reading fails,
        # without depending on "with" statement support.
        conf = open(conf_file)
        try:
            lines = conf.read().splitlines()
        finally:
            conf.close()

        self._lang_codes = {}
        for line in lines:
            parts = line.split()
            # Test the first field rather than the raw line so that indented
            # comments are recognised as comments too.
            if not parts or parts[0].startswith('#'):
                continue  # Skip comment- and empty lines
            if len(parts) < 2:
                continue  # Skip malformed lines that lack a language code
            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.basename(lname)
            # Remove the extension, if there is one
            lname = lname.rsplit(extsep, 1)[0]

            # Remove a trailing '-utf8' and then a dangling '-' or '_'
            # separator from the code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith('-') or lcode.endswith('_'):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        @param text: The text to classify.
        @returns: C{None} for empty text. Otherwise the language code mapped
            to the classifier's result, or the raw classifier result if no
            mapping exists for it.
        """
        if not text:
            return None
        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_units_lang(self, instore, attr, max_units):
        """Identify the language of the C{attr} text of the first
        C{max_units} units of C{instore}.

        Shared implementation of L{identify_source_lang} and
        L{identify_target_lang}.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        texts = []
        for unit in instore[:max_units]:
            if not unit.istranslatable():
                continue
            unit_text = getattr(unit, attr)
            if unit_text:
                texts.append(unit_text)

        text = u' '.join(texts)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        Only the first 50 units are inspected.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract source text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        Only the first 200 units are inspected.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract target text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'target', 200)
if __name__ == "__main__":
    # Command-line use: identify the language of the text given as the first
    # argument, using the models under ../share/langmodels relative to the
    # script's own directory.
    import locale
    import sys
    from sys import argv

    if len(argv) < 2:
        # Exit with a usage message (written to stderr, exit status 1)
        # instead of failing with an IndexError on argv[1].
        sys.exit("Usage: %s <text>" % (argv[0],))

    script_dir = path.abspath(path.dirname(argv[0]))
    identifier = LanguageIdentifier(path.join(script_dir, '..', 'share', 'langmodels'))
    encoding = locale.getpreferredencoding()
    detected = identifier.identify_lang(argv[1].decode(encoding))
    print("Language detected: %s" % (detected,))