# pyenchant # # Copyright (C) 2004-2008 Ryan Kelly # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # In addition, as a special exception, you are # given permission to link the code of this program with # non-LGPL Spelling Provider libraries (eg: a MSFT Office # spell checker backend) and distribute linked combinations including # the two. You must obey the GNU Lesser General Public License in all # respects for all of the code used other than said providers. If you modify # this file, you may extend this exception to your version of the # file, but you are not obligated to do so. If you do not wish to # do so, delete this exception statement from your version. # """ enchant.utils: Misc utilities for the enchant package ======================================================== This module provides miscellaneous utilities for use with the enchant spellchecking package. Currently available functionality includes: * string/unicode compatibility wrappers * functions for dealing with locale/language settings * ability to list supporting data files (win32 only) * functions for bundling supporting data files from a build """ import os import sys import codecs from enchant.errors import * # Attempt to access local language information try: import locale except ImportError: locale = None # # Unicode/Bytes compatabilty wrappers. # # These allow us to support both Python 2.x and Python 3.x from # the same codebase. # # We provide explicit type objects "bytes" and "unicode" that can be # used to construct instances of the appropriate type. The class # "EnchantStr" derives from the default "str" type and implements the # necessary logic for encoding/decoding as strings are passed into # the underlying C library (where they must always be utf-8 encoded # byte strings). # try: unicode = unicode except NameError: str = str unicode = str bytes = bytes basestring = (str, bytes) else: str = str unicode = unicode bytes = str basestring = basestring def raw_unicode(raw): """Make a unicode string from a raw string. This function takes a string containing unicode escape characters, and returns the corresponding unicode string. Useful for writing unicode string literals in your python source while being upwards- compatible with Python 3. For example, instead of doing this: s = u"hello\u2149" # syntax error in Python 3 Or this: s = "hello\u2149" # not what you want in Python 2.x You can do this: s = raw_unicode(r"hello\u2149") # works everywhere! """ return raw.encode("utf8").decode("unicode-escape") def raw_bytes(raw): """Make a bytes object out of a raw string. This is analogous to raw_unicode, but processes byte escape characters to produce a bytes object. """ return codecs.escape_decode(raw)[0] class EnchantStr(str): """String subclass for interfacing with enchant C library. This class encapsulates the logic for interfacing between python native string/unicode objects and the underlying enchant library, which expects all strings to be UTF-8 character arrays. It is a subclass of the default string class 'str' - on Python 2.x that makes it an ascii string, on Python 3.x it is a unicode object. Initialise it with a string or unicode object, and use the encode() method to obtain an object suitable for passing to the underlying C library. When strings are read back into python, use decode(s) to translate them back into the appropriate python-level string type. This allows us to following the common Python 2.x idiom of returning unicode when unicode is passed in, and byte strings otherwise. It also lets the interface be upwards-compatible with Python 3, in which string objects are unicode by default. """ def __new__(cls, value): """EnchantStr data constructor. This method records whether the initial string was unicode, then simply passes it along to the default string constructor. """ if type(value) is unicode: was_unicode = True if str is not unicode: value = value.encode("utf-8") else: was_unicode = False if str is not bytes: raise Error("Don't pass bytestrings to pyenchant") self = str.__new__(cls, value) self._was_unicode = was_unicode return self def encode(self): """Encode this string into a form usable by the enchant C library.""" if str is unicode: return str.encode(self, "utf-8") else: return self def decode(self, value): """Decode a string returned by the enchant C library.""" if self._was_unicode: if str is unicode: # On some python3 versions, ctypes converts c_char_p # to str() rather than bytes() if isinstance(value, str): value = value.encode() return value.decode("utf-8") else: return value.decode("utf-8") else: return value def printf(values, sep=" ", end="\n", file=None): """Compatability wrapper from print statement/function. This function is a simple Python2/Python3 compatability wrapper for printing to stdout. """ if file is None: file = sys.stdout file.write(sep.join(map(str, values))) file.write(end) try: next = next except NameError: def next(iter): """Compatability wrapper for advancing an iterator.""" return iter.next() try: xrange = xrange except NameError: xrange = range # # Other useful functions. # def levenshtein(s1, s2): """Calculate the Levenshtein distance between two strings. This is straight from Wikipedia. """ if len(s1) < len(s2): return levenshtein(s2, s1) if not s1: return len(s2) previous_row = xrange(len(s2) + 1) for i, c1 in enumerate(s1): current_row = [i + 1] for j, c2 in enumerate(s2): insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row return previous_row[-1] def trim_suggestions(word, suggs, maxlen, calcdist=None): """Trim a list of suggestions to a maximum length. If the list of suggested words is too long, you can use this function to trim it down to a maximum length. It tries to keep the "best" suggestions based on similarity to the original word. If the optional "calcdist" argument is provided, it must be a callable taking two words and returning the distance between them. It will be used to determine which words to retain in the list. The default is a simple Levenshtein distance. """ if calcdist is None: calcdist = levenshtein decorated = [(calcdist(word, s), s) for s in suggs] decorated.sort() return [s for (l, s) in decorated[:maxlen]] def get_default_language(default=None): """Determine the user's default language, if possible. This function uses the 'locale' module to try to determine the user's preferred language. The return value is as follows: * if a locale is available for the LC_MESSAGES category, that language is used * if a default locale is available, that language is used * if the keyword argument is given, it is used * if nothing else works, None is returned Note that determining the user's language is in general only possible if they have set the necessary environment variables on their system. """ try: import locale tag = locale.getlocale()[0] if tag is None: tag = locale.getdefaultlocale()[0] if tag is None: raise Error("No default language available") return tag except Exception: pass return default get_default_language._DOC_ERRORS = ["LC"] def get_resource_filename(resname): """Get the absolute path to the named resource file. This serves widely the same purpose as pkg_resources.resource_filename(), but tries to avoid loading pkg_resources unless we're actually in an egg. """ path = os.path.dirname(os.path.abspath(__file__)) path = os.path.join(path, resname) if os.path.exists(path): return path if hasattr(sys, "frozen"): exe_path = unicode(sys.executable, sys.getfilesystemencoding()) exe_dir = os.path.dirname(exe_path) path = os.path.join(exe_dir, resname) if os.path.exists(path): return path else: import pkg_resources try: path = pkg_resources.resource_filename("enchant", resname) except KeyError: pass else: path = os.path.abspath(path) if os.path.exists(path): return path raise Error("Could not locate resource '%s'" % (resname,)) def win32_data_files(): """Get list of supporting data files, for use with setup.py This function returns a list of the supporting data files available to the running version of PyEnchant. This is in the format expected by the data_files argument of the distutils setup function. It's very useful, for example, for including the data files in an executable produced by py2exe. Only really tested on the win32 platform (it's the only platform for which we ship our own supporting data files) """ # Include the main enchant DLL try: libEnchant = get_resource_filename("libenchant.dll") except Error: libEnchant = get_resource_filename("libenchant-1.dll") mainDir = os.path.dirname(libEnchant) dataFiles = [('', [libEnchant])] # And some specific supporting DLLs for dll in os.listdir(mainDir): if not dll.endswith(".dll"): continue for prefix in ("iconv", "intl", "libglib", "libgmodule"): if dll.startswith(prefix): break else: continue dataFiles[0][1].append(os.path.join(mainDir, dll)) # And anything found in the supporting data directories dataDirs = ("share/enchant/myspell", "share/enchant/ispell", "lib/enchant") for dataDir in dataDirs: files = [] fullDir = os.path.join(mainDir, os.path.normpath(dataDir)) for fn in os.listdir(fullDir): fullFn = os.path.join(fullDir, fn) if os.path.isfile(fullFn): files.append(fullFn) dataFiles.append((dataDir, files)) return dataFiles win32_data_files._DOC_ERRORS = ["py", "py", "exe"]