Updated statsgen.py, maskgen.py, and policygen.py to Python3

Deleted policygen.py (too much work)
This commit is contained in:
larry.spohn
2024-08-07 09:08:45 -04:00
parent 2c041086bc
commit 25e2ec481e
22 changed files with 359 additions and 6499 deletions

View File

@@ -1,907 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2011, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant: Access to the enchant spellchecking library
=====================================================
This module provides several classes for performing spell checking
via the Enchant spellchecking library. For more details on Enchant,
visit the project website:
http://www.abisource.com/enchant/
Spellchecking is performed using 'Dict' objects, which represent
a language dictionary. Their use is best demonstrated by a quick
example::
>>> import enchant
>>> d = enchant.Dict("en_US") # create dictionary for US English
>>> d.check("enchant")
True
>>> d.check("enchnt")
False
>>> d.suggest("enchnt")
['enchant', 'enchants', 'enchanter', 'penchant', 'incant', 'enchain', 'enchanted']
Languages are identified by standard string tags such as "en" (English)
and "fr" (French). Specific language dialects can be specified by
including an additional code - for example, "en_AU" refers to Australian
English. The latter form is preferred as it is more widely supported.
To check whether a dictionary exists for a given language, the function
'dict_exists' is available. Dictionaries may also be created using the
function 'request_dict'.
A finer degree of control over the dictionaries and how they are created
can be obtained using one or more 'Broker' objects. These objects are
responsible for locating dictionaries for a specific language.
In Python 2.x, unicode strings are supported transparently in the
standard manner - if a unicode string is given as an argument, the
result will be a unicode string. Note that Enchant works in UTF-8
internally, so passing an ASCII string to a dictionary for a language
requiring Unicode may result in UTF-8 strings being returned.
In Python 3.x unicode strings are expected throughout. Bytestrings
should not be passed into any functions.
Errors that occur in this module are reported by raising subclasses
of 'Error'.
"""
# Deliberately misspelled words that appear in the doctest examples above;
# presumably consumed by pyenchant's own documentation spell-check tests.
_DOC_ERRORS = ['enchnt', 'enchnt', 'fr']
# Make version info available
__ver_major__ = 1
__ver_minor__ = 6
__ver_patch__ = 6
__ver_sub__ = ""
__version__ = "%d.%d.%d%s" % (__ver_major__, __ver_minor__,
                              __ver_patch__, __ver_sub__)
import os
try:
from enchant import _enchant as _e
except ImportError:
if not os.environ.get("PYENCHANT_IGNORE_MISSING_LIB", False):
raise
_e = None
from enchant.errors import *
from enchant.utils import EnchantStr, get_default_language
from enchant.pypwl import PyPWL
# Due to the unfortunate name collision between the enchant "tokenize" module
# and the stdlib "tokenize" module, certain values of sys.path can cause
# the former to override the latter and break the "warnings" module.
# This hacks around it by making a dummy "warnings" module.
try:
    import warnings
except ImportError:
    # Minimal stand-in exposing warn(), the only warnings API used in
    # this module; it silently discards the warning.
    class warnings(object):
        def warn(self, *args, **kwds):
            pass
    warnings = warnings()
class ProviderDesc(object):
    """Simple class describing an Enchant provider.
    Each provider has the following information associated with it:
    * name: Internal provider name (e.g. "aspell")
    * desc: Human-readable description (e.g. "Aspell Provider")
    * file: Location of the library containing the provider
    """
    _DOC_ERRORS = ["desc"]
    def __init__(self, name, desc, file):
        self.name = name
        self.desc = desc
        self.file = file
    def __str__(self):
        return "<Enchant: %s>" % self.desc
    def __repr__(self):
        return str(self)
    def __eq__(self, pd):
        """Equality operator on ProviderDesc objects.
        Fixed: comparing against a non-ProviderDesc object previously
        raised AttributeError; returning NotImplemented lets Python fall
        back to the default comparison (yielding False) instead.
        """
        if not isinstance(pd, ProviderDesc):
            return NotImplemented
        return (self.name == pd.name and
                self.desc == pd.desc and
                self.file == pd.file)
    def __hash__(self):
        """Hash operator on ProviderDesc objects."""
        return hash(self.name + self.desc + self.file)
class _EnchantObject(object):
    """Base class for enchant objects.
    This class implements some general functionality for interfacing with
    the '_enchant' C-library in a consistent way. All public objects
    from the 'enchant' module are subclasses of this class.
    All enchant objects have an attribute '_this' which contains the
    pointer to the underlying C-library object. The method '_check_this'
    can be called to ensure that this pointer is not None, raising an
    exception if it is.
    """
    def __init__(self):
        """_EnchantObject constructor."""
        self._this = None
        # To be importable when enchant C lib is missing, we need
        # to create a dummy default broker.
        if _e is not None:
            self._init_this()
    def _check_this(self, msg=None):
        """Check that self._this is set to a pointer, rather than None."""
        if msg is None:
            msg = "%s unusable: the underlying C-library object has been freed."
            msg = msg % (self.__class__.__name__,)
        if self._this is None:
            raise Error(msg)
    def _init_this(self):
        """Initialise the underlying C-library object pointer.
        Subclasses must override this hook; it is invoked both from
        __init__ and from __setstate__ after unpickling.
        """
        raise NotImplementedError
    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Raise an exception based on available error messages.
        This method causes an Error to be raised. Subclasses should
        override it to retrieve an error indication from the underlying
        API if possible. If such a message cannot be retrieved, the
        argument value <default> is used. The class of the exception
        can be specified using the argument <eclass>
        """
        raise eclass(default)
    _raise_error._DOC_ERRORS = ["eclass"]
    def __getstate__(self):
        """Customize pickling of PyEnchant objects.
        Since it's not safe for multiple objects to share the same C-library
        object, we make sure it's unset when pickling.
        """
        state = self.__dict__.copy()
        state["_this"] = None
        return state
    def __setstate__(self, state):
        # Restore attributes, then re-acquire a fresh C-library object
        # to replace the one dropped by __getstate__.
        self.__dict__.update(state)
        self._init_this()
class Broker(_EnchantObject):
    """Broker object for the Enchant spellchecker.
    Broker objects are responsible for locating and managing dictionaries.
    Unless custom functionality is required, there is no need to use Broker
    objects directly. The 'enchant' module provides a default broker object
    so that 'Dict' objects can be created directly.
    The most important methods of this class include:
    * dict_exists: check existence of a specific language dictionary
    * request_dict: obtain a dictionary for specific language
    * set_ordering: specify which dictionaries to try for a
    given language.
    """
    def __init__(self):
        """Broker object constructor.
        This method is the constructor for the 'Broker' object. No
        arguments are required.
        """
        _EnchantObject.__init__(self)
    def _init_this(self):
        # Acquire the underlying C-library broker object.
        self._this = _e.broker_init()
        if not self._this:
            raise Error("Could not initialise an enchant broker.")
    def __del__(self):
        """Broker object destructor."""
        # Guard against the C library being absent (dummy-broker mode).
        if _e is not None:
            self._free()
    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Overrides _EnchantObject._raise_error to check broker errors."""
        err = _e.broker_get_error(self._this)
        if err == "" or err is None:
            raise eclass(default)
        raise eclass(err)
    def _free(self):
        """Free system resource associated with a Broker object.
        This method can be called to free the underlying system resources
        associated with a Broker object. It is called automatically when
        the object is garbage collected. If called explicitly, the
        Broker and any associated Dict objects must no longer be used.
        """
        if self._this is not None:
            _e.broker_free(self._this)
            self._this = None
    def request_dict(self, tag=None):
        """Request a Dict object for the language specified by <tag>.
        This method constructs and returns a Dict object for the
        requested language. 'tag' should be a string of the appropriate
        form for specifying a language, such as "fr" (French) or "en_AU"
        (Australian English). The existence of a specific language can
        be tested using the 'dict_exists' method.
        If <tag> is not given or is None, an attempt is made to determine
        the current language in use. If this cannot be determined, Error
        is raised.
        NOTE: this method is functionally equivalent to calling the Dict()
        constructor and passing in the <broker> argument.
        """
        return Dict(tag, self)
    request_dict._DOC_ERRORS = ["fr"]
    def _request_dict_data(self, tag):
        """Request raw C pointer data for a dictionary.
        This method call passes on the call to the C library, and does
        some internal bookkeeping.
        """
        self._check_this()
        tag = EnchantStr(tag)
        new_dict = _e.broker_request_dict(self._this, tag.encode())
        if new_dict is None:
            eStr = "Dictionary for language '%s' could not be found"
            self._raise_error(eStr % (tag,), DictNotFoundError)
        return new_dict
    def request_pwl_dict(self, pwl):
        """Request a Dict object for a personal word list.
        This method behaves as 'request_dict' but rather than returning
        a dictionary for a specific language, it returns a dictionary
        referencing a personal word list. A personal word list is a file
        of custom dictionary entries, one word per line.
        """
        self._check_this()
        pwl = EnchantStr(pwl)
        new_dict = _e.broker_request_pwl_dict(self._this, pwl.encode())
        if new_dict is None:
            eStr = "Personal Word List file '%s' could not be loaded"
            self._raise_error(eStr % (pwl,))
        # Create a 'dead' Dict object and switch the pwl data into it.
        d = Dict(False)
        d._switch_this(new_dict, self)
        return d
    def _free_dict(self, dict):
        """Free memory associated with a dictionary.
        This method frees system resources associated with a Dict object.
        It is equivalent to calling the object's 'free' method. Once this
        method has been called on a dictionary, it must not be used again.
        """
        self._check_this()
        _e.broker_free_dict(self._this, dict._this)
        dict._this = None
        dict._broker = None
    def dict_exists(self, tag):
        """Check availability of a dictionary.
        This method checks whether there is a dictionary available for
        the language specified by 'tag'. It returns True if a dictionary
        is available, and False otherwise.
        """
        self._check_this()
        tag = EnchantStr(tag)
        val = _e.broker_dict_exists(self._this, tag.encode())
        return bool(val)
    def set_ordering(self, tag, ordering):
        """Set dictionary preferences for a language.
        The Enchant library supports the use of multiple dictionary programs
        and multiple languages. This method specifies which dictionaries
        the broker should prefer when dealing with a given language. 'tag'
        must be an appropriate language specification and 'ordering' is a
        string listing the dictionaries in order of preference. For example
        a valid ordering might be "aspell,myspell,ispell".
        The value of 'tag' can also be set to "*" to set a default ordering
        for all languages for which one has not been set explicitly.
        """
        self._check_this()
        tag = EnchantStr(tag)
        ordering = EnchantStr(ordering)
        _e.broker_set_ordering(self._this, tag.encode(), ordering.encode())
    def describe(self):
        """Return list of provider descriptions.
        This method returns a list of descriptions of each of the
        dictionary providers available. Each entry in the list is a
        ProviderDesc object.
        """
        self._check_this()
        # The C callback below appends into this list as a side effect.
        self.__describe_result = []
        _e.broker_describe(self._this, self.__describe_callback)
        return [ProviderDesc(*r) for r in self.__describe_result]
    def __describe_callback(self, name, desc, file):
        """Collector callback for dictionary description.
        This method is used as a callback into the _enchant function
        'enchant_broker_describe'. It collects the given arguments in
        a tuple and appends them to the list '__describe_result'.
        """
        s = EnchantStr("")
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__describe_result.append((name, desc, file))
    def list_dicts(self):
        """Return list of available dictionaries.
        This method returns a list of dictionaries available to the
        broker. Each entry in the list is a two-tuple of the form:
        (tag,provider)
        where <tag> is the language tag for the dictionary and
        <provider> is a ProviderDesc object describing the provider
        through which that dictionary can be obtained.
        """
        self._check_this()
        # The C callback below appends into this list as a side effect.
        self.__list_dicts_result = []
        _e.broker_list_dicts(self._this, self.__list_dicts_callback)
        return [(r[0], ProviderDesc(*r[1])) for r in self.__list_dicts_result]
    def __list_dicts_callback(self, tag, name, desc, file):
        """Collector callback for listing dictionaries.
        This method is used as a callback into the _enchant function
        'enchant_broker_list_dicts'. It collects the given arguments into
        an appropriate tuple and appends them to '__list_dicts_result'.
        """
        s = EnchantStr("")
        tag = s.decode(tag)
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__list_dicts_result.append((tag, (name, desc, file)))
    def list_languages(self):
        """List languages for which dictionaries are available.
        This function returns a list of language tags for which a
        dictionary is available.
        """
        # De-duplicate while preserving the order reported by list_dicts.
        langs = []
        for (tag, prov) in self.list_dicts():
            if tag not in langs:
                langs.append(tag)
        return langs
    def __describe_dict(self, dict_data):
        """Get the description tuple for a dict data object.
        <dict_data> must be a C-library pointer to an enchant dictionary.
        The return value is a tuple of the form:
        (<tag>,<name>,<desc>,<file>)
        """
        # Define local callback function
        cb_result = []
        def cb_func(tag, name, desc, file):
            s = EnchantStr("")
            tag = s.decode(tag)
            name = s.decode(name)
            desc = s.decode(desc)
            file = s.decode(file)
            cb_result.append((tag, name, desc, file))
        # Actually call the describer function
        _e.dict_describe(dict_data, cb_func)
        return cb_result[0]
    __describe_dict._DOC_ERRORS = ["desc"]
    def get_param(self, name):
        """Get the value of a named parameter on this broker.
        Parameters are used to provide runtime information to individual
        provider backends. See the method 'set_param' for more details.
        """
        name = EnchantStr(name)
        return name.decode(_e.broker_get_param(self._this, name.encode()))
    get_param._DOC_ERRORS = ["param"]
    def set_param(self, name, value):
        """Set the value of a named parameter on this broker.
        Parameters are used to provide runtime information to individual
        provider backends. For example, the myspell provider will search
        any directories given in the "enchant.myspell.dictionary.path"
        parameter when looking for its dictionary files.
        """
        name = EnchantStr(name)
        value = EnchantStr(value)
        _e.broker_set_param(self._this, name.encode(), value.encode())
class Dict(_EnchantObject):
    """Dictionary object for the Enchant spellchecker.
    Dictionary objects are responsible for checking the spelling of words
    and suggesting possible corrections. Each dictionary is owned by a
    Broker object, but unless a new Broker has explicitly been created
    then this will be the 'enchant' module default Broker and is of little
    interest.
    The important methods of this class include:
    * check(): check whether a word is spelled correctly
    * suggest(): suggest correct spellings for a word
    * add(): add a word to the user's personal dictionary
    * remove(): add a word to the user's personal exclude list
    * add_to_session(): add a word to the current spellcheck session
    * store_replacement(): indicate a replacement for a given word
    Information about the dictionary is available using the following
    attributes:
    * tag: the language tag of the dictionary
    * provider: a ProviderDesc object for the dictionary provider
    """
    def __init__(self, tag=None, broker=None):
        """Dict object constructor.
        A dictionary belongs to a specific language, identified by the
        string <tag>. If the tag is not given or is None, an attempt to
        determine the language currently in use is made using the 'locale'
        module. If the current language cannot be determined, Error is raised.
        If <tag> is instead given the value of False, a 'dead' Dict object
        is created without any reference to a language. This is typically
        only useful within PyEnchant itself. Any other non-string value
        for <tag> raises Error.
        Each dictionary must also have an associated Broker object which
        obtains the dictionary information from the underlying system. This
        may be specified using <broker>. If not given, the default broker
        is used.
        """
        # Initialise misc object attributes to None
        self.provider = None
        # If no tag was given, use the default language
        if tag is None:
            tag = get_default_language()
            if tag is None:
                err = "No tag specified and default language could not "
                err = err + "be determined."
                raise Error(err)
        self.tag = tag
        # If no broker was given, use the default broker
        if broker is None:
            broker = _broker
        self._broker = broker
        # Now let the superclass initialise the C-library object
        _EnchantObject.__init__(self)
    def _init_this(self):
        # Create dead object if False was given.
        # Otherwise, use the broker to get C-library pointer data.
        self._this = None
        if self.tag:
            this = self._broker._request_dict_data(self.tag)
            self._switch_this(this, self._broker)
    def __del__(self):
        """Dict object destructor."""
        # Calling free() might fail if python is shutting down
        try:
            self._free()
        except AttributeError:
            pass
    def _switch_this(self, this, broker):
        """Switch the underlying C-library pointer for this object.
        As all useful state for a Dict is stored by the underlying C-library
        pointer, it is very convenient to allow this to be switched at
        run-time. Pass a new dict data object into this method to affect
        the necessary changes. The creating Broker object (at the Python
        level) must also be provided.
        This should *never* *ever* be used by application code. It's
        a convenience for developers only, replacing the clunkier <data>
        parameter to __init__ from earlier versions.
        """
        # Free old dict data
        Dict._free(self)
        # Hook in the new stuff
        self._this = this
        self._broker = broker
        # Update object properties
        desc = self.__describe(check_this=False)
        self.tag = desc[0]
        self.provider = ProviderDesc(*desc[1:])
    _switch_this._DOC_ERRORS = ["init"]
    def _check_this(self, msg=None):
        """Extend _EnchantObject._check_this() to check Broker validity.
        It is possible for the managing Broker object to be freed without
        freeing the Dict. Thus validity checking must take into account
        self._broker._this as well as self._this.
        """
        if self._broker is None or self._broker._this is None:
            self._this = None
        _EnchantObject._check_this(self, msg)
    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Overrides _EnchantObject._raise_error to check dict errors."""
        err = _e.dict_get_error(self._this)
        if err == "" or err is None:
            raise eclass(default)
        raise eclass(err)
    def _free(self):
        """Free the system resources associated with a Dict object.
        This method frees underlying system resources for a Dict object.
        Once it has been called, the Dict object must no longer be used.
        It is called automatically when the object is garbage collected.
        """
        if self._broker is not None and self._this is not None:
            self._broker._free_dict(self)
    def check(self, word):
        """Check spelling of a word.
        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and false otherwise.
        """
        self._check_this()
        word = EnchantStr(word)
        val = _e.dict_check(self._this, word.encode())
        # C return codes: 0 => correctly spelled, positive => not found,
        # negative => an error occurred in the underlying library.
        if val == 0:
            return True
        if val > 0:
            return False
        self._raise_error()
    def suggest(self, word):
        """Suggest possible spellings for a word.
        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        self._check_this()
        word = EnchantStr(word)
        suggs = _e.dict_suggest(self._this, word.encode())
        return [word.decode(w) for w in suggs]
    def add(self, word):
        """Add a word to the user's personal word list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add(self._this, word.encode())
    def remove(self, word):
        """Add a word to the user's personal exclude list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_remove(self._this, word.encode())
    def add_to_pwl(self, word):
        """Add a word to the user's personal word list.
        Deprecated: use add() instead.
        """
        warnings.warn("Dict.add_to_pwl is deprecated, please use Dict.add",
                      category=DeprecationWarning, stacklevel=2)
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add_to_pwl(self._this, word.encode())
    def add_to_session(self, word):
        """Add a word to the session personal list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add_to_session(self._this, word.encode())
    def remove_from_session(self, word):
        """Add a word to the session exclude list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_remove_from_session(self._this, word.encode())
    def is_added(self, word):
        """Check whether a word is in the personal word list."""
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_added(self._this, word.encode())
    def is_removed(self, word):
        """Check whether a word is in the personal exclude list."""
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_removed(self._this, word.encode())
    def is_in_session(self, word):
        """Check whether a word is in the session list.
        Deprecated: use is_added() instead.
        """
        warnings.warn("Dict.is_in_session is deprecated, " \
                      "please use Dict.is_added",
                      category=DeprecationWarning, stacklevel=2)
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_in_session(self._this, word.encode())
    def store_replacement(self, mis, cor):
        """Store a replacement spelling for a miss-spelled word.
        This method makes a suggestion to the spellchecking engine that the
        miss-spelled word <mis> is in fact correctly spelled as <cor>. Such
        a suggestion will typically mean that <cor> appears early in the
        list of suggested spellings offered for later instances of <mis>.
        """
        if not mis:
            raise ValueError("can't store replacement for an empty string")
        if not cor:
            raise ValueError("can't store empty string as a replacement")
        self._check_this()
        mis = EnchantStr(mis)
        cor = EnchantStr(cor)
        _e.dict_store_replacement(self._this, mis.encode(), cor.encode())
    store_replacement._DOC_ERRORS = ["mis", "mis"]
    def __describe(self, check_this=True):
        """Return a tuple describing the dictionary.
        This method returns a four-element tuple describing the underlying
        spellchecker system providing the dictionary. It will contain the
        following strings:
        * language tag
        * name of dictionary provider
        * description of dictionary provider
        * dictionary file
        Direct use of this method is not recommended - instead, access this
        information through the 'tag' and 'provider' attributes.
        """
        if check_this:
            self._check_this()
        # The C callback stores the tuple on self.__describe_result.
        _e.dict_describe(self._this, self.__describe_callback)
        return self.__describe_result
    def __describe_callback(self, tag, name, desc, file):
        """Collector callback for dictionary description.
        This method is used as a callback into the _enchant function
        'enchant_dict_describe'. It collects the given arguments in
        a tuple and stores them in the attribute '__describe_result'.
        """
        s = EnchantStr("")
        tag = s.decode(tag)
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__describe_result = (tag, name, desc, file)
class DictWithPWL(Dict):
    """Dictionary with separately-managed personal word list.
    NOTE: As of version 1.4.0, enchant manages a per-user pwl and
    exclude list. This class is now only needed if you want
    to explicitly maintain a separate word list in addition to
    the default one.
    This class behaves as the standard Dict class, but also manages a
    personal word list stored in a separate file. The file must be
    specified at creation time by the 'pwl' argument to the constructor.
    Words added to the dictionary are automatically appended to the pwl file.
    A personal exclude list can also be managed, by passing another filename
    to the constructor in the optional 'pel' argument. If this is not given,
    requests to exclude words are ignored.
    If either 'pwl' or 'pel' are None, an in-memory word list is used.
    This will prevent calls to add() and remove() from affecting the user's
    default word lists.
    The Dict object managing the PWL is available as the 'pwl' attribute.
    The Dict object managing the PEL is available as the 'pel' attribute.
    To create a DictWithPWL from the user's default language, use None
    as the 'tag' argument.
    """
    _DOC_ERRORS = ["pel", "pel", "PEL", "pel"]
    def __init__(self, tag, pwl=None, pel=None, broker=None):
        """DictWithPWL constructor.
        The argument 'pwl', if not None, names a file containing the
        personal word list. If this file does not exist, it is created
        with default permissions.
        The argument 'pel', if not None, names a file containing the personal
        exclude list. If this file does not exist, it is created with
        default permissions.
        """
        Dict.__init__(self, tag, broker)
        if pwl is not None:
            # Touch the file into existence before handing it to the broker.
            if not os.path.exists(pwl):
                f = open(pwl, "wt")
                f.close()
                del f
            self.pwl = self._broker.request_pwl_dict(pwl)
        else:
            # No file given: fall back to an in-memory pure-python word list.
            self.pwl = PyPWL()
        if pel is not None:
            if not os.path.exists(pel):
                f = open(pel, "wt")
                f.close()
                del f
            self.pel = self._broker.request_pwl_dict(pel)
        else:
            self.pel = PyPWL()
    def _check_this(self, msg=None):
        """Extend Dict._check_this() to check PWL validity."""
        if self.pwl is None:
            self._free()
        if self.pel is None:
            self._free()
        Dict._check_this(self, msg)
        self.pwl._check_this(msg)
        self.pel._check_this(msg)
    def _free(self):
        """Extend Dict._free() to free the PWL as well."""
        if self.pwl is not None:
            self.pwl._free()
            self.pwl = None
        if self.pel is not None:
            self.pel._free()
            self.pel = None
        Dict._free(self)
    def check(self, word):
        """Check spelling of a word.
        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and false otherwise. It checks
        both the dictionary and the personal word list.
        """
        # The personal exclude list takes precedence over everything else.
        if self.pel.check(word):
            return False
        if self.pwl.check(word):
            return True
        if Dict.check(self, word):
            return True
        return False
    def suggest(self, word):
        """Suggest possible spellings for a word.
        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        suggs = Dict.suggest(self, word)
        suggs.extend([w for w in self.pwl.suggest(word) if w not in suggs])
        # Iterate backwards so deletions don't disturb unvisited indices.
        for i in range(len(suggs) - 1, -1, -1):
            if self.pel.check(suggs[i]):
                del suggs[i]
        return suggs
    def add(self, word):
        """Add a word to the associated personal word list.
        This method adds the given word to the personal word list, and
        automatically saves the list to disk.
        """
        self._check_this()
        self.pwl.add(word)
        self.pel.remove(word)
    def remove(self, word):
        """Add a word to the associated exclude list."""
        self._check_this()
        self.pwl.remove(word)
        self.pel.add(word)
    def add_to_pwl(self, word):
        """Add a word to the associated personal word list.
        This method adds the given word to the personal word list, and
        automatically saves the list to disk.
        """
        self._check_this()
        self.pwl.add_to_pwl(word)
        self.pel.remove(word)
    def is_added(self, word):
        """Check whether a word is in the personal word list."""
        self._check_this()
        return self.pwl.is_added(word)
    def is_removed(self, word):
        """Check whether a word is in the personal exclude list."""
        self._check_this()
        return self.pel.is_added(word)
## Create a module-level default broker object, and make its important
## methods available at the module level.
_broker = Broker()
# These module-level names are bound methods of the default broker.
request_dict = _broker.request_dict
request_pwl_dict = _broker.request_pwl_dict
dict_exists = _broker.dict_exists
list_dicts = _broker.list_dicts
list_languages = _broker.list_languages
get_param = _broker.get_param
set_param = _broker.set_param
# Expose the "get_version" function.
def get_enchant_version():
    """Get the version string for the underlying enchant library.
    NOTE(review): will fail with AttributeError if the C library was not
    loaded (_e is None); callers presumably only use this when it is.
    """
    return _e.get_version()
# Run unit tests when called from command-line
if __name__ == "__main__":
    import sys
    import enchant.tests
    res = enchant.tests.runtestsuite()
    # Exit non-zero if any test failed or errored, for CI scripting.
    if len(res.errors) > 0 or len(res.failures) > 0:
        sys.exit(1)
    sys.exit(0)

View File

@@ -1,366 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant._enchant: ctypes-based wrapper for enchant C library
This module implements the low-level interface to the underlying
C library for enchant. The interface is based on ctypes and tries
to do as little as possible while making the higher-level components
easier to write.
The following conveniences are provided that differ from the underlying
C API:
* the "enchant" prefix has been removed from all functions, since
python has a proper module system
* callback functions do not take a user_data argument, since
python has proper closures that can manage this internally
* string lengths are not passed into functions such as dict_check,
since python strings know how long they are
"""
import sys, os, os.path
from ctypes import *
from ctypes.util import find_library
from enchant import utils
from enchant.errors import *
from enchant.utils import unicode
# Locate and load the enchant dll.
# We've got several options based on the host platform.
e = None
def _e_path_possibilities():
    """Generator yielding possible locations of the enchant library.
    Entries may be None (e.g. when the environment variable is unset or
    find_library fails); the consumer filters those out.
    """
    yield os.environ.get("PYENCHANT_LIBRARY_PATH")
    yield find_library("enchant")
    yield find_library("libenchant")
    yield find_library("libenchant-1")
    if sys.platform == 'darwin':
        # enchant lib installed by macports
        yield "/opt/local/lib/libenchant.dylib"
# On win32 we ship a bundled version of the enchant DLLs.
# Use them if they're present.
if sys.platform == "win32":
    e_path = None
    try:
        e_path = utils.get_resource_filename("libenchant.dll")
    except (Error, ImportError):
        try:
            e_path = utils.get_resource_filename("libenchant-1.dll")
        except (Error, ImportError):
            pass
    if e_path is not None:
        # We need to use LoadLibraryEx with LOAD_WITH_ALTERED_SEARCH_PATH so
        # that we don't accidentally suck in other versions of e.g. glib.
        if not isinstance(e_path, unicode):
            e_path = unicode(e_path, sys.getfilesystemencoding())
        LoadLibraryEx = windll.kernel32.LoadLibraryExW
        LOAD_WITH_ALTERED_SEARCH_PATH = 0x00000008
        e_handle = LoadLibraryEx(e_path, None, LOAD_WITH_ALTERED_SEARCH_PATH)
        if not e_handle:
            raise WinError()
        e = CDLL(e_path, handle=e_handle)
# On darwin we ship a bundled version of the enchant DLLs.
# Use them if they're present.
if e is None and sys.platform == "darwin":
    try:
        e_path = utils.get_resource_filename("lib/libenchant.1.dylib")
    except (Error, ImportError):
        pass
    else:
        # Enchant doesn't natively support relocatable binaries on OSX.
        # We fake it by patching the enchant source to expose a char**, which
        # we can write the runtime path into ourselves.
        e = CDLL(e_path)
        try:
            e_dir = os.path.dirname(os.path.dirname(e_path))
            prefix_dir = POINTER(c_char_p).in_dll(e, "enchant_prefix_dir_p")
            prefix_dir.contents = c_char_p(e_dir)
        except AttributeError:
            # Bundled lib lacks the patched symbol; fall through to the
            # standard search below.
            e = None
# Not found yet, search various standard system locations.
if e is None:
    for e_path in _e_path_possibilities():
        if e_path is not None:
            try:
                e = cdll.LoadLibrary(e_path)
            except OSError:
                pass
            else:
                break
# No usable enchant install was found :-(
if e is None:
    raise ImportError("enchant C library not found")
# Define various callback function types
def CALLBACK(restype, *argtypes):
    """Build a ctypes callback prototype for the given C signature.

    Kept as a tiny factory function so the prototype definition can be
    swapped out in one place when experimenting or debugging.
    """
    prototype = CFUNCTYPE(restype, *argtypes)
    return prototype
# Callback prototypes used by the describe/list-dicts C APIs.  The final
# c_void_p in each is the opaque user-data pointer passed through by C.
t_broker_desc_func = CALLBACK(None, c_char_p, c_char_p, c_char_p, c_void_p)
t_dict_desc_func = CALLBACK(None, c_char_p, c_char_p, c_char_p, c_char_p, c_void_p)
# Simple typedefs for readability
t_broker = c_void_p
t_dict = c_void_p
# Now we can define the types of each function we are going to use
# Declare argument/result types for each broker-level C entry point so
# ctypes marshals values correctly on every platform.
broker_init = e.enchant_broker_init
broker_init.argtypes = []
broker_init.restype = t_broker
broker_free = e.enchant_broker_free
broker_free.argtypes = [t_broker]
broker_free.restype = None
broker_request_dict = e.enchant_broker_request_dict
broker_request_dict.argtypes = [t_broker, c_char_p]
broker_request_dict.restype = t_dict
broker_request_pwl_dict = e.enchant_broker_request_pwl_dict
broker_request_pwl_dict.argtypes = [t_broker, c_char_p]
broker_request_pwl_dict.restype = t_dict
broker_free_dict = e.enchant_broker_free_dict
broker_free_dict.argtypes = [t_broker, t_dict]
broker_free_dict.restype = None
broker_dict_exists = e.enchant_broker_dict_exists
broker_dict_exists.argtypes = [t_broker, c_char_p]
# BUG FIX: this line previously re-assigned broker_free_dict.restype,
# clobbering the None restype declared above and leaving
# broker_dict_exists without its intended c_int restype.
broker_dict_exists.restype = c_int
broker_set_ordering = e.enchant_broker_set_ordering
broker_set_ordering.argtypes = [t_broker, c_char_p, c_char_p]
broker_set_ordering.restype = None
broker_get_error = e.enchant_broker_get_error
broker_get_error.argtypes = [t_broker]
broker_get_error.restype = c_char_p
broker_describe1 = e.enchant_broker_describe
broker_describe1.argtypes = [t_broker, t_broker_desc_func, c_void_p]
broker_describe1.restype = None
def broker_describe(broker, cbfunc):
    """Invoke the broker-describe C API, hiding the user-data argument.

    The C callback receives a trailing user-data pointer that the
    Python-level callback does not want; strip it before forwarding.
    """
    def _shim(*cb_args):
        cbfunc(*cb_args[:-1])
    broker_describe1(broker, t_broker_desc_func(_shim), None)
broker_list_dicts1 = e.enchant_broker_list_dicts
broker_list_dicts1.argtypes = [t_broker, t_dict_desc_func, c_void_p]
broker_list_dicts1.restype = None
def broker_list_dicts(broker, cbfunc):
    """List available dictionaries via the C API.

    Drops the trailing user-data argument before invoking *cbfunc*.
    """
    broker_list_dicts1(broker, t_dict_desc_func(lambda *a: cbfunc(*a[:-1])), None)
try:
    broker_get_param = e.enchant_broker_get_param
except AttributeError:
    # Make the lookup error occur at runtime.
    # Older enchant builds lack enchant_broker_get_param; the attribute
    # access inside this stub re-raises AttributeError at call time
    # instead of import time, so the arguments are never actually used.
    def broker_get_param(broker, param_name):
        return e.enchant_broker_get_param(param_name)
else:
    broker_get_param.argtypes = [t_broker, c_char_p]
    broker_get_param.restype = c_char_p
try:
    broker_set_param = e.enchant_broker_set_param
except AttributeError:
    # Make the lookup error occur at runtime.  The attribute access inside
    # the stub raises AttributeError before the arguments matter, but the
    # stub must still accept the three arguments callers pass per the
    # declared argtypes below -- the original two-argument signature made
    # such calls raise TypeError instead of the intended AttributeError.
    def broker_set_param(broker, param_name, param_value=None):
        return e.enchant_broker_set_param(param_name, param_value)
else:
    broker_set_param.argtypes = [t_broker, c_char_p, c_char_p]
    broker_set_param.restype = None
try:
    get_version = e.enchant_get_version
except AttributeError:
    # Make the lookup error occur at runtime
    # (enchant_get_version is missing from older library builds).
    def get_version():
        return e.enchant_get_version()
else:
    get_version.argtypes = []
    get_version.restype = c_char_p
dict_check1 = e.enchant_dict_check
dict_check1.argtypes = [t_dict, c_char_p, c_size_t]
dict_check1.restype = c_int
def dict_check(dict, word):
    # Convenience wrapper supplying the explicit byte length the C API wants.
    return dict_check1(dict, word, len(word))
dict_suggest1 = e.enchant_dict_suggest
dict_suggest1.argtypes = [t_dict, c_char_p, c_size_t, POINTER(c_size_t)]
dict_suggest1.restype = POINTER(c_char_p)
def dict_suggest(dict, word):
    """Return the list of suggestions for *word*, freeing the C array."""
    count = c_size_t(0)
    raw = dict_suggest1(dict, word, len(word), pointer(count))
    # Copy the C string array into a Python list before releasing it.
    results = [raw[i] for i in range(count.value)]
    if count.value > 0:
        dict_free_string_list(dict, raw)
    return results
dict_add1 = e.enchant_dict_add
dict_add1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add1.restype = None
def dict_add(dict, word):
    # Add *word* to the user's personal word list.
    return dict_add1(dict, word, len(word))
# NOTE(review): this aliases enchant_dict_add, not a *_add_to_pwl symbol --
# presumably intentional since enchant >= 1.6 merged add_to_pwl into
# dict_add; confirm against the C library version in use.
dict_add_to_pwl1 = e.enchant_dict_add
dict_add_to_pwl1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add_to_pwl1.restype = None
def dict_add_to_pwl(dict, word):
    # Legacy name kept for callers of the old personal-word-list API.
    return dict_add_to_pwl1(dict, word, len(word))
dict_add_to_session1 = e.enchant_dict_add_to_session
dict_add_to_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add_to_session1.restype = None
def dict_add_to_session(dict, word):
    # Accept *word* only for the lifetime of the current session.
    return dict_add_to_session1(dict, word, len(word))
dict_remove1 = e.enchant_dict_remove
dict_remove1.argtypes = [t_dict, c_char_p, c_size_t]
dict_remove1.restype = None
def dict_remove(dict, word):
    # Permanently exclude *word* from the dictionary.
    return dict_remove1(dict, word, len(word))
dict_remove_from_session1 = e.enchant_dict_remove_from_session
dict_remove_from_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_remove_from_session1.restype = c_int
def dict_remove_from_session(dict, word):
    # Exclude *word* for the current session only.
    return dict_remove_from_session1(dict, word, len(word))
dict_is_added1 = e.enchant_dict_is_added
dict_is_added1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_added1.restype = c_int
def dict_is_added(dict, word):
    return dict_is_added1(dict, word, len(word))
dict_is_removed1 = e.enchant_dict_is_removed
dict_is_removed1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_removed1.restype = c_int
def dict_is_removed(dict, word):
    return dict_is_removed1(dict, word, len(word))
dict_is_in_session1 = e.enchant_dict_is_in_session
dict_is_in_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_in_session1.restype = c_int
def dict_is_in_session(dict, word):
    return dict_is_in_session1(dict, word, len(word))
dict_store_replacement1 = e.enchant_dict_store_replacement
dict_store_replacement1.argtypes = [t_dict, c_char_p, c_size_t, c_char_p, c_size_t]
dict_store_replacement1.restype = None
def dict_store_replacement(dict, mis, cor):
    # Record that misspelling *mis* was corrected to *cor* so the
    # suggestion engine can rank future suggestions accordingly.
    return dict_store_replacement1(dict, mis, len(mis), cor, len(cor))
dict_free_string_list = e.enchant_dict_free_string_list
dict_free_string_list.argtypes = [t_dict, POINTER(c_char_p)]
dict_free_string_list.restype = None
dict_get_error = e.enchant_dict_get_error
dict_get_error.argtypes = [t_dict]
dict_get_error.restype = c_char_p
dict_describe1 = e.enchant_dict_describe
dict_describe1.argtypes = [t_dict, t_dict_desc_func, c_void_p]
dict_describe1.restype = None
def dict_describe(dict, cbfunc):
    # The C callback receives (tag, name, desc, file, user_data);
    # drop user_data before forwarding to the Python callback.
    def cbfunc1(tag, name, desc, file, data):
        cbfunc(tag, name, desc, file)
    dict_describe1(dict, t_dict_desc_func(cbfunc1), None)

View File

@@ -1,203 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.checker.CmdLineChecker: Command-Line spell checker
This module provides the class CmdLineChecker, which interactively
spellchecks a piece of text by interacting with the user on the
command line. It can also be run as a script to spellcheck a file.
"""
import sys
from enchant.checker import SpellChecker
from enchant.utils import printf
class CmdLineChecker:
    """A simple command-line spell checker.

    This class implements a simple command-line spell checker. It must
    be given a SpellChecker instance to operate on, and interacts with
    the user by printing instructions on stdout and reading commands from
    stdin.
    """
    _DOC_ERRORS = ["stdout", "stdin"]

    def __init__(self):
        self._stop = False     # set True when the user quits the session
        self._checker = None   # the SpellChecker instance driving the loop
        self.error = None      # the error currently being corrected

    def set_checker(self, chkr):
        """Attach the SpellChecker instance to operate on."""
        self._checker = chkr

    def get_checker(self, chkr=None):
        """Return the attached SpellChecker instance.

        The <chkr> argument is unused; it is kept (now with a default) for
        backward compatibility with the original, accidental signature.
        """
        return self._checker

    def run(self):
        """Run the spellchecking loop."""
        self._stop = False
        for err in self._checker:
            self.error = err
            printf(["ERROR:", err.word])
            printf(["HOW ABOUT:", err.suggest()])
            status = self.read_command()
            # Keep prompting until the error is resolved or the user quits.
            while not status and not self._stop:
                status = self.read_command()
            if self._stop:
                break
        printf(["DONE"])

    def print_help(self):
        """Print the in-session command reference."""
        printf(["0..N: replace with the numbered suggestion"])
        printf(["R0..rN: always replace with the numbered suggestion"])
        printf(["i: ignore this word"])
        printf(["I: always ignore this word"])
        printf(["a: add word to personal dictionary"])
        printf(["e: edit the word"])
        printf(["q: quit checking"])
        printf(["h: print this help message"])
        printf(["----------------------------------------------------"])
        printf(["HOW ABOUT:", self.error.suggest()])

    def read_command(self):
        """Read and execute one user command for the current error.

        Returns True when the error has been dealt with and the loop should
        advance, False when the user should be prompted again.
        """
        # Python 3: raw_input() was renamed to input().
        cmd = input(">> ")
        cmd = cmd.strip()
        if not cmd:
            # Guard: the original indexed cmd[0] and crashed on empty input.
            printf(["Badly formatted command (try 'help')"])
            return False
        if cmd.isdigit():
            repl = int(cmd)
            suggs = self.error.suggest()
            if repl >= len(suggs):
                printf(["No suggestion number", repl])
                return False
            printf(["Replacing '%s' with '%s'" % (self.error.word, suggs[repl])])
            self.error.replace(suggs[repl])
            return True
        if cmd[0] == "R":
            if not cmd[1:].isdigit():
                printf(["Badly formatted command (try 'help')"])
                return False
            repl = int(cmd[1:])
            suggs = self.error.suggest()
            if repl >= len(suggs):
                printf(["No suggestion number", repl])
                return False
            self.error.replace_always(suggs[repl])
            return True
        if cmd == "i":
            return True
        if cmd == "I":
            self.error.ignore_always()
            return True
        if cmd == "a":
            self.error.add()
            return True
        if cmd == "e":
            repl = input("New Word: ")
            self.error.replace(repl.strip())
            return True
        if cmd == "q":
            self._stop = True
            return True
        if "help".startswith(cmd.lower()):
            self.print_help()
            return False
        printf(["Badly formatted command (try 'help')"])
        return False

    def run_on_file(self, infile, outfile=None, enc=None):
        """Run spellchecking on the named file.

        This method can be used to run the spellchecker over the named file.
        If <outfile> is not given, the corrected contents replace the contents
        of <infile>. If <outfile> is given, the corrected contents will be
        written to that file. Use "-" to have the contents written to stdout.
        If <enc> is given, it specifies the encoding used to read the
        file's contents into a unicode string. The output will be written
        in the same encoding.
        """
        # Python 3: the file() builtin is gone; open with an explicit
        # encoding instead of manual decode/encode round-trips.
        with open(infile, "r", encoding=enc) as inF:
            inStr = inF.read()
        self._checker.set_text(inStr)
        self.run()
        outStr = self._checker.get_text()
        if outfile is None:
            outF = open(infile, "w", encoding=enc)
        elif outfile == "-":
            outF = sys.stdout
        else:
            outF = open(outfile, "w", encoding=enc)
        try:
            outF.write(outStr)
        finally:
            # Never close sys.stdout (the original did, breaking later output).
            if outF is not sys.stdout:
                outF.close()
    run_on_file._DOC_ERRORS = ["outfile", "infile", "outfile", "stdout"]
def _run_as_script():
    """Run the command-line spellchecker as a script.

    This function allows the spellchecker to be invoked from the command-line
    to check spelling in a file.
    """
    # Check necessary command-line options
    from optparse import OptionParser
    op = OptionParser()
    op.add_option("-o", "--output", dest="outfile", metavar="FILE",
                  help="write changes into FILE")
    # Fixed typo in the user-visible help text: "idenfified" -> "identified".
    op.add_option("-l", "--lang", dest="lang", metavar="TAG", default="en_US",
                  help="use language identified by TAG")
    op.add_option("-e", "--encoding", dest="enc", metavar="ENC",
                  help="file is unicode with encoding ENC")
    (opts, args) = op.parse_args()
    # Sanity check: exactly one file must be named on the command line.
    if len(args) < 1:
        raise ValueError("Must name a file to check")
    if len(args) > 1:
        raise ValueError("Can only check a single file")
    # Create and run the checker
    chkr = SpellChecker(opts.lang)
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)
    cmdln.run_on_file(args[0], opts.outfile, opts.enc)
# Allow direct execution: spellcheck the file named on the command line.
if __name__ == "__main__":
    _run_as_script()

View File

@@ -1,304 +0,0 @@
# GtkSpellCheckerDialog for pyenchant
#
# Copyright (C) 2004-2005, Fredrik Corneliusson
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
import gtk
import gobject
from enchant.utils import printf, unicode
# columns
COLUMN_SUGGESTION = 0

def create_list_view(col_label):
    """Build a single-column TreeView backed by a fresh string ListStore."""
    store = gtk.ListStore(str)
    view = gtk.TreeView(model=store)
    view.set_rules_hint(True)
    view.get_selection().set_mode(gtk.SELECTION_SINGLE)
    # Attach the one text column that displays each suggestion.
    cell = gtk.CellRendererText()
    cell.set_data("column", COLUMN_SUGGESTION)
    view.append_column(gtk.TreeViewColumn(col_label, cell, text=COLUMN_SUGGESTION))
    return view
class GtkSpellCheckerDialog(gtk.Window):
    """PyGTK dialog driving an interactive spellcheck session.

    Wraps a SpellChecker instance (set via setSpellChecker) and walks the
    user through each error, offering ignore/replace/add actions.
    """
    def __init__(self, *args, **kwargs):
        gtk.Window.__init__(self, *args, **kwargs)
        self.set_title('Spell check')
        self.set_default_size(350, 200)
        self._checker = None       # SpellChecker instance, set later
        self._numContext = 40      # chars of context shown on each side
        self.errors = None
        # create accel group
        accel_group = gtk.AccelGroup()
        self.add_accel_group(accel_group)
        # list of widgets to disable if there's no spell error left
        self._conditional_widgets = []
        conditional = self._conditional_widgets.append
        # layout
        mainbox = gtk.VBox(spacing=5)
        hbox = gtk.HBox(spacing=5)
        self.add(mainbox)
        mainbox.pack_start(hbox, padding=5)
        box1 = gtk.VBox(spacing=5)
        hbox.pack_start(box1, padding=5)
        conditional(box1)
        # unreconized word
        # NOTE(review): "Unreconized" below is a user-visible typo, left
        # untouched here since it is runtime text, not a comment.
        text_view_lable = gtk.Label('Unreconized word')
        text_view_lable.set_justify(gtk.JUSTIFY_LEFT)
        box1.pack_start(text_view_lable, False, False)
        text_view = gtk.TextView()
        text_view.set_wrap_mode(gtk.WRAP_WORD)
        text_view.set_editable(False)
        text_view.set_cursor_visible(False)
        self.error_text = text_view.get_buffer()
        text_buffer = text_view.get_buffer()
        # Tags used to render context in black and the error word in red.
        text_buffer.create_tag("fg_black", foreground="black")
        text_buffer.create_tag("fg_red", foreground="red")
        box1.pack_start(text_view)
        # Change to
        change_to_box = gtk.HBox()
        box1.pack_start(change_to_box, False, False)
        change_to_label = gtk.Label('Change to:')
        self.replace_text = gtk.Entry()
        text_view_lable.set_justify(gtk.JUSTIFY_LEFT)
        change_to_box.pack_start(change_to_label, False, False)
        change_to_box.pack_start(self.replace_text)
        # scrolled window
        sw = gtk.ScrolledWindow()
        sw.set_shadow_type(gtk.SHADOW_ETCHED_IN)
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        box1.pack_start(sw)
        self.suggestion_list_view = create_list_view('Suggestions')
        self.suggestion_list_view.connect("button_press_event", self._onButtonPress)
        self.suggestion_list_view.connect("cursor-changed", self._onSuggestionChanged)
        sw.add(self.suggestion_list_view)
        # ---Buttons---#000000#FFFFFF----------------------------------------------------
        button_box = gtk.VButtonBox()
        hbox.pack_start(button_box, False, False)
        # Ignore
        button = gtk.Button("Ignore")
        button.connect("clicked", self._onIgnore)
        button.add_accelerator("activate", accel_group,
                               gtk.keysyms.Return, 0, gtk.ACCEL_VISIBLE)
        button_box.pack_start(button)
        conditional(button)
        # Ignore all
        button = gtk.Button("Ignore All")
        button.connect("clicked", self._onIgnoreAll)
        button_box.pack_start(button)
        conditional(button)
        # Replace
        button = gtk.Button("Replace")
        button.connect("clicked", self._onReplace)
        button_box.pack_start(button)
        conditional(button)
        # Replace all
        button = gtk.Button("Replace All")
        button.connect("clicked", self._onReplaceAll)
        button_box.pack_start(button)
        conditional(button)
        # Recheck button
        button = gtk.Button("_Add")
        button.connect("clicked", self._onAdd)
        button_box.pack_start(button)
        conditional(button)
        # Close button
        button = gtk.Button(stock=gtk.STOCK_CLOSE)
        button.connect("clicked", self._onClose)
        button.add_accelerator("activate", accel_group,
                               gtk.keysyms.Escape, 0, gtk.ACCEL_VISIBLE)
        button_box.pack_end(button)
        # dictionary label
        self._dict_lable = gtk.Label('')
        mainbox.pack_start(self._dict_lable, False, False, padding=5)
        mainbox.show_all()

    def _onIgnore(self, w, *args):
        # Skip the current error once (simply advances the checker).
        printf(["ignore"])
        self._advance()

    def _onIgnoreAll(self, w, *args):
        printf(["ignore all"])
        self._checker.ignore_always()
        self._advance()

    def _onReplace(self, *args):
        printf(["Replace"])
        repl = self._getRepl()
        self._checker.replace(repl)
        self._advance()

    def _onReplaceAll(self, *args):
        printf(["Replace all"])
        repl = self._getRepl()
        self._checker.replace_always(repl)
        self._advance()

    def _onAdd(self, *args):
        """Callback for the "add" button."""
        self._checker.add()
        self._advance()

    def _onClose(self, w, *args):
        self.emit('delete_event', gtk.gdk.Event(gtk.gdk.BUTTON_PRESS))
        return True

    def _onButtonPress(self, widget, event):
        # Double-clicking a suggestion applies it immediately.
        if event.type == gtk.gdk._2BUTTON_PRESS:
            printf(["Double click!"])
            self._onReplace()

    def _onSuggestionChanged(self, widget, *args):
        # Mirror the highlighted suggestion into the "Change to" entry.
        selection = self.suggestion_list_view.get_selection()
        model, iter = selection.get_selected()
        if iter:
            suggestion = model.get_value(iter, COLUMN_SUGGESTION)
            self.replace_text.set_text(suggestion)

    def _getRepl(self):
        """Get the chosen replacement string."""
        repl = self.replace_text.get_text()
        repl = self._checker.coerce_string(repl)
        return repl

    def _fillSuggestionList(self, suggestions):
        model = self.suggestion_list_view.get_model()
        model.clear()
        for suggestion in suggestions:
            value = unicode("%s" % (suggestion,))
            model.append([value, ])

    def setSpellChecker(self, checker):
        assert checker, 'checker cant be None'
        self._checker = checker
        self._dict_lable.set_text('Dictionary:%s' % (checker.dict.tag,))

    def getSpellChecker(self, checker):
        # NOTE(review): the <checker> argument is unused -- looks like a
        # copy-paste from setSpellChecker; callers must still pass something.
        return self._checker

    def updateUI(self):
        self._advance()

    def _disableButtons(self):
        for w in self._conditional_widgets:
            w.set_sensitive(False)

    def _enableButtons(self):
        for w in self._conditional_widgets:
            w.set_sensitive(True)

    def _advance(self):
        """Advance to the next error.
        This method advances the SpellChecker to the next error, if
        any. It then displays the error and some surrounding context,
        and well as listing the suggested replacements.
        """
        # Disable interaction if no checker
        if self._checker is None:
            self._disableButtons()
            # NOTE(review): 'check-done' is not registered as a gobject
            # signal anywhere in this file -- confirm it is defined elsewhere,
            # otherwise this emit() would raise.
            self.emit('check-done')
            return
        # Advance to next error, disable if not available
        try:
            # NOTE(review): .next() is the Python-2 iterator protocol;
            # Python 3 requires next(self._checker).
            self._checker.next()
        except StopIteration:
            self._disableButtons()
            self.error_text.set_text("")
            self._fillSuggestionList([])
            self.replace_text.set_text("")
            return
        self._enableButtons()
        # Display error context with erroneous word in red
        self.error_text.set_text('')
        iter = self.error_text.get_iter_at_offset(0)
        append = self.error_text.insert_with_tags_by_name
        lContext = self._checker.leading_context(self._numContext)
        tContext = self._checker.trailing_context(self._numContext)
        append(iter, lContext, 'fg_black')
        append(iter, self._checker.word, 'fg_red')
        append(iter, tContext, 'fg_black')
        # Display suggestions in the replacements list
        suggs = self._checker.suggest()
        self._fillSuggestionList(suggs)
        if suggs:
            self.replace_text.set_text(suggs[0])
        else:
            self.replace_text.set_text("")
def _test():
    """Interactive smoke test: open the dialog on a deliberately bad text."""
    from enchant.checker import SpellChecker
    sample = "This is sme text with a fw speling errors in it. Here are a fw more to tst it ut."
    printf(["BEFORE:", sample])
    dialog = GtkSpellCheckerDialog()
    dialog.show()
    dialog.connect('delete_event', gtk.main_quit)
    checker = SpellChecker("en_US", sample)
    dialog.setSpellChecker(checker)
    dialog.updateUI()
    gtk.main()
# Run the interactive demo dialog when executed directly.
if __name__ == "__main__":
    _test()

View File

@@ -1,379 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.checker: High-level spellchecking functionality
========================================================
This package is designed to host higher-level spellchecking functionality
than is available in the base enchant package. It should make writing
applications that follow common usage idioms significantly easier.
The most useful class is SpellChecker, which implements a spellchecking
loop over a block of text. It is capable of modifying the text in-place
if given an array of characters to work with.
This package also contains several interfaces to the SpellChecker class,
such as a wxPython GUI dialog and a command-line interface.
"""
import array
import warnings
import enchant
from enchant.errors import *
from enchant.tokenize import get_tokenizer
from enchant.utils import bytes, unicode, basestring, next
from enchant.utils import get_default_language
class SpellChecker:
"""Class implementing stateful spellchecking behaviour.
This class is designed to implement a spell-checking loop over
a block of text, correcting/ignoring/replacing words as required.
This loop is implemented using an iterator paradigm so it can be
embedded inside other loops of control.
The SpellChecker object is stateful, and the appropriate methods
must be called to alter its state and affect the progress of
the spell checking session. At any point during the checking
session, the attribute 'word' will hold the current erroneously
spelled word under consideration. The action to take on this word
is determined by calling methods such as 'replace', 'replace_always'
and 'ignore_always'. Once this is done, calling 'next' advances
to the next misspelled word.
As a quick (and rather silly) example, the following code replaces
each misspelled word with the string "SPAM":
>>> text = "This is sme text with a fw speling errors in it."
>>> chkr = SpellChecker("en_US",text)
>>> for err in chkr:
... err.replace("SPAM")
...
>>> chkr.get_text()
'This is SPAM text with a SPAM SPAM errors in it.'
>>>
Internally, the SpellChecker always works with arrays of (possibly
unicode) character elements. This allows the in-place modification
of the string as it is checked, and is the closest thing Python has
to a mutable string. The text can be set as any of a normal string,
unicode string, character array or unicode character array. The
'get_text' method will return the modified array object if an
array is used, or a new string object if a string it used.
Words input to the SpellChecker may be either plain strings or
unicode objects. They will be converted to the same type as the
text being checked, using python's default encoding/decoding
settings.
If using an array of characters with this object and the
array is modified outside of the spellchecking loop, use the
'set_offset' method to reposition the internal loop pointer
to make sure it doesn't skip any words.
"""
_DOC_ERRORS = ["sme", "fw", "speling", "chkr", "chkr", "chkr"]
def __init__(self, lang=None, text=None, tokenize=None, chunkers=None, filters=None):
"""Constructor for the SpellChecker class.
SpellChecker objects can be created in two ways, depending on
the nature of the first argument. If it is a string, it
specifies a language tag from which a dictionary is created.
Otherwise, it must be an enchant Dict object to be used.
Optional keyword arguments are:
* text: to set the text to be checked at creation time
* tokenize: a custom tokenization function to use
* chunkers: a list of chunkers to apply during tokenization
* filters: a list of filters to apply during tokenization
If <tokenize> is not given and the first argument is a Dict,
its 'tag' attribute must be a language tag so that a tokenization
function can be created automatically. If this attribute is missing
the user's default language will be used.
"""
if lang is None:
lang = get_default_language()
if isinstance(lang, basestring):
dict = enchant.Dict(lang)
else:
dict = lang
try:
lang = dict.tag
except AttributeError:
lang = get_default_language()
if lang is None:
raise DefaultLanguageNotFoundError
self.lang = lang
self.dict = dict
if tokenize is None:
try:
tokenize = get_tokenizer(lang, chunkers, filters)
except TokenizerNotFoundError:
# Fall back to default tokenization if no match for 'lang'
tokenize = get_tokenizer(None, chunkers, filters)
self._tokenize = tokenize
self.word = None
self.wordpos = None
self._ignore_words = {}
self._replace_words = {}
# Default to the empty string as the text to be checked
self._text = array.array('u')
self._use_tostring = False
self._tokens = iter([])
if text is not None:
self.set_text(text)
def __iter__(self):
"""Each SpellChecker object is its own iterator"""
return self
def set_text(self, text):
"""Set the text to be spell-checked.
This method must be called, or the 'text' argument supplied
to the constructor, before calling the 'next()' method.
"""
# Convert to an array object if necessary
if isinstance(text, basestring):
if type(text) is unicode:
self._text = array.array('u', text)
else:
self._text = array.array('c', text)
self._use_tostring = True
else:
self._text = text
self._use_tostring = False
self._tokens = self._tokenize(self._text)
def get_text(self):
"""Return the spell-checked text."""
if self._use_tostring:
return self._array_to_string(self._text)
return self._text
def _array_to_string(self, text):
"""Format an internal array as a standard string."""
if text.typecode == 'u':
return text.tounicode()
return text.tostring()
def wants_unicode(self):
"""Check whether the checker wants unicode strings.
This method will return True if the checker wants unicode strings
as input, False if it wants normal strings. It's important to
provide the correct type of string to the checker.
"""
if self._text.typecode == 'u':
return True
return False
def coerce_string(self, text, enc=None):
"""Coerce string into the required type.
This method can be used to automatically ensure that strings
are of the correct type required by this checker - either unicode
or standard. If there is a mismatch, conversion is done using
python's default encoding unless another encoding is specified.
"""
if self.wants_unicode():
if not isinstance(text, unicode):
if enc is None:
return text.decode()
else:
return text.decode(enc)
return text
if not isinstance(text, bytes):
if enc is None:
return text.encode()
else:
return text.encode(enc)
return text
def __next__(self):
return self.next()
def next(self):
"""Process text up to the next spelling error.
This method is designed to support the iterator protocol.
Each time it is called, it will advance the 'word' attribute
to the next spelling error in the text. When no more errors
are found, it will raise StopIteration.
The method will always return self, so that it can be used
sensibly in common idioms such as:
for err in checker:
err.do_something()
"""
# Find the next spelling error.
# The uncaught StopIteration from next(self._tokens)
# will provide the StopIteration for this method
while True:
(word, pos) = next(self._tokens)
# decode back to a regular string
word = self._array_to_string(word)
if self.dict.check(word):
continue
if word in self._ignore_words:
continue
self.word = word
self.wordpos = pos
if word in self._replace_words:
self.replace(self._replace_words[word])
continue
break
return self
def replace(self, repl):
"""Replace the current erroneous word with the given string."""
repl = self.coerce_string(repl)
aRepl = array.array(self._text.typecode, repl)
if repl:
self.dict.store_replacement(self.word, repl)
self._text[self.wordpos:self.wordpos + len(self.word)] = aRepl
incr = len(repl) - len(self.word)
self._tokens.set_offset(self._tokens.offset + incr, replaced=True)
def replace_always(self, word, repl=None):
"""Always replace given word with given replacement.
If a single argument is given, this is used to replace the
current erroneous word. If two arguments are given, that
combination is added to the list for future use.
"""
if repl is None:
repl = word
word = self.word
repl = self.coerce_string(repl)
word = self.coerce_string(word)
self._replace_words[word] = repl
if self.word == word:
self.replace(repl)
def ignore_always(self, word=None):
"""Add given word to list of words to ignore.
If no word is given, the current erroneous word is added.
"""
if word is None:
word = self.word
word = self.coerce_string(word)
if word not in self._ignore_words:
self._ignore_words[word] = True
def add_to_personal(self, word=None):
"""Add given word to the personal word list.
If no word is given, the current erroneous word is added.
"""
warnings.warn("SpellChecker.add_to_personal is deprecated, " \
"please use SpellChecker.add",
category=DeprecationWarning, stacklevel=2)
self.add(word)
def add(self, word=None):
"""Add given word to the personal word list.
If no word is given, the current erroneous word is added.
"""
if word is None:
word = self.word
self.dict.add(word)
def suggest(self, word=None):
"""Return suggested spellings for the given word.
If no word is given, the current erroneous word is used.
"""
if word is None:
word = self.word
suggs = self.dict.suggest(word)
return suggs
def check(self, word):
"""Check correctness of the given word."""
return self.dict.check(word)
def set_offset(self, off, whence=0):
"""Set the offset of the tokenization routine.
For more details on the purpose of the tokenization offset,
see the documentation of the 'enchant.tokenize' module.
The optional argument whence indicates the method by
which to change the offset:
* 0 (the default) treats <off> as an increment
* 1 treats <off> as a distance from the start
* 2 treats <off> as a distance from the end
"""
if whence == 0:
self._tokens.set_offset(self._tokens.offset + off)
elif whence == 1:
assert (off > 0)
self._tokens.set_offset(off)
elif whence == 2:
assert (off > 0)
self._tokens.set_offset(len(self._text) - 1 - off)
else:
raise ValueError("Invalid value for whence: %s" % (whence,))
def leading_context(self, chars):
"""Get <chars> characters of leading context.
This method returns up to <chars> characters of leading
context - the text that occurs in the string immediately
before the current erroneous word.
"""
start = max(self.wordpos - chars, 0)
context = self._text[start:self.wordpos]
return self._array_to_string(context)
def trailing_context(self, chars):
    """Return up to <chars> characters of trailing context.

    The trailing context is the text occurring in the string
    immediately after the current erroneous word.
    """
    begin = self.wordpos + len(self.word)
    end = min(begin + chars, len(self._text))
    return self._array_to_string(self._text[begin:end])

View File

@@ -1,246 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.checker.tests: Unittests for enchant SpellChecker class
"""
import unittest
import enchant
import enchant.tokenize
from enchant.utils import *
from enchant.errors import *
from enchant.checker import *
class TestChecker(unittest.TestCase):
    """TestCases for checking behaviour of SpellChecker class."""

    def test_basic(self):
        """Test a basic run of the SpellChecker class."""
        text = """This is sme text with a few speling erors in it. Its gret
        for checking wheather things are working proprly with the SpellChecker
        class. Not gret for much elss though."""
        chkr = SpellChecker("en_US", text=text)
        for n, err in enumerate(chkr):
            if n == 0:
                # Fix up "sme" -> "some" properly
                self.assertEqual(err.word, "sme")
                self.assertEqual(err.wordpos, 8)
                self.assertTrue("some" in err.suggest())
                err.replace("some")
            if n == 1:
                # Ignore "speling"
                self.assertEqual(err.word, "speling")
            if n == 2:
                # Check context around "erors", and replace
                self.assertEqual(err.word, "erors")
                self.assertEqual(err.leading_context(5), "ling ")
                self.assertEqual(err.trailing_context(5), " in i")
                err.replace(raw_unicode("errors"))
            if n == 3:
                # Replace-all on gret as it appears twice
                self.assertEqual(err.word, "gret")
                err.replace_always("great")
            if n == 4:
                # First encounter with "wheather", move offset back
                self.assertEqual(err.word, "wheather")
                err.set_offset(-1 * len(err.word))
            if n == 5:
                # Second encounter, fix up "wheather'
                self.assertEqual(err.word, "wheather")
                err.replace("whether")
            if n == 6:
                # Just replace "proprly", but also add an ignore
                # for "SpellChecker"
                self.assertEqual(err.word, "proprly")
                err.replace("properly")
                err.ignore_always("SpellChecker")
            if n == 7:
                # The second "gret" should have been replaced
                # So it's now on "elss"
                self.assertEqual(err.word, "elss")
                err.replace("else")
            if n > 7:
                self.fail("Extraneous spelling errors were found")
        text2 = """This is some text with a few speling errors in it. Its great
        for checking whether things are working properly with the SpellChecker
        class. Not great for much else though."""
        self.assertEqual(chkr.get_text(), text2)

    def test_filters(self):
        """Test SpellChecker with the 'filters' argument."""
        text = """I contain WikiWords that ShouldBe skipped by the filters"""
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.WikiWordFilter])
        for err in chkr:
            # There are no errors once the WikiWords are skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)

    def test_chunkers(self):
        """Test SpellChecker with the 'chunkers' argument."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped"""
        chkr = SpellChecker("en_US", text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            # There are no errors when the <html> tag is skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)

    def test_chunkers_and_filters(self):
        """Test SpellChecker with the 'chunkers' and 'filters' arguments."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped
        along with a <a href='http://example.com/">link to
        http://example.com/</a> that should also be skipped"""
        # There are no errors when things are correctly skipped
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.URLFilter],
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)
        # The "html" is an error when not using HTMLChunker
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.URLFilter])
        for err in chkr:
            self.assertEqual(err.word, "html")
            break
        self.assertEqual(chkr.get_text(), text)
        # The "http" from the URL is an error when not using URLFilter
        chkr = SpellChecker("en_US", text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.assertEqual(err.word, "http")
            break
        self.assertEqual(chkr.get_text(), text)

    def test_unicode(self):
        """Test SpellChecker with a unicode string."""
        text = raw_unicode("""I am a unicode strng with unicode erors.""")
        chkr = SpellChecker("en_US", text)
        for n, err in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, raw_unicode("unicode"))
                self.assertEqual(err.wordpos, 7)
                chkr.ignore_always()
            if n == 1:
                self.assertEqual(err.word, raw_unicode("strng"))
                chkr.replace_always("string")
                self.assertEqual(chkr._replace_words[raw_unicode("strng")], raw_unicode("string"))
            if n == 2:
                self.assertEqual(err.word, raw_unicode("erors"))
                chkr.replace("erros")
                chkr.set_offset(-6)
            if n == 3:
                self.assertEqual(err.word, raw_unicode("erros"))
                chkr.replace("errors")
        self.assertEqual(n, 3)
        self.assertEqual(chkr.get_text(), raw_unicode("I am a unicode string with unicode errors."))

    def test_chararray(self):
        """Test SpellChecker with a character array as input."""
        # Python 3 does not provide 'c' array type
        if str is unicode:
            atype = 'u'
        else:
            atype = 'c'
        text = "I wll be stord in an aray"
        txtarr = array.array(atype, text)
        chkr = SpellChecker("en_US", txtarr)
        for (n, err) in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, "wll")
                self.assertEqual(err.word.__class__, str)
            if n == 1:
                self.assertEqual(err.word, "stord")
                txtarr[err.wordpos:err.wordpos + len(err.word)] = array.array(atype, "stored")
                chkr.set_offset(-1 * len(err.word))
            if n == 2:
                self.assertEqual(err.word, "aray")
                chkr.replace("array")
        self.assertEqual(n, 2)
        if str is unicode:
            self.assertEqual(txtarr.tounicode(), "I wll be stored in an array")
        else:
            self.assertEqual(txtarr.tostring(), "I wll be stored in an array")

    def test_pwl(self):
        """Test checker loop with PWL."""
        from enchant import DictWithPWL
        d = DictWithPWL("en_US", None, None)
        txt = "I am sme text to be cheked with personal list of cheked words"
        chkr = SpellChecker(d, txt)
        for n, err in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, "sme")
            if n == 1:
                self.assertEqual(err.word, "cheked")
                chkr.add()
        self.assertEqual(n, 1)

    def test_bug2785373(self):
        """Testcases for bug #2785373."""
        c = SpellChecker(enchant.Dict("en"), "")
        c.set_text("So, one dey when I wes 17, I left.")
        for err in c:
            pass
        c = SpellChecker(enchant.Dict("en"), "")
        c.set_text(raw_unicode("So, one dey when I wes 17, I left."))
        for err in c:
            pass

    def test_default_language(self):
        # SpellChecker() with no args must use the platform default language,
        # or raise if none can be determined.
        lang = get_default_language()
        if lang is None:
            self.assertRaises(DefaultLanguageNotFoundError, SpellChecker)
        else:
            checker = SpellChecker()
            self.assertEqual(checker.lang, lang)

    def test_replace_with_shorter_string(self):
        """Testcase for replacing with a shorter string (bug #10)"""
        text = ". I Bezwaar tegen verguning."
        chkr = SpellChecker("en_US", text)
        for i, err in enumerate(chkr):
            err.replace("SPAM")
            assert i < 3
        # py3 fix: assertEquals was deprecated and is removed in Python 3.12
        self.assertEqual(chkr.get_text(), ". I SPAM SPAM SPAM.")

    def test_replace_with_empty_string(self):
        """Testcase for replacing with an empty string (bug #10)"""
        text = ". I Bezwaar tegen verguning."
        chkr = SpellChecker("en_US", text)
        for i, err in enumerate(chkr):
            err.replace("")
            assert i < 3
        # py3 fix: assertEquals was deprecated and is removed in Python 3.12
        self.assertEqual(chkr.get_text(), ". I .")

View File

@@ -1,272 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
# Major code cleanup and re-write thanks to Phil Mayes, 2007
#
"""
enchant.checker.wxSpellCheckerDialog: wxPython spellchecker interface
This module provides the class wxSpellCheckerDialog, which provides
a wxPython dialog that can be used as an interface to a spell checking
session. Currently it is intended as a proof-of-concept and demonstration
class, but it should be suitable for general-purpose use in a program.
The class must be given an enchant.checker.SpellChecker object with
which to operate. It can (in theory...) be used in modal and non-modal
modes. Use Show() when operating on an array of characters as it will
modify the array in place, meaning other work can be done at the same
time. Use ShowModal() when operating on a static string.
"""
# Words in this module's docstring that the documentation spellchecker
# should treat as correctly spelled.
_DOC_ERRORS = ["ShowModal"]
import wx
from enchant.utils import printf
class wxSpellCheckerDialog(wx.Dialog):
    """Simple spellcheck dialog for wxPython

    This class implements a simple spellcheck interface for wxPython,
    in the form of a dialog.  It's intended mainly of an example of
    how to do this, although it should be useful for applications that
    just need a simple graphical spellchecker.

    To use, a SpellChecker instance must be created and passed to the
    dialog before it is shown:

        >>> dlg = wxSpellCheckerDialog(None,-1,"")
        >>> chkr = SpellChecker("en_AU",text)
        >>> dlg.SetSpellChecker(chkr)
        >>> dlg.Show()

    This is most useful when the text to be checked is in the form of
    a character array, as it will be modified in place as the user
    interacts with the dialog.  For checking strings, the final result
    will need to be obtained from the SpellChecker object:

        >>> dlg = wxSpellCheckerDialog(None,-1,"")
        >>> chkr = SpellChecker("en_AU",text)
        >>> dlg.SetSpellChecker(chkr)
        >>> dlg.ShowModal()
        >>> text = dlg.GetSpellChecker().get_text()

    Currently the checker must deal with strings of the same type as
    returned by wxPython - unicode or normal string depending on the
    underlying system.  This needs to be fixed, somehow...
    """
    _DOC_ERRORS = ["dlg", "chkr", "dlg", "SetSpellChecker", "chkr", "dlg",
                   "dlg", "chkr", "dlg", "SetSpellChecker", "chkr", "dlg",
                   "ShowModal", "dlg", "GetSpellChecker"]

    # Remember dialog size across invocations by storing it on the class
    sz = (300, 70)

    def __init__(self, parent=None, id=-1, title="Checking Spelling..."):
        wx.Dialog.__init__(self, parent, id, title, size=wxSpellCheckerDialog.sz,
                           style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER)
        self._numContext = 40          # chars of context shown on each side
        self._checker = None
        self._buttonsEnabled = True
        self.error_text = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH)
        self.replace_text = wx.TextCtrl(self, -1, "", style=wx.TE_PROCESS_ENTER)
        self.replace_list = wx.ListBox(self, -1, style=wx.LB_SINGLE)
        self.InitLayout()
        # py3/Phoenix fix: the classic-style wx.EVT_LISTBOX(win, id, func)
        # callable binding was removed from wxPython; use Bind() instead.
        self.Bind(wx.EVT_LISTBOX, self.OnReplSelect, self.replace_list)
        self.Bind(wx.EVT_LISTBOX_DCLICK, self.OnReplace, self.replace_list)

    def InitLayout(self):
        """Lay out controls and add buttons."""
        sizer = wx.BoxSizer(wx.HORIZONTAL)
        txtSizer = wx.BoxSizer(wx.VERTICAL)
        btnSizer = wx.BoxSizer(wx.VERTICAL)
        replaceSizer = wx.BoxSizer(wx.HORIZONTAL)
        txtSizer.Add(wx.StaticText(self, -1, "Unrecognised Word:"), 0, wx.LEFT | wx.TOP, 5)
        txtSizer.Add(self.error_text, 1, wx.ALL | wx.EXPAND, 5)
        replaceSizer.Add(wx.StaticText(self, -1, "Replace with:"), 0, wx.ALL | wx.ALIGN_CENTER_VERTICAL, 5)
        replaceSizer.Add(self.replace_text, 1, wx.ALL | wx.ALIGN_CENTER_VERTICAL, 5)
        txtSizer.Add(replaceSizer, 0, wx.EXPAND, 0)
        txtSizer.Add(self.replace_list, 2, wx.ALL | wx.EXPAND, 5)
        sizer.Add(txtSizer, 1, wx.EXPAND, 0)
        self.buttons = []
        for label, action, tip in (
                ("Ignore", self.OnIgnore, "Ignore this word and continue"),
                ("Ignore All", self.OnIgnoreAll, "Ignore all instances of this word and continue"),
                ("Replace", self.OnReplace, "Replace this word"),
                ("Replace All", self.OnReplaceAll, "Replace all instances of this word"),
                ("Add", self.OnAdd, "Add this word to the dictionary"),
                ("Done", self.OnDone, "Finish spell-checking and accept changes"),
        ):
            btn = wx.Button(self, -1, label)
            btn.SetToolTip(wx.ToolTip(tip))
            btnSizer.Add(btn, 0, wx.ALIGN_RIGHT | wx.ALL, 4)
            btn.Bind(wx.EVT_BUTTON, action)
            self.buttons.append(btn)
        sizer.Add(btnSizer, 0, wx.ALL | wx.EXPAND, 5)
        self.SetAutoLayout(True)
        self.SetSizer(sizer)
        sizer.Fit(self)

    def Advance(self):
        """Advance to the next error.

        This method advances the SpellChecker to the next error, if
        any.  It then displays the error and some surrounding context,
        and well as listing the suggested replacements.
        """
        # Disable interaction if no checker
        if self._checker is None:
            self.EnableButtons(False)
            return False
        # Advance to next error, disable if not available
        try:
            self._checker.next()
        except StopIteration:
            self.EnableButtons(False)
            self.error_text.SetValue("")
            self.replace_list.Clear()
            self.replace_text.SetValue("")
            if self.IsModal():  # test needed for SetSpellChecker call
                # auto-exit when checking complete
                self.EndModal(wx.ID_OK)
            return False
        self.EnableButtons()
        # Display error context with erroneous word in red.
        # Restoring default style was misbehaving under win32, so
        # I am forcing the rest of the text to be black.
        self.error_text.SetValue("")
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.BLACK))
        lContext = self._checker.leading_context(self._numContext)
        self.error_text.AppendText(lContext)
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.RED))
        self.error_text.AppendText(self._checker.word)
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.BLACK))
        tContext = self._checker.trailing_context(self._numContext)
        self.error_text.AppendText(tContext)
        # Display suggestions in the replacements list
        suggs = self._checker.suggest()
        self.replace_list.Set(suggs)
        self.replace_text.SetValue(suggs and suggs[0] or '')
        return True

    def EnableButtons(self, state=True):
        """Enable the checking-related buttons"""
        if state != self._buttonsEnabled:
            # all buttons except the final "Done" button
            for btn in self.buttons[:-1]:
                btn.Enable(state)
            self._buttonsEnabled = state

    def GetRepl(self):
        """Get the chosen replacement string."""
        repl = self.replace_text.GetValue()
        return repl

    def OnAdd(self, evt):
        """Callback for the "add" button."""
        self._checker.add()
        self.Advance()

    def OnDone(self, evt):
        """Callback for the "close" button."""
        # py3/Phoenix fix: GetSizeTuple() was removed; GetSize() returns a
        # wx.Size which converts cleanly to a tuple.
        wxSpellCheckerDialog.sz = tuple(self.error_text.GetSize())
        if self.IsModal():
            self.EndModal(wx.ID_OK)
        else:
            self.Close()

    def OnIgnore(self, evt):
        """Callback for the "ignore" button.

        This simply advances to the next error.
        """
        self.Advance()

    def OnIgnoreAll(self, evt):
        """Callback for the "ignore all" button."""
        self._checker.ignore_always()
        self.Advance()

    def OnReplace(self, evt):
        """Callback for the "replace" button."""
        repl = self.GetRepl()
        if repl:
            self._checker.replace(repl)
        self.Advance()

    def OnReplaceAll(self, evt):
        """Callback for the "replace all" button."""
        repl = self.GetRepl()
        self._checker.replace_always(repl)
        self.Advance()

    def OnReplSelect(self, evt):
        """Callback when a new replacement option is selected."""
        sel = self.replace_list.GetSelection()
        if sel == -1:
            return
        opt = self.replace_list.GetString(sel)
        self.replace_text.SetValue(opt)

    def GetSpellChecker(self):
        """Get the spell checker object."""
        return self._checker

    def SetSpellChecker(self, chkr):
        """Set the spell checker, advancing to the first error.

        Return True if error(s) to correct, else False."""
        self._checker = chkr
        return self.Advance()
def _test():
    """Interactive demo: spellcheck a fixed string through the dialog."""
    class TestDialog(wxSpellCheckerDialog):
        def __init__(self, *args):
            wxSpellCheckerDialog.__init__(self, *args)
            # py3/Phoenix fix: classic wx.EVT_CLOSE(win, func) binding was
            # removed from wxPython; use Bind() instead.
            self.Bind(wx.EVT_CLOSE, self.OnClose)

        def OnClose(self, evnt):
            chkr = dlg.GetSpellChecker()
            if chkr is not None:
                printf(["AFTER:", chkr.get_text()])
            self.Destroy()
    from enchant.checker import SpellChecker
    text = "This is sme text with a fw speling errors in it. Here are a fw more to tst it ut."
    printf(["BEFORE:", text])
    # py3/Phoenix fix: wx.PySimpleApp was removed; wx.App(False) is the
    # modern equivalent (no stdout/stderr redirection window).
    app = wx.App(False)
    dlg = TestDialog()
    chkr = SpellChecker("en_US", text)
    dlg.SetSpellChecker(chkr)
    dlg.Show()
    app.MainLoop()

View File

@@ -1,57 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.errors: Error class definitions for the enchant library
================================================================
All error classes are defined in this separate sub-module, so that they
can safely be imported without causing circular dependencies.
"""
class Error(Exception):
    """Base class for all exceptions raised by the enchant module."""
class DictNotFoundError(Error):
    """Raised when a requested dictionary could not be found."""
class TokenizerNotFoundError(Error):
    """Raised when a requested tokenizer could not be found."""
class DefaultLanguageNotFoundError(Error):
    """Raised when a default language could not be determined."""

View File

@@ -1,4 +0,0 @@
This directory contains the plugin DLLs for enchant when installed on
a Microsoft Windows system.

View File

@@ -1,285 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2011 Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
pypwl: pure-python personal word list in the style of Enchant
==============================================================
This module provides a pure-python version of the personal word list
functionality found in the spellchecking package Enchant. While the
same effect can be achieved (with better performance) using the python
bindings for Enchant, it requires a C extension.
This pure-python implementation uses the same algorithm but without any
external dependencies or C code (in fact, it was the author's original
prototype for the C version found in Enchant).
"""
from __future__ import generators
import os
import warnings
class Trie:
    """A trie (prefix tree) holding a set of words.

    Words are stored one letter per level of the tree.  "Fuzzy"
    lookups are supported by allowing a fixed number of missteps
    while walking the tree.
    """

    def __init__(self, words=()):
        self._eos = False   # True when a stored word ends at this node
        self._keys = {}     # maps a letter to its child Trie node
        for word in words:
            self.insert(word)

    def insert(self, word):
        """Store <word> in the trie."""
        if not word:
            self._eos = True
            return
        first = word[0]
        child = self._keys.get(first)
        if child is None:
            child = Trie()
            self._keys[first] = child
        child.insert(word[1:])

    def remove(self, word):
        """Remove <word> from the trie; missing words are ignored."""
        if not word:
            self._eos = False
            return
        child = self._keys.get(word[0])
        if child is not None:
            child.remove(word[1:])

    def search(self, word, nerrs=0):
        """Search for the given word, possibly making errors.

        This method searches the trie for the given <word>, making
        precisely <nerrs> errors.  It returns a list of words found,
        without duplicates.
        """
        if nerrs < 0:
            return []
        found = []
        # Exact terminal match: pattern consumed with no errors left.
        if nerrs == 0 and not word and self._eos:
            found.append("")
        # Consume word[0] exactly (no error spent).
        if word:
            child = self._keys.get(word[0])
            if child is not None:
                for tail in child.search(word[1:], nerrs):
                    candidate = word[0] + tail
                    if candidate not in found:
                        found.append(candidate)
        # Treat word[0] as a deletion error.
        for candidate in self.search(word[1:], nerrs - 1):
            if candidate not in found:
                found.append(candidate)
        # Treat an insertion error before word[0].
        for letter, child in self._keys.items():
            for tail in child.search(word, nerrs - 1):
                candidate = letter + tail
                if candidate not in found:
                    found.append(candidate)
        # Treat word[0] as a substitution error.
        for letter, child in self._keys.items():
            for tail in child.search(word[1:], nerrs - 1):
                candidate = letter + tail
                if candidate not in found:
                    found.append(candidate)
        return found
    search._DOC_ERRORS = ["nerrs"]

    def __getitem__(self, key):
        return self._keys[key]

    def __setitem__(self, key, val):
        self._keys[key] = val

    def __iter__(self):
        if self._eos:
            yield ""
        for letter, child in self._keys.items():
            for suffix in child:
                yield letter + suffix
class PyPWL:
    """Pure-python implementation of Personal Word List dictionary.

    This class emulates the PWL objects provided by PyEnchant, but
    implemented purely in python.
    """

    def __init__(self, pwl=None):
        """PyPWL constructor.

        This method takes as its only argument the name of a file
        containing the personal word list, one word per line.  Entries
        will be read from this file, and new entries will be written to
        it automatically.

        If <pwl> is not specified or None, the list is maintained in
        memory only.
        """
        self.provider = None
        self._words = Trie()
        if pwl is not None:
            self.pwl = os.path.abspath(pwl)
            self.tag = self.pwl
            # py3 fix: the file() builtin was removed in Python 3; use
            # open(), with a context manager so the handle is always closed.
            with open(pwl) as pwl_file:
                for line in pwl_file:
                    word = line.strip()
                    self.add_to_session(word)
        else:
            self.pwl = None
            self.tag = "PyPWL"

    def check(self, word):
        """Check spelling of a word.

        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and false otherwise.
        """
        res = self._words.search(word)
        return bool(res)

    def suggest(self, word):
        """Suggest possible spellings for a word.

        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        limit = 10
        maxdepth = 5
        # Iterative deepening until we get enough matches
        depth = 0
        res = self._words.search(word, depth)
        while len(res) < limit and depth < maxdepth:
            depth += 1
            for w in self._words.search(word, depth):
                if w not in res:
                    res.append(w)
        # Limit number of suggs
        return res[:limit]

    def add(self, word):
        """Add a word to the user's personal dictionary.

        For a PWL, this means appending it to the file.
        """
        if self.pwl is not None:
            # py3 fix: file() -> open(); context manager closes the handle.
            with open(self.pwl, "a") as pwl_file:
                pwl_file.write("%s\n" % (word.strip(),))
        self.add_to_session(word)

    def add_to_pwl(self, word):
        """Add a word to the user's personal dictionary.

        For a PWL, this means appending it to the file.
        """
        warnings.warn("PyPWL.add_to_pwl is deprecated, please use PyPWL.add",
                      category=DeprecationWarning, stacklevel=2)
        self.add(word)

    def remove(self, word):
        """Add a word to the user's personal exclude list."""
        # There's no exclude list for a stand-alone PWL.
        # Just remove it from the list.
        self._words.remove(word)
        if self.pwl is not None:
            # py3 fix: file() -> open(); rewrite the whole file without
            # the removed word.
            with open(self.pwl, "wt") as pwl_file:
                for w in self._words:
                    pwl_file.write("%s\n" % (w.strip(),))

    def add_to_session(self, word):
        """Add a word to the session list."""
        self._words.insert(word)

    def is_in_session(self, word):
        """Check whether a word is in the session list."""
        warnings.warn("PyPWL.is_in_session is deprecated, please use PyPWL.is_added", category=DeprecationWarning)
        # Consider all words to be in the session list
        return self.check(word)

    def store_replacement(self, mis, cor):
        """Store a replacement spelling for a miss-spelled word.

        This method makes a suggestion to the spellchecking engine that the
        miss-spelled word <mis> is in fact correctly spelled as <cor>.  Such
        a suggestion will typically mean that <cor> appears early in the
        list of suggested spellings offered for later instances of <mis>.
        """
        # Too much work for this simple spellchecker
        pass
    store_replacement._DOC_ERRORS = ["mis", "mis"]

    def is_added(self, word):
        """Check whether a word is in the personal word list."""
        return self.check(word)

    def is_removed(self, word):
        """Check whether a word is in the personal exclude list."""
        return False

    # No-op methods to support internal use as a Dict() replacement

    def _check_this(self, msg):
        pass

    def _free(self):
        pass

View File

@@ -1,4 +0,0 @@
This directory contains dictionary files for Enchant when installed on a
Microsoft Windows system. Each subdirectory contains dictionaries for
a particular spellchecking system.

View File

@@ -1,3 +0,0 @@
This directory contains dictionaries for the myspell backend to enchant.

View File

@@ -1,616 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tests: testcases for pyenchant
"""
import os
import sys
import unittest
import pickle
try:
import subprocess
except ImportError:
subprocess = None
import enchant
from enchant import *
from enchant import _enchant as _e
from enchant.utils import unicode, raw_unicode, printf, trim_suggestions
def runcmd(cmd):
    """Run <cmd> through the shell and return its exit status.

    On failure the command's stderr output is echoed to sys.stderr.
    Falls back to os.system() when the subprocess module is unavailable.
    """
    if subprocess is None:
        return os.system(cmd)
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, shell=True)
    stdout, stderr = proc.communicate()
    if proc.returncode:
        if sys.version_info[0] >= 3:
            stderr = stderr.decode(sys.getdefaultencoding(), "replace")
        sys.stderr.write(stderr)
    return proc.returncode
class TestBroker(unittest.TestCase):
"""Test cases for the proper functioning of Broker objects.
These tests assume that there is at least one working provider
with a dictionary for the "en_US" language.
"""
def setUp(self):
self.broker = Broker()
def tearDown(self):
del self.broker
def test_HasENUS(self):
"""Test that the en_US language is available."""
self.assertTrue(self.broker.dict_exists("en_US"))
def test_LangsAreAvail(self):
"""Test whether all advertised languages are in fact available."""
for lang in self.broker.list_languages():
if not self.broker.dict_exists(lang):
assert False, "language '" + lang + "' advertised but non-existent"
def test_ProvsAreAvail(self):
"""Test whether all advertised providers are in fact available."""
for (lang, prov) in self.broker.list_dicts():
self.assertTrue(self.broker.dict_exists(lang))
if not self.broker.dict_exists(lang):
assert False, "language '" + lang + "' advertised but non-existent"
if prov not in self.broker.describe():
assert False, "provier '" + str(prov) + "' advertised but non-existent"
def test_ProvOrdering(self):
"""Test that provider ordering works correctly."""
langs = {}
provs = []
# Find the providers for each language, and a list of all providers
for (tag, prov) in self.broker.list_dicts():
# Skip hyphenation dictionaries installed by OOo
if tag.startswith("hyph_") and prov.name == "myspell":
continue
# Canonicalize separators
tag = tag.replace("-", "_")
langs[tag] = []
# NOTE: we are excluding Zemberek here as it appears to return
# a broker for any language, even nonexistent ones
if prov not in provs and prov.name != "zemberek":
provs.append(prov)
for prov in provs:
for tag in langs:
b2 = Broker()
b2.set_ordering(tag, prov.name)
try:
d = b2.request_dict(tag)
if d.provider != prov:
raise ValueError()
langs[tag].append(prov)
except:
pass
# Check availability using a single entry in ordering
for tag in langs:
for prov in langs[tag]:
b2 = Broker()
b2.set_ordering(tag, prov.name)
d = b2.request_dict(tag)
self.assertEqual((d.provider, tag), (prov, tag))
del d
del b2
# Place providers that dont have the language in the ordering
for tag in langs:
for prov in langs[tag]:
order = prov.name
for prov2 in provs:
if prov2 not in langs[tag]:
order = prov2.name + "," + order
b2 = Broker()
b2.set_ordering(tag, order)
d = b2.request_dict(tag)
self.assertEqual((d.provider, tag, order), (prov, tag, order))
del d
del b2
def test_UnicodeTag(self):
    """Test that unicode language tags are accepted"""
    tag = raw_unicode("en_US")
    # Exercise both the low-level broker entry point and the Dict wrapper.
    raw_dict = self.broker._request_dict_data(tag)
    self.assertTrue(raw_dict)
    _e.broker_free_dict(self.broker._this, raw_dict)
    self.assertTrue(Dict(tag))
def test_GetSetParam(self):
    """Round-trip a custom broker parameter, if the build supports params."""
    param = "pyenchant.unittest"
    try:
        self.broker.get_param(param)
    except AttributeError:
        # This enchant build has no parameter support; nothing to test.
        return
    self.assertEqual(self.broker.get_param(param), None)
    self.broker.set_param(param, "testing")
    self.assertEqual(self.broker.get_param(param), "testing")
    # Parameters are per-broker: a fresh broker must not see the value.
    self.assertEqual(Broker().get_param(param), None)
class TestDict(unittest.TestCase):
    """Test cases for the proper functioning of Dict objects.

    These tests assume that there is at least one working provider
    with a dictionary for the "en_US" language.
    """

    def setUp(self):
        # A fresh en_US dictionary per test; tearDown releases it.
        self.dict = Dict("en_US")

    def tearDown(self):
        del self.dict

    def test_HasENUS(self):
        """Test that the en_US language is available through default broker."""
        self.assertTrue(dict_exists("en_US"))

    def test_check(self):
        """Test that check() works on some common words."""
        self.assertTrue(self.dict.check("hello"))
        self.assertTrue(self.dict.check("test"))
        self.assertFalse(self.dict.check("helo"))
        self.assertFalse(self.dict.check("testt"))

    def test_broker(self):
        """Test that the dict's broker is set correctly."""
        # Dicts created without an explicit broker share the module default.
        self.assertTrue(self.dict._broker is enchant._broker)

    def test_tag(self):
        """Test that the dict's tag is set correctly."""
        self.assertEqual(self.dict.tag, "en_US")

    def test_suggest(self):
        """Test that suggest() gets simple suggestions right."""
        self.assertTrue(self.dict.check("hello"))
        self.assertTrue("hello" in self.dict.suggest("helo"))

    def test_suggestHang1(self):
        """Test whether suggest() hangs on some inputs (Bug #1404196)"""
        # These inputs historically sent some providers into a busy loop;
        # the only requirement is that suggest() returns at all.
        self.assertTrue(len(self.dict.suggest("Thiis")) >= 0)
        self.assertTrue(len(self.dict.suggest("Thiiis")) >= 0)
        self.assertTrue(len(self.dict.suggest("Thiiiis")) >= 0)

    def test_unicode1(self):
        """Test checking/suggesting for unicode strings"""
        # TODO: find something that actually returns suggestions
        us1 = raw_unicode(r"he\u2149lo")
        self.assertTrue(type(us1) is unicode)
        self.assertFalse(self.dict.check(us1))
        # All suggestions for a unicode input must themselves be unicode.
        for s in self.dict.suggest(us1):
            self.assertTrue(type(s) is unicode)

    def test_session(self):
        """Test that adding words to the session works as required."""
        self.assertFalse(self.dict.check("Lozz"))
        self.assertFalse(self.dict.is_added("Lozz"))
        self.dict.add_to_session("Lozz")
        self.assertTrue(self.dict.is_added("Lozz"))
        self.assertTrue(self.dict.check("Lozz"))
        self.dict.remove_from_session("Lozz")
        self.assertFalse(self.dict.check("Lozz"))
        self.assertFalse(self.dict.is_added("Lozz"))
        # Removing a known-good word must hide it for this session only.
        self.dict.remove_from_session("hello")
        self.assertFalse(self.dict.check("hello"))
        self.assertTrue(self.dict.is_removed("hello"))
        self.dict.add_to_session("hello")

    def test_AddRemove(self):
        """Test adding/removing from default user dictionary."""
        nonsense = "kxhjsddsi"
        self.assertFalse(self.dict.check(nonsense))
        self.dict.add(nonsense)
        self.assertTrue(self.dict.is_added(nonsense))
        self.assertTrue(self.dict.check(nonsense))
        self.dict.remove(nonsense)
        self.assertFalse(self.dict.is_added(nonsense))
        self.assertFalse(self.dict.check(nonsense))
        self.dict.remove("pineapple")
        self.assertFalse(self.dict.check("pineapple"))
        self.assertTrue(self.dict.is_removed("pineapple"))
        self.assertFalse(self.dict.is_added("pineapple"))
        self.dict.add("pineapple")
        self.assertTrue(self.dict.check("pineapple"))

    def test_DefaultLang(self):
        """Test behaviour of default language selection."""
        defLang = utils.get_default_language()
        if defLang is None:
            # If no default language, shouldnt work
            self.assertRaises(Error, Dict)
        else:
            # If there is a default language, should use it
            # Of course, no need for the dict to actually exist
            try:
                d = Dict()
                self.assertEqual(d.tag, defLang)
            except DictNotFoundError:
                pass

    def test_pickling(self):
        """Test that pickling doesn't corrupt internal state."""
        d1 = Dict("en")
        self.assertTrue(d1.check("hello"))
        d2 = pickle.loads(pickle.dumps(d1))
        self.assertTrue(d1.check("hello"))
        self.assertTrue(d2.check("hello"))
        # The unpickled copy must survive the original being freed.
        d1._free()
        self.assertTrue(d2.check("hello"))
class TestPWL(unittest.TestCase):
    """Test cases for the proper functioning of PWLs and DictWithPWL objects.

    These tests assume that there is at least one working provider
    with a dictionary for the "en_US" language.
    """

    def setUp(self):
        self._tempDir = self._mkdtemp()
        self._fileName = "pwl.txt"

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()

    def _path(self, nm=None):
        """Return the full path of PWL file <nm>, creating it if missing."""
        if nm is None:
            nm = self._fileName
        nm = os.path.join(self._tempDir, nm)
        if not os.path.exists(nm):
            # Touch the file so enchant can open it.
            open(nm, 'w').close()
        return nm

    def setPWLContents(self, contents):
        """Set the contents of the PWL file, one word per line."""
        # Use a context manager so the handle is closed even if a write
        # fails (the original left the file open on error).
        with open(self._path(), "w") as pwlFile:
            for ln in contents:
                pwlFile.write(ln)
                pwlFile.write("\n")

    def getPWLContents(self):
        """Retrieve the contents of the PWL file as stripped lines."""
        with open(self._path(), "r") as pwlFile:
            contents = pwlFile.readlines()
        return [c.strip() for c in contents]

    def test_check(self):
        """Test that basic checking works for PWLs."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = request_pwl_dict(self._path())
        self.assertTrue(d.check("Sazz"))
        self.assertTrue(d.check("Lozz"))
        self.assertFalse(d.check("hello"))

    def test_UnicodeFN(self):
        """Test that unicode PWL filenames are accepted."""
        d = request_pwl_dict(unicode(self._path()))
        self.assertTrue(d)

    def test_add(self):
        """Test that adding words to a PWL works correctly."""
        d = request_pwl_dict(self._path())
        self.assertFalse(d.check("Flagen"))
        d.add("Esquilax")
        d.add("Esquilam")
        self.assertTrue(d.check("Esquilax"))
        # Added words must be persisted to the backing file.
        self.assertTrue("Esquilax" in self.getPWLContents())
        self.assertTrue(d.is_added("Esquilax"))

    def test_suggestions(self):
        """Test getting suggestions from a PWL."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = request_pwl_dict(self._path())
        self.assertTrue("Sazz" in d.suggest("Saz"))
        self.assertTrue("Lozz" in d.suggest("laz"))
        self.assertTrue("Sazz" in d.suggest("laz"))
        d.add("Flagen")
        self.assertTrue("Flagen" in d.suggest("Flags"))
        self.assertFalse("sazz" in d.suggest("Flags"))

    def test_DWPWL(self):
        """Test functionality of DictWithPWL."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = DictWithPWL("en_US", self._path(), self._path("pel.txt"))
        self.assertTrue(d.check("Sazz"))
        self.assertTrue(d.check("Lozz"))
        self.assertTrue(d.check("hello"))
        self.assertFalse(d.check("helo"))
        self.assertFalse(d.check("Flagen"))
        d.add("Flagen")
        self.assertTrue(d.check("Flagen"))
        self.assertTrue("Flagen" in self.getPWLContents())
        self.assertTrue("Flagen" in d.suggest("Flagn"))
        self.assertTrue("hello" in d.suggest("helo"))
        d.remove("hello")
        self.assertFalse(d.check("hello"))
        self.assertTrue("hello" not in d.suggest("helo"))
        d.remove("Lozz")
        self.assertFalse(d.check("Lozz"))

    def test_DWPWL_empty(self):
        """Test functionality of DictWithPWL using transient dicts."""
        d = DictWithPWL("en_US", None, None)
        self.assertTrue(d.check("hello"))
        self.assertFalse(d.check("helo"))
        self.assertFalse(d.check("Flagen"))
        d.add("Flagen")
        self.assertTrue(d.check("Flagen"))
        d.remove("hello")
        self.assertFalse(d.check("hello"))
        d.add("hello")
        self.assertTrue(d.check("hello"))

    def test_PyPWL(self):
        """Test our pure-python PWL implementation."""
        d = PyPWL()
        self.assertTrue(list(d._words) == [])
        d.add("hello")
        d.add("there")
        d.add("duck")
        ws = list(d._words)
        self.assertTrue(len(ws) == 3)
        self.assertTrue("hello" in ws)
        self.assertTrue("there" in ws)
        self.assertTrue("duck" in ws)
        d.remove("duck")
        # Removing an absent word must be a no-op, not an error.
        d.remove("notinthere")
        ws = list(d._words)
        self.assertTrue(len(ws) == 2)
        self.assertTrue("hello" in ws)
        self.assertTrue("there" in ws)

    def test_UnicodeCharsInPath(self):
        """Test that unicode chars in PWL paths are accepted."""
        self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
        d = request_pwl_dict(self._path())
        self.assertTrue(d)
class TestUtils(unittest.TestCase):
    """Test cases for various utility functions."""

    def test_trim_suggestions(self):
        """Test trimming a suggestion list to various maximum lengths."""
        word = "gud"
        suggs = ["good", "god", "bad+"]
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual for the py3 migration.
        self.assertEqual(trim_suggestions(word, suggs, 40), ["god", "good", "bad+"])
        self.assertEqual(trim_suggestions(word, suggs, 4), ["god", "good", "bad+"])
        self.assertEqual(trim_suggestions(word, suggs, 3), ["god", "good", "bad+"])
        self.assertEqual(trim_suggestions(word, suggs, 2), ["god", "good"])
        self.assertEqual(trim_suggestions(word, suggs, 1), ["god"])
        self.assertEqual(trim_suggestions(word, suggs, 0), [])
class TestDocStrings(unittest.TestCase):
    """Test the spelling on all docstrings we can find in this module.

    This serves two purposes - to provide a lot of test data for the
    checker routines, and to make sure we don't suffer the embarrassment
    of having spelling errors in a spellchecking package!
    """

    # Jargon words that the spellchecker must not flag as misspellings.
    WORDS = ["spellchecking", "utf", "dict", "unicode", "bytestring", "bytestrings",
             "str", "pyenchant", "ascii", "utils", "setup", "distutils", "pkg",
             "filename", "tokenization", "tuple", "tuples", "tokenizer",
             "tokenizers", "testcase", "testcases", "whitespace", "wxpython",
             "spellchecker", "dialog", "urls", "wikiwords", "enchantobject",
             "providerdesc", "spellcheck", "pwl", "aspell", "myspell",
             "docstring", "docstrings", "stopiteration", "pwls", "pypwl",
             "dictwithpwl", "skippable", "dicts", "dict's", "filenames",
             "trie", "api", "ctypes", "wxspellcheckerdialog", "stateful",
             "cmdlinechecker", "spellchecks", "callback", "clunkier", "iterator",
             "ispell", "cor", "backends"]

    def test_docstrings(self):
        """Test that all our docstrings are error-free."""
        import enchant
        import enchant.utils
        import enchant.pypwl
        import enchant.tokenize
        import enchant.tokenize.en
        import enchant.checker
        import enchant.checker.CmdLineChecker
        # The GUI checker dialogs are optional dependencies.
        try:
            import enchant.checker.GtkSpellCheckerDialog
        except ImportError:
            pass
        try:
            import enchant.checker.wxSpellCheckerDialog
        except ImportError:
            pass
        errors = []
        # Naive recursion here would blow the stack, instead we
        # simulate it with our own stack
        tocheck = [enchant]
        checked = []
        while tocheck:
            obj = tocheck.pop()
            checked.append(obj)
            newobjs = list(self._check_docstrings(obj, errors))
            tocheck.extend([obj for obj in newobjs if obj not in checked])
        self.assertEqual(len(errors), 0)

    def _check_docstrings(self, obj, errors):
        """Spellcheck obj's docstring into 'errors'; yield child objects to check."""
        import enchant
        if hasattr(obj, "__doc__"):
            # Words listed in an object's _DOC_ERRORS are expected
            # misspellings, consumed in order as they are encountered.
            skip_errors = [w for w in getattr(obj, "_DOC_ERRORS", [])]
            chkr = enchant.checker.SpellChecker("en_AU", obj.__doc__, filters=[enchant.tokenize.URLFilter])
            for err in chkr:
                if len(err.word) == 1:
                    continue
                if err.word.lower() in self.WORDS:
                    continue
                if skip_errors and skip_errors[0] == err.word:
                    skip_errors.pop(0)
                    continue
                errors.append((obj, err.word, err.wordpos))
                msg = "\nDOCSTRING SPELLING ERROR: %s %s %d %s\n" % (obj, err.word, err.wordpos, chkr.suggest())
                printf([msg], file=sys.stderr)
        # Find and yield all child objects that should be checked
        for name in dir(obj):
            if name.startswith("__"):
                continue
            child = getattr(obj, name)
            if hasattr(child, "__file__"):
                # NOTE(review): hasattr(globals(), "__file__") is always False
                # (globals() returns a plain dict), so children with __file__
                # are always skipped; possibly '"__file__" in globals()' was
                # intended — confirm before changing behavior.
                if not hasattr(globals(), "__file__"):
                    continue
                if not child.__file__.startswith(os.path.dirname(__file__)):
                    continue
            else:
                cmod = getattr(child, "__module__", None)
                if not cmod:
                    cclass = getattr(child, "__class__", None)
                    cmod = getattr(cclass, "__module__", None)
                if cmod and not cmod.startswith("enchant"):
                    continue
            yield child
class TestInstallEnv(unittest.TestCase):
    """Run all testcases in a variety of install environments."""

    def setUp(self):
        self._tempDir = self._mkdtemp()
        self._insDir = "build"

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()

    def install(self):
        """Copy the 'enchant' package into the temporary install directory."""
        import os, sys, shutil
        insdir = os.path.join(self._tempDir, self._insDir)
        os.makedirs(insdir)
        shutil.copytree("enchant", os.path.join(insdir, "enchant"))

    def runtests(self):
        """Run the package's self-tests out of the temporary install dir."""
        import os, sys
        insdir = os.path.join(self._tempDir, self._insDir)
        # NOTE(review): "str is not unicode" is a Python-2 idiom; under
        # Python 3 this relies on 'unicode' being aliased to str elsewhere
        # in this module — confirm during the py3 migration.
        if str is not unicode and isinstance(insdir, unicode):
            insdir = insdir.encode(sys.getfilesystemencoding())
        os.environ["PYTHONPATH"] = insdir
        # Running enchant/__init__.py as a script executes its self-test.
        script = os.path.join(insdir, "enchant", "__init__.py")
        res = runcmd("\"%s\" %s" % (sys.executable, script,))
        self.assertEqual(res, 0)

    def test_basic(self):
        """Test proper functioning of TestInstallEnv suite."""
        self.install()
        self.runtests()
    test_basic._DOC_ERRORS = ["TestInstallEnv"]

    def test_UnicodeInstallPath(self):
        """Test installation in a path containing unicode chars."""
        self._insDir = raw_unicode(r'test_\xe5\xe4\xf6_ing')
        self.install()
        self.runtests()
class TestPy2exe(unittest.TestCase):
    """Run all testcases inside a py2exe executable"""
    _DOC_ERRORS = ["py", "exe"]

    def setUp(self):
        self._tempDir = self._mkdtemp()

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def test_py2exe(self):
        """Test pyenchant running inside a py2exe executable."""
        import os, sys, shutil
        from os import path
        from os.path import dirname
        try:
            import py2exe
        except ImportError:
            # py2exe not installed; silently skip this test.
            return
        os.environ["PYTHONPATH"] = dirname(dirname(__file__))
        setup_py = path.join(dirname(__file__), "..", "tools", "setup.py2exe.py")
        if not path.exists(setup_py):
            return
        buildCmd = '%s %s -q py2exe --dist-dir="%s"'
        buildCmd = buildCmd % (sys.executable, setup_py, self._tempDir)
        res = runcmd(buildCmd)
        self.assertEqual(res, 0)
        # Hard-coded backslash: py2exe output is Windows-only anyway.
        testCmd = self._tempDir + "\\test_pyenchant.exe"
        self.assertTrue(os.path.exists(testCmd))
        res = runcmd(testCmd)
        self.assertEqual(res, 0)
    test_py2exe._DOC_ERRORS = ["py", "exe"]

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()
def buildtestsuite(recurse=True):
    """Assemble the full pyenchant test suite.

    When 'recurse' is true, also include the slow install-environment
    and py2exe suites.
    """
    from enchant.checker.tests import TestChecker
    from enchant.tokenize.tests import TestTokenization, TestFilters
    from enchant.tokenize.tests import TestTokenizeEN
    suite = unittest.TestSuite()
    if recurse:
        for case in (TestInstallEnv, TestPy2exe):
            suite.addTest(unittest.makeSuite(case))
    for case in (TestBroker, TestDict, TestPWL, TestUtils, TestDocStrings,
                 TestChecker, TestTokenization, TestTokenizeEN, TestFilters):
        suite.addTest(unittest.makeSuite(case))
    return suite
def runtestsuite(recurse=False):
    """Run the test suite quietly and return the TextTestRunner result."""
    runner = unittest.TextTestRunner(verbosity=0)
    return runner.run(buildtestsuite(recurse=recurse))

View File

@@ -1,536 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tokenize: String tokenization functions for PyEnchant
================================================================
An important task in spellchecking is breaking up large bodies of
text into their constituent words, each of which is then checked
for correctness. This package provides Python functions to split
strings into words according to the rules of a particular language.
Each tokenization function accepts a string as its only positional
argument, and returns an iterator that yields tuples of the following
form, one for each word found::
(<word>,<pos>)
The meanings of these fields should be clear: <word> is the word
that was found and <pos> is the position within the text at which
the word began (zero indexed, of course). The function will work
on any string-like object that supports array-slicing; in particular
character-array objects from the 'array' module may be used.
The iterator also provides the attribute 'offset' which gives the current
position of the tokenizer inside the string being split, and the method
'set_offset' for manually adjusting this position. This can be used for
example if the string's contents have changed during the tokenization
process.
To obtain an appropriate tokenization function for the language
identified by <tag>, use the function 'get_tokenizer(tag)'::
tknzr = get_tokenizer("en_US")
for (word,pos) in tknzr("text to be tokenized goes here")
do_something(word)
This library is designed to be easily extendible by third-party
authors. To register a tokenization function for the language
<tag>, implement it as the function 'tokenize' within the
module enchant.tokenize.<tag>. The 'get_tokenizer' function
will automatically detect it. Note that the underscore must be
used as the tag component separator in this case, in order to
form a valid python module name. (e.g. "en_US" rather than "en-US")
Currently, a tokenizer has only been implemented for the English
language. Based on the author's limited experience, this should
be at least partially suitable for other languages.
This module also provides various implementations of "Chunkers" and
"Filters". These classes are designed to make it easy to work with
text in a variety of common formats, by detecting and excluding parts
of the text that don't need to be checked.
A Chunker is a class designed to break a body of text into large chunks
of checkable content; for example the HTMLChunker class extracts the
text content from all HTML tags but excludes the tags themselves.
A Filter is a class designed to skip individual words during the checking
process; for example the URLFilter class skips over any words that
have the format of a URL.
For example, to spellcheck an HTML document it is necessary to split the
text into chunks based on HTML tags, and to filter out common word forms
such as URLs and WikiWords. This would look something like the following::
tknzr = get_tokenizer("en_US",(HTMLChunker,),(URLFilter,WikiWordFilter))
text = "<html><body>the url is http://example.com</body></html>"
for (word,pos) in tknzer(text):
...check each word and react accordingly...
"""
_DOC_ERRORS = ["pos", "pos", "tknzr", "URLFilter", "WikiWordFilter",
"tkns", "tknzr", "pos", "tkns"]
import re
import warnings
import enchant
from enchant.utils import next, xrange
from enchant.errors import *
# For backwards-compatability. This will eventually be removed, but how
# does one mark a module-level constant as deprecated?
Error = TokenizerNotFoundError
class tokenize:
"""Base class for all tokenizer objects.
Each tokenizer must be an iterator and provide the 'offset'
attribute as described in the documentation for this module.
While tokenizers are in fact classes, they should be treated
like functions, and so are named using lower_case rather than
the CamelCase more traditional of class names.
"""
_DOC_ERRORS = ["CamelCase"]
def __init__(self, text):
self._text = text
self._offset = 0
def __next__(self):
return self.next()
def next(self):
raise NotImplementedError()
def __iter__(self):
return self
def set_offset(self, offset, replaced=False):
self._offset = offset
def _get_offset(self):
return self._offset
def _set_offset(self, offset):
msg = "changing a tokenizers 'offset' attribute is deprecated;" \
" use the 'set_offset' method"
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
self.set_offset(offset)
offset = property(_get_offset, _set_offset)
def get_tokenizer(tag=None, chunkers=None, filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an appropriate
    module. Modules tried are named after the language tag, tried in the
    following order:

        * the entire tag (e.g. "en_AU.py")
        * the base country code of the tag (e.g. "en.py")

    If the language tag is None, a default tokenizer (actually the English
    one) is returned. It's unicode aware and should work OK for most
    latin-derived languages.

    If a suitable function cannot be found, raises TokenizerNotFoundError.

    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively. These will be applied to the
    tokenizer during creation.
    """
    if tag is None:
        tag = "en"
    # "filters" used to be the second argument. Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0], Filter)
            except TypeError:
                # Not a class at all; assume it really is a chunker.
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument " \
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' used as separator
    tag = tag.replace("-", "_")
    # First try the whole tag
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Try just the base
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag,)
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, we now build up the
    # end result as follows:
    #     * chunk the text using any given chunkers in turn
    #     * begin with basic whitespace tokenization
    #     * apply each of the given filters in turn
    #     * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        # Wrap in reverse so the first chunker ends up outermost.
        for i in xrange(len(chunkers) - 1, -1, -1):
            tokenizer = wrap_tokenizer(chunkers[i], tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer, tkFunc)
    return tokenizer
get_tokenizer._DOC_ERRORS = ["py", "py"]
class empty_tokenize(tokenize):
    """Tokenizer class that yields no elements."""
    _DOC_ERRORS = []

    def __init__(self):
        # An empty text means iteration terminates immediately.
        tokenize.__init__(self, "")

    def next(self):
        raise StopIteration()
class unit_tokenize(tokenize):
    """Tokenizer class that yields the text as a single token."""
    _DOC_ERRORS = []

    def __init__(self, text):
        tokenize.__init__(self, text)
        self._done = False

    def next(self):
        # First call yields the whole text at position 0; later calls stop.
        if not self._done:
            self._done = True
            return (self._text, 0)
        raise StopIteration()
class basic_tokenize(tokenize):
    """Tokenizer class that performs very basic word-finding.

    This tokenizer does the most basic thing that could work - it splits
    text into words based on whitespace boundaries, and removes basic
    punctuation symbols from the start and end of each word.
    """
    _DOC_ERRORS = []

    # Chars to remove from start/end of words
    strip_from_start = '"' + "'`(["
    strip_from_end = '"' + "'`]).!,?;:"

    def next(self):
        text = self._text
        offset = self._offset
        while True:
            if offset >= len(text):
                break
            # Find start of next word
            while offset < len(text) and text[offset].isspace():
                offset += 1
            sPos = offset
            # Find end of word
            while offset < len(text) and not text[offset].isspace():
                offset += 1
            ePos = offset
            # Record position BEFORE stripping, so set_offset callers see
            # the raw scan position, not the trimmed word end.
            self._offset = offset
            # Strip chars from front/end of word
            while sPos < len(text) and text[sPos] in self.strip_from_start:
                sPos += 1
            while 0 < ePos and text[ePos - 1] in self.strip_from_end:
                ePos -= 1
            # Return if word isnt empty (all-punctuation "words" loop again)
            if (sPos < ePos):
                return (text[sPos:ePos], sPos)
        raise StopIteration()
def _try_tokenizer(modName):
"""Look for a tokenizer in the named module.
Returns the function if found, None otherwise.
"""
modBase = "enchant.tokenize."
funcName = "tokenize"
modName = modBase + modName
try:
mod = __import__(modName, globals(), {}, funcName)
return getattr(mod, funcName)
except ImportError:
return None
def wrap_tokenizer(tk1, tk2):
    """Wrap one tokenizer inside another.

    This function takes two tokenizer functions 'tk1' and 'tk2',
    and returns a new tokenizer function that passes the output
    of tk1 through tk2 before yielding it to the calling code.
    """
    # A Filter already implements exactly this chaining: it applies its
    # _split() method to every token produced by the wrapped tokenizer,
    # so we just install tk2 as the split function.
    wrapper = Filter(tk1)
    wrapper._split = tk2
    return wrapper
wrap_tokenizer._DOC_ERRORS = ["tk", "tk", "tk", "tk"]
class Chunker(tokenize):
    """Base class for text chunking functions.

    A chunker is designed to chunk text into large blocks of tokens. It
    has the same interface as a tokenizer but is for a different purpose.
    """
    # No extra behaviour: chunkers reuse the tokenize iteration protocol.
    pass
class Filter(object):
    """Base class for token filtering functions.

    A filter is designed to wrap a tokenizer (or another filter) and do
    two things:

      * skip over tokens
      * split tokens into sub-tokens

    Subclasses have two basic options for customising their behaviour. The
    method _skip(word) may be overridden to return True for words that
    should be skipped, and false otherwise. The method _split(word) may
    be overridden as tokenization function that will be applied to further
    tokenize any words that aren't skipped.
    """

    def __init__(self, tokenizer):
        """Filter class constructor."""
        self._tokenizer = tokenizer

    def __call__(self, *args, **kwds):
        # Tokenize with the wrapped tokenizer, then wrap the resulting
        # iterator in our skip/split logic.
        tkn = self._tokenizer(*args, **kwds)
        return self._TokenFilter(tkn, self._skip, self._split)

    def _skip(self, word):
        """Filter method for identifying skippable tokens.

        If this method returns true, the given word will be skipped by
        the filter. This should be overridden in subclasses to produce the
        desired functionality. The default behaviour is not to skip any words.
        """
        return False

    def _split(self, word):
        """Filter method for sub-tokenization of tokens.

        This method must be a tokenization function that will split the
        given word into sub-tokens according to the needs of the filter.
        The default behaviour is not to split any words.
        """
        return unit_tokenize(word)

    class _TokenFilter(object):
        """Private inner class implementing the tokenizer-wrapping logic.

        This might seem convoluted, but we're trying to create something
        akin to a meta-class - when Filter(tknzr) is called it must return
        a *callable* that can then be applied to a particular string to
        perform the tokenization. Since we need to manage a lot of state
        during tokenization, returning a class is the best option.
        """
        _DOC_ERRORS = ["tknzr"]

        def __init__(self, tokenizer, skip, split):
            self._skip = skip
            self._split = split
            self._tokenizer = tokenizer
            # for managing state of sub-tokenization
            self._curtok = empty_tokenize()
            self._curword = ""
            self._curpos = 0

        def __iter__(self):
            return self

        def __next__(self):
            return self.next()

        def next(self):
            # Try to get the next sub-token from word currently being split.
            # If unavailable, move on to the next word and try again.
            try:
                (word, pos) = next(self._curtok)
                # Sub-token positions are relative to the current word.
                return (word, pos + self._curpos)
            except StopIteration:
                (word, pos) = next(self._tokenizer)
                while self._skip(word):
                    (word, pos) = next(self._tokenizer)
                self._curword = word
                self._curpos = pos
                self._curtok = self._split(word)
                # Recurse so the new sub-tokenizer is consumed first.
                return self.next()

        # Pass on access to 'offset' to the underlying tokenizer.
        def _get_offset(self):
            return self._tokenizer.offset

        def _set_offset(self, offset):
            msg = "changing a tokenizers 'offset' attribute is deprecated;" \
                  " use the 'set_offset' method"
            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
            self.set_offset(offset)

        offset = property(_get_offset, _set_offset)

        def set_offset(self, val, replaced=False):
            self._tokenizer.set_offset(val, replaced=replaced)
            # If we stay within the current word, also set on _curtok.
            # Otherwise, throw away _curtok and set to empty iterator.
            subval = val - self._curpos
            if subval >= 0 and subval < len(self._curword) and not replaced:
                self._curtok.set_offset(subval)
            else:
                self._curtok = empty_tokenize()
                self._curword = ""
                self._curpos = 0
# Pre-defined chunkers and filters start here
class URLFilter(Filter):
    """Filter skipping over URLs.

    This filter skips any words matching the following regular expression:

        ^[a-zA-z]+:\/\/[^\s].*

    That is, any words that are URLs.
    """
    _DOC_ERRORS = ["zA"]
    _pattern = re.compile(r"^[a-zA-z]+:\/\/[^\s].*")

    def _skip(self, word):
        # Anything shaped like scheme://... is skipped.
        return self._pattern.match(word) is not None
class WikiWordFilter(Filter):
    """Filter skipping over WikiWords.

    This filter skips any words matching the following regular expression:

        ^([A-Z]\w+[A-Z]+\w+)

    That is, any words that are WikiWords.
    """
    _pattern = re.compile(r"^([A-Z]\w+[A-Z]+\w+)")

    def _skip(self, word):
        # CamelCase-style identifiers are skipped.
        return self._pattern.match(word) is not None
class EmailFilter(Filter):
    """Filter skipping over email addresses.

    This filter skips any words matching the following regular expression:

        ^.+@[^\.].*\.[a-z]{2,}$

    That is, any words that resemble email addresses.
    """
    _pattern = re.compile(r"^.+@[^\.].*\.[a-z]{2,}$")

    def _skip(self, word):
        # Anything shaped like user@domain.tld is skipped.
        return self._pattern.match(word) is not None
class HTMLChunker(Chunker):
    """Chunker for breaking up HTML documents into chunks of checkable text.

    The operation of this chunker is very simple - anything between a "<"
    and a ">" will be ignored. Later versions may improve the algorithm
    slightly.
    """

    def next(self):
        text = self._text
        # NOTE(review): reads the public 'offset' property here, while
        # basic_tokenize reads self._offset directly — presumably
        # equivalent; confirm no subclass overrides the getter.
        offset = self.offset
        while True:
            if offset >= len(text):
                break
            # Skip to the end of the current tag, if any.
            if text[offset] == "<":
                maybeTag = offset
                if self._is_tag(text, offset):
                    while text[offset] != ">":
                        offset += 1
                        if offset == len(text):
                            # Unterminated tag: treat the "<" as text.
                            offset = maybeTag + 1
                            break
                    else:
                        # while-else: closing ">" found, step past it.
                        offset += 1
                else:
                    # Lone "<" that doesn't start a tag: treat as text.
                    offset = maybeTag + 1
            sPos = offset
            # Find the start of the next tag.
            while offset < len(text) and text[offset] != "<":
                offset += 1
            ePos = offset
            self._offset = offset
            # Return if chunk isnt empty (consecutive tags loop again)
            if (sPos < offset):
                return (text[sPos:offset], sPos)
        raise StopIteration()

    def _is_tag(self, text, offset):
        # A "<" opens a tag only if followed by a letter (element) or
        # "/" (closing tag).
        if offset + 1 < len(text):
            if text[offset + 1].isalpha():
                return True
            if text[offset + 1] == "/":
                return True
        return False
# TODO: LaTeXChunker

View File

@@ -1,172 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tokenize.en: Tokenizer for the English language
This module implements a PyEnchant text tokenizer for the English
language, based on very simple rules.
"""
import unicodedata
import enchant.tokenize
from enchant.utils import unicode
class tokenize(enchant.tokenize.tokenize):
    """Iterator splitting text into words, reporting position.

    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text.  The tuples
    take the form:

        (<word>, <pos>)

    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.

    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe ('). Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos", "pos"]

    def __init__(self, text, valid_chars=("'",)):
        # Characters (besides letters) allowed *inside* a word, never at
        # its start or end.
        self._valid_chars = valid_chars
        self._text = text
        # Current scan position; advanced by each call to next().
        self._offset = 0
        # Select proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text,unicode) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there's no characters then it doesn't matter what implementation
        # we use since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._consume_alpha = self._consume_alpha_b
        else:
            if isinstance(char1, unicode):
                self._consume_alpha = self._consume_alpha_u
            else:
                self._consume_alpha = self._consume_alpha_b

    def _consume_alpha_b(self, text, offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            # Plain ASCII letter: one byte, one character.
            return 1
        elif text[offset] >= "\x80":
            # High-bit byte: assume start of a multi-byte utf-8 sequence.
            return self._consume_alpha_utf8(text, offset)
        return 0

    def _consume_alpha_utf8(self, text, offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        # Try progressively longer byte slices (2..4 bytes) until one
        # decodes as valid utf-8; a multi-byte sequence never decodes
        # successfully until all its bytes are included.
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    # In the common case this will be a string
                    u = text[offset:offset + incr].decode("utf8")
                except AttributeError:
                    # Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset:offset + incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset:offset + incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            # Never decoded: not a letter, consume nothing.
            return 0
        if u.isalpha():
            return incr
        # Unicode category "M*" = combining mark; treat it as part of the
        # preceding letter so accents don't split words.
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0

    def _consume_alpha_u(self, text, offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string.  Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            # Swallow any combining marks (category "M*") that follow,
            # so an accented letter counts as one unit.
            while offset + incr < len(text):
                if unicodedata.category(text[offset + incr])[0] != "M":
                    break
                incr += 1
        return incr

    def next(self):
        # Python 2 iterator protocol; presumably the base class maps
        # __next__ to this on Python 3 -- TODO confirm.
        text = self._text
        offset = self._offset
        while offset < len(text):
            # Find start of next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if incr:
                    break
                offset += 1
            curPos = offset
            # Find end of word, allowing valid_chars inside it
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return if word isn't empty
            if (curPos != offset):
                # Make sure word doesn't end with a valid_char
                while text[offset - 1] in self._valid_chars:
                    offset = offset - 1
                self._offset = offset
                return (text[curPos:offset], curPos)
        self._offset = offset
        raise StopIteration()

View File

@@ -1,326 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.tokenize.tests: unittests for enchant tokenization functions.
"""
import unittest
import array
from enchant.tokenize import *
from enchant.tokenize.en import tokenize as tokenize_en
from enchant.utils import raw_unicode, unicode, bytes
class TestTokenization(unittest.TestCase):
    """TestCases for testing the basic tokenization functionality.

    The integer in each expected tuple is the absolute character offset
    of the word within the input string, so the fixture tables below are
    exact and position-sensitive.
    """

    def test_basic_tokenize(self):
        """Simple regression test for basic white-space tokenization."""
        # NOTE: continuation lines are deliberately flush-left; the
        # expected offsets below depend on the exact string layout.
        input = """This is a paragraph. It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the (handling) of 'quoted' words."""
        output = [
            ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10), ("It's", 22),
            ("not", 27), ("very", 31), ("special", 36), ("but", 45), ("it's", 49),
            ("designed", 54), ("2", 63), ("show", 65), ("how", 70), ("the", 74),
            ("splitter", 78), ("works", 87), ("with", 93), ("many-different", 98),
            ("combos", 113), ("of", 120), ("words", 123),
            ("Also", 130), ("need", 135),
            ("to", 140), ("test", 144), ("the", 150), ("handling", 155),
            ("of", 165), ("quoted", 169), ("words", 177)
        ]
        self.assertEqual(output, [i for i in basic_tokenize(input)])
        for (itmO, itmV) in zip(output, basic_tokenize(input)):
            self.assertEqual(itmO, itmV)

    def test_tokenize_strip(self):
        """Test special-char-stripping edge-cases in basic_tokenize."""
        input = "((' <this> \"\" 'text' has (lots) of (special chars} >>]"
        output = [("<this>", 4), ("text", 15), ("has", 21), ("lots", 26), ("of", 32),
                  ("special", 36), ("chars}", 44), (">>", 51)]
        self.assertEqual(output, [i for i in basic_tokenize(input)])
        for (itmO, itmV) in zip(output, basic_tokenize(input)):
            self.assertEqual(itmO, itmV)

    def test_wrap_tokenizer(self):
        """Test wrapping of one tokenizer with another."""
        input = "this-string will be split@according to diff'rnt rules"
        from enchant.tokenize import en
        tknzr = wrap_tokenizer(basic_tokenize, en.tokenize)
        tknzr = tknzr(input)
        self.assertEqual(tknzr._tokenizer.__class__, basic_tokenize)
        self.assertEqual(tknzr._tokenizer.offset, 0)
        # Iterate the wrapped tokenizer, adjusting its offset mid-stream
        # at specific token indices to exercise set_offset().
        for (n, (word, pos)) in enumerate(tknzr):
            if n == 0:
                self.assertEqual(pos, 0)
                self.assertEqual(word, "this")
            if n == 1:
                self.assertEqual(pos, 5)
                self.assertEqual(word, "string")
            if n == 2:
                self.assertEqual(pos, 12)
                self.assertEqual(word, "will")
                # Test setting offset to a previous token
                tknzr.set_offset(5)
                self.assertEqual(tknzr.offset, 5)
                self.assertEqual(tknzr._tokenizer.offset, 5)
                self.assertEqual(tknzr._curtok.__class__, empty_tokenize)
            if n == 3:
                self.assertEqual(word, "string")
                self.assertEqual(pos, 5)
            if n == 4:
                self.assertEqual(pos, 12)
                self.assertEqual(word, "will")
            if n == 5:
                self.assertEqual(pos, 17)
                self.assertEqual(word, "be")
                # Test setting offset past the current token
                tknzr.set_offset(20)
                self.assertEqual(tknzr.offset, 20)
                self.assertEqual(tknzr._tokenizer.offset, 20)
                self.assertEqual(tknzr._curtok.__class__, empty_tokenize)
            if n == 6:
                self.assertEqual(pos, 20)
                self.assertEqual(word, "split")
            if n == 7:
                self.assertEqual(pos, 26)
                self.assertEqual(word, "according")
                # Test setting offset to middle of current token
                tknzr.set_offset(23)
                self.assertEqual(tknzr.offset, 23)
                self.assertEqual(tknzr._tokenizer.offset, 23)
                self.assertEqual(tknzr._curtok.offset, 3)
            if n == 8:
                self.assertEqual(pos, 23)
                self.assertEqual(word, "it")
            # OK, I'm pretty happy with the behaviour, no need to
            # continue testing the rest of the string
class TestFilters(unittest.TestCase):
    """TestCases for the various Filter subclasses.

    All tests tokenize the shared `text` attribute below; expected
    offsets are absolute indices into that string, so its exact layout
    (including the 14-space continuation indents) matters.
    """

    text = """this text with http://url.com and SomeLinksLike
              ftp://my.site.com.au/some/file AndOthers not:/quite.a.url
              with-an@aemail.address as well"""

    def setUp(self):
        # No per-test state needed; filters are applied per-test.
        pass

    def test_URLFilter(self):
        """Test filtering of URLs"""
        tkns = get_tokenizer("en_US", filters=(URLFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("and", 30),
               ("SomeLinksLike", 34), ("AndOthers", 93), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116), ("with", 134), ("an", 139), ("aemail", 142),
               ("address", 149), ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

    def test_WikiWordFilter(self):
        """Test filtering of WikiWords"""
        tkns = get_tokenizer("en_US", filters=(WikiWordFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22), ("com", 26),
               ("and", 30), ("ftp", 62), ("my", 68), ("site", 71), ("com", 76), ("au", 80),
               ("some", 83), ("file", 88), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116), ("with", 134), ("an", 139), ("aemail", 142),
               ("address", 149), ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

    def test_EmailFilter(self):
        """Test filtering of email addresses"""
        tkns = get_tokenizer("en_US", filters=(EmailFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22), ("com", 26),
               ("and", 30), ("SomeLinksLike", 34),
               ("ftp", 62), ("my", 68), ("site", 71), ("com", 76), ("au", 80),
               ("some", 83), ("file", 88), ("AndOthers", 93), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116),
               ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

    def test_CombinedFilter(self):
        """Test several filters combined"""
        tkns = get_tokenizer("en_US", filters=(URLFilter, WikiWordFilter, EmailFilter))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10),
               ("and", 30), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116),
               ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)
class TestChunkers(unittest.TestCase):
    """TestCases for the various Chunker subclasses."""

    def test_HTMLChunker(self):
        """Test chunking of HTML: markup must be skipped, text kept.

        Expected offsets are absolute indices into `text`, so the
        string's exact layout (16-space continuation indents) matters.
        """
        text = """hello<html><head><title>my title</title></head><body>this is a
                <b>simple</b> HTML document for <p> test<i>ing</i> purposes</p>.
                It < contains > various <-- special characters.
                """
        tkns = get_tokenizer("en_US", chunkers=(HTMLChunker,))(text)
        out = [t for t in tkns]
        exp = [("hello", 0), ("my", 24), ("title", 27), ("this", 53), ("is", 58),
               ("a", 61), ("simple", 82), ("HTML", 93), ("document", 98), ("for", 107),
               ("test", 115), ("ing", 122), ("purposes", 130), ("It", 160),
               ("contains", 165), ("various", 176), ("special", 188),
               ("characters", 196)]
        self.assertEqual(out, exp)
        # Cross-check: each reported position really locates its word.
        for (word, pos) in out:
            self.assertEqual(text[pos:pos + len(word)], word)
class TestTokenizeEN(unittest.TestCase):
    """TestCases for checking behaviour of English tokenization.

    Expected tuples are (word, absolute_offset); fixture strings are
    position-sensitive, so their exact layout must not change.
    """

    def test_tokenize_en(self):
        """Simple regression test for English tokenization."""
        input = """This is a paragraph. It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the handling of 'quoted' words."""
        output = [
            ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10), ("It's", 22),
            ("not", 27), ("very", 31), ("special", 36), ("but", 45), ("it's", 49),
            ("designed", 54), ("show", 65), ("how", 70), ("the", 74),
            ("splitter", 78), ("works", 87), ("with", 93), ("many", 98),
            ("different", 103), ("combos", 113), ("of", 120), ("words", 123),
            ("Also", 130), ("need", 135),
            ("to", 140), ("test", 144), ("the", 150), ("handling", 154),
            ("of", 163), ("quoted", 167), ("words", 175)
        ]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV)

    def test_unicodeBasic(self):
        """Test tokenization of a basic unicode string."""
        input = raw_unicode(
            r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
        output = input.split(" ")
        # Strip the trailing comma from "knieen," so it matches the token.
        output[8] = output[8][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_unicodeCombining(self):
        """Test tokenization with unicode combining symbols."""
        input = raw_unicode(
            r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
        output = input.split(" ")
        # Strip the trailing comma so the word matches the token.
        output[8] = output[8][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_utf8_bytes(self):
        """Test tokenization of UTF8-encoded bytes (bug #2500184)."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_utf8_bytes_at_end(self):
        """Test tokenization of UTF8-encoded bytes at end of word."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])

    def test_utf8_bytes_in_an_array(self):
        """Test tokenization of UTF8-encoded bytes stored in an array."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        # 'c' typecode is Python-2-only char array.
        input = array.array('c', input)
        output = [array.array('c', w) for w in output]
        for (itmO, itmV) in zip(output, tokenize_en(array.array('c', input))):
            self.assertEqual(itmO, itmV[0])
            self.assertEqual(input[itmV[1]:itmV[1] + len(itmV[0])], itmO)

    def test_bug1591450(self):
        """Check for tokenization regressions identified in bug #1591450."""
        input = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
        output = [
            ("Testing", 0), ("i", 9), ("markup", 11), ("i", 19), ("and", 22),
            ("y", 27), ("i", 29), ("so", 31), ("forth", 34), ("leading", 42),
            ("dots", 50), ("and", 55), ("trail", 59), ("well", 68),
            ("you", 74), ("get", 78), ("the", 82), ("point", 86),
            ("Also", 93), ("check", 98), ("numbers", 104), ("Done", 134),
        ]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV)

    def test_bug2785373(self):
        """Testcases for bug #2785373"""
        # Only checks that tokenization completes without raising.
        input = "So, one dey when I wes 17, I left."
        for _ in tokenize_en(input):
            pass
        input = raw_unicode("So, one dey when I wes 17, I left.")
        for _ in tokenize_en(input):
            pass

    def test_finnish_text(self):
        """Test tokenizing some Finnish text.

        This really should work since there are no special rules to apply,
        just lots of non-ascii characters.
        """
        inputT = raw_unicode(
            'T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 \'tarkistaa\' sanat jotka "lainausmerkeiss\\xe4". Heittomerkki ja vaa\'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.')
        outputT = [
            (raw_unicode('T\\xe4m\\xe4'), 0), (raw_unicode('on'), 5), (raw_unicode('kappale'), 8),
            (raw_unicode('Eip\\xe4'), 17), (raw_unicode('ole'), 22), (raw_unicode('kovin'), 26),
            (raw_unicode('nen'), 34), (raw_unicode('mutta'), 39), (raw_unicode('tarkoitus'), 45),
            (raw_unicode('on'), 55), (raw_unicode('n\\xe4ytt\\xe4\\xe4'), 58), (raw_unicode('miten'), 66),
            (raw_unicode('sanastaja'), 72), (raw_unicode('toimii'), 83), (raw_unicode('useiden'), 90),
            (raw_unicode('erilaisten'), 98), (raw_unicode('sanarypp\\xe4iden'), 109), (raw_unicode('kimpussa'), 123),
            (raw_unicode('Pit\\xe4\\xe4p\\xe4'), 133), (raw_unicode('viel\\xe4'), 141), (raw_unicode('tarkistaa'), 148),
            (raw_unicode('sanat'), 159), (raw_unicode('jotka'), 165), (raw_unicode('lainausmerkeiss\\xe4'), 172),
            (raw_unicode('Heittomerkki'), 191), (raw_unicode('ja'), 204), (raw_unicode("vaa'an"), 207),
            (raw_unicode('Ulkomaisia'), 215), (raw_unicode('sanoja'), 226), (raw_unicode('s\\xfcss'), 233),
            (raw_unicode('spa\\xdf'), 239), ]
        for (itmO, itmV) in zip(outputT, tokenize_en(inputT)):
            self.assertEqual(itmO, itmV)

View File

@@ -1,354 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008 Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.utils: Misc utilities for the enchant package
========================================================
This module provides miscellaneous utilities for use with the
enchant spellchecking package. Currently available functionality
includes:
* string/unicode compatibility wrappers
* functions for dealing with locale/language settings
* ability to list supporting data files (win32 only)
* functions for bundling supporting data files from a build
"""
import os
import sys
import codecs
from enchant.errors import *
# Attempt to access local language information
try:
import locale
except ImportError:
locale = None
#
# Unicode/Bytes compatabilty wrappers.
#
# These allow us to support both Python 2.x and Python 3.x from
# the same codebase.
#
# We provide explicit type objects "bytes" and "unicode" that can be
# used to construct instances of the appropriate type. The class
# "EnchantStr" derives from the default "str" type and implements the
# necessary logic for encoding/decoding as strings are passed into
# the underlying C library (where they must always be utf-8 encoded
# byte strings).
#
# Probe for the Python-2-only name `unicode` to decide which branch of
# the compatibility aliases to install.  After this block the module
# always exposes: str, unicode, bytes, basestring.
try:
    unicode = unicode
except NameError:
    # Python 3: text is `str`, binary is `bytes`; `unicode` becomes an
    # alias for `str` and `basestring` a tuple usable with isinstance().
    str = str
    unicode = str
    bytes = bytes
    basestring = (str, bytes)
else:
    # Python 2: `str` is the byte string type, so `bytes` aliases it;
    # `unicode` and `basestring` are the native builtins.
    str = str
    unicode = unicode
    bytes = str
    basestring = basestring
def raw_unicode(raw):
    r"""Build a unicode string from a raw string of escape sequences.

    Takes a string containing ``\uXXXX``-style escape characters and
    returns the corresponding unicode string.  This lets source code
    spell unicode literals as ``raw_unicode(r"hello\u2149")``, which
    behaves the same on both Python 2 and Python 3 (where ``u""``
    literals and byte-oriented escapes differ).
    """
    # Round-trip through utf-8 bytes so the unicode-escape codec can
    # interpret the backslash sequences.
    as_bytes = raw.encode("utf8")
    return as_bytes.decode("unicode-escape")
def raw_bytes(raw):
    r"""Build a bytes object from a raw string of byte escapes.

    Analogous to raw_unicode(), but interprets ``\xNN``-style byte
    escape sequences and returns a bytes object.
    """
    # escape_decode returns (decoded_bytes, length_consumed); only the
    # decoded payload is of interest here.
    decoded, _consumed = codecs.escape_decode(raw)
    return decoded
class EnchantStr(str):
    """String subclass for interfacing with enchant C library.

    This class encapsulates the logic for interfacing between python native
    string/unicode objects and the underlying enchant library, which expects
    all strings to be UTF-8 character arrays.  It is a subclass of the
    default string class 'str' - on Python 2.x that makes it an ascii string,
    on Python 3.x it is a unicode object.

    Initialise it with a string or unicode object, and use the encode() method
    to obtain an object suitable for passing to the underlying C library.
    When strings are read back into python, use decode(s) to translate them
    back into the appropriate python-level string type.

    This allows us to following the common Python 2.x idiom of returning
    unicode when unicode is passed in, and byte strings otherwise.  It also
    lets the interface be upwards-compatible with Python 3, in which string
    objects are unicode by default.
    """

    def __new__(cls, value):
        """EnchantStr data constructor.

        This method records whether the initial string was unicode, then
        simply passes it along to the default string constructor.
        """
        if type(value) is unicode:
            was_unicode = True
            if str is not unicode:
                # Python 2: store as a utf-8 byte string internally.
                value = value.encode("utf-8")
        else:
            was_unicode = False
            if str is not bytes:
                # Python 3: only unicode input is acceptable.
                raise Error("Don't pass bytestrings to pyenchant")
        self = str.__new__(cls, value)
        # Remembered so decode() can mirror the caller's input type.
        self._was_unicode = was_unicode
        return self

    def encode(self):
        """Encode this string into a form usable by the enchant C library."""
        if str is unicode:
            # Python 3: produce the utf-8 bytes the C library expects.
            return str.encode(self, "utf-8")
        else:
            # Python 2: already stored as utf-8 bytes.
            return self

    def decode(self, value):
        """Decode a string returned by the enchant C library."""
        if self._was_unicode:
            if str is unicode:
                # On some python3 versions, ctypes converts c_char_p
                # to str() rather than bytes()
                if isinstance(value, str):
                    value = value.encode()
                return value.decode("utf-8")
            else:
                return value.decode("utf-8")
        else:
            # Caller passed bytes originally; hand back raw bytes.
            return value
def printf(values, sep=" ", end="\n", file=None):
    """Compatability wrapper for the print statement/function.

    Writes *values* (stringified and joined with *sep*) followed by
    *end* to *file*, defaulting to sys.stdout.  Works identically on
    Python 2 and Python 3.
    """
    target = sys.stdout if file is None else file
    rendered = sep.join(str(v) for v in values)
    target.write(rendered)
    target.write(end)
# Provide `next` on Python versions that predate the builtin (< 2.6).
try:
    next = next
except NameError:
    def next(iter):
        """Compatibility wrapper for advancing an iterator."""
        return iter.next()

# Provide `xrange` on Python 3, where it was renamed to `range`.
try:
    xrange = xrange
except NameError:
    xrange = range
#
# Other useful functions.
#
def levenshtein(s1, s2):
    """Calculate the Levenshtein distance between two strings.

    This is the standard dynamic-programming formulation (straight from
    Wikipedia), keeping only the previous row of the edit-distance
    matrix so memory usage is O(min(len(s1), len(s2))).

    Returns the number of single-character insertions, deletions and
    substitutions needed to turn s1 into s2.
    """
    # Ensure s2 is the shorter string so the row we keep is minimal.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1:
        # Both strings empty (s1 is the longer one here).
        return len(s2)
    # NOTE: uses the builtin `range` directly instead of the module's
    # `xrange` compat alias -- identical behaviour on Python 2 and 3,
    # and the function no longer depends on the module-level shim.
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # Substitution is free when the characters match.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]
def trim_suggestions(word, suggs, maxlen, calcdist=None):
    """Trim a list of suggestions to a maximum length.

    If the list of suggested words is too long, this function trims it
    down to at most *maxlen* entries, keeping the "best" suggestions as
    judged by similarity to the original *word*.

    The optional *calcdist* argument, if given, must be a callable
    taking two words and returning the distance between them; it
    determines which words are retained.  The default is a simple
    Levenshtein distance.
    """
    distance = levenshtein if calcdist is None else calcdist
    # Sort by (distance, suggestion) so ties break alphabetically,
    # matching a plain tuple sort.
    ranked = sorted(suggs, key=lambda sugg: (distance(word, sugg), sugg))
    return ranked[:maxlen]
def get_default_language(default=None):
    """Determine the user's default language, if possible.

    This function uses the 'locale' module to try to determine
    the user's preferred language.  The return value is as follows:

        * if a locale is available for the LC_MESSAGES category,
          that language is used
        * if a default locale is available, that language is used
        * if the keyword argument <default> is given, it is used
        * if nothing else works, None is returned

    Note that determining the user's language is in general only
    possible if they have set the necessary environment variables
    on their system.
    """
    try:
        import locale
        tag = locale.getlocale()[0]
        if tag is not None:
            return tag
        tag = locale.getdefaultlocale()[0]
        if tag is not None:
            return tag
        raise Error("No default language available")
    except Exception:
        # Any failure (including the Error above) falls back to the
        # caller-supplied default.
        pass
    return default


get_default_language._DOC_ERRORS = ["LC"]
def get_resource_filename(resname):
    """Get the absolute path to the named resource file.

    This serves widely the same purpose as pkg_resources.resource_filename(),
    but tries to avoid loading pkg_resources unless we're actually in
    an egg.  Raises Error if the resource cannot be located.
    """
    # First preference: the file sitting next to this module.
    local_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), resname)
    if os.path.exists(local_path):
        return local_path
    if hasattr(sys, "frozen"):
        # Frozen executable (e.g. py2exe): look beside the executable.
        exe_path = unicode(sys.executable, sys.getfilesystemencoding())
        frozen_path = os.path.join(os.path.dirname(exe_path), resname)
        if os.path.exists(frozen_path):
            return frozen_path
    else:
        # Probably running from an egg; defer to pkg_resources.
        import pkg_resources
        try:
            egg_path = pkg_resources.resource_filename("enchant", resname)
        except KeyError:
            pass
        else:
            egg_path = os.path.abspath(egg_path)
            if os.path.exists(egg_path):
                return egg_path
    raise Error("Could not locate resource '%s'" % (resname,))
def win32_data_files():
    """Get list of supporting data files, for use with setup.py

    This function returns a list of the supporting data files available
    to the running version of PyEnchant.  This is in the format expected
    by the data_files argument of the distutils setup function.  It's
    very useful, for example, for including the data files in an executable
    produced by py2exe.

    Only really tested on the win32 platform (it's the only platform for
    which we ship our own supporting data files).
    """
    # Include the main enchant DLL (name differs between releases).
    try:
        main_dll = get_resource_filename("libenchant.dll")
    except Error:
        main_dll = get_resource_filename("libenchant-1.dll")
    base_dir = os.path.dirname(main_dll)
    data_files = [('', [main_dll])]
    # And some specific supporting DLLs shipped alongside it.
    supporting = ("iconv", "intl", "libglib", "libgmodule")
    for entry in os.listdir(base_dir):
        if entry.endswith(".dll") and entry.startswith(supporting):
            data_files[0][1].append(os.path.join(base_dir, entry))
    # And anything found in the supporting data directories.
    for rel_dir in ("share/enchant/myspell", "share/enchant/ispell", "lib/enchant"):
        abs_dir = os.path.join(base_dir, os.path.normpath(rel_dir))
        found = [os.path.join(abs_dir, fn)
                 for fn in os.listdir(abs_dir)
                 if os.path.isfile(os.path.join(abs_dir, fn))]
        data_files.append((rel_dir, found))
    return data_files


win32_data_files._DOC_ERRORS = ["py", "py", "exe"]

264
PACK/maskgen.py Normal file → Executable file
View File

@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/python3
# MaskGen - Generate Password Masks
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -36,6 +36,11 @@ class MaskGen:
self.minoccurrence = None
self.maxoccurrence = None
self.customcharset1len = None
self.customcharset2len = None
self.customcharset3len = None
self.customcharset4len = None
# PPS (Passwords per Second) Cracking Speed
self.pps = 1000000000
self.showmasks = False
@@ -57,42 +62,60 @@ class MaskGen:
count *= 33
elif char == "a":
count *= 95
elif char == "b":
count *= 256
elif char == "h":
count *= 16
elif char == "H":
count *= 16
elif char == "1" and self.customcharset1len:
count *= self.customcharset1len
elif char == "2" and self.customcharset2len:
count *= self.customcharset2len
elif char == "3" and self.customcharset3len:
count *= self.customcharset3len
elif char == "4" and self.customcharset4len:
count *= self.customcharset4len
else:
print
"[!] Error, unknown mask ?%s in a mask %s" % (char, mask)
print("[!] Error, unknown mask ?%s in a mask %s" %
(char, mask))
return count
def loadmasks(self, filename):
""" Load masks and apply filters. """
maskReader = csv.reader(open(args[0], 'r'), delimiter=',', quotechar='"')
maskReader = csv.reader(
open(args[0], 'r'), delimiter=',', quotechar='"')
for (mask, occurrence) in maskReader:
if mask == "": continue
if mask == "":
continue
mask_occurrence = int(occurrence)
mask_length = len(mask) / 2
mask_length = len(mask)/2
mask_complexity = self.getcomplexity(mask)
mask_time = mask_complexity / self.pps
mask_time = mask_complexity/self.pps
self.total_occurrence += mask_occurrence
# Apply filters based on occurrence, length, complexity and time
if (self.minoccurrence == None or mask_occurrence >= self.minoccurrence) and \
(self.maxoccurrence == None or mask_occurrence <= self.maxoccurrence) and \
(self.mincomplexity == None or mask_complexity <= self.mincomplexity) and \
(self.maxcomplexity == None or mask_complexity <= self.maxcomplexity) and \
(self.mintime == None or mask_time <= self.mintime) and \
(self.maxtime == None or mask_time <= self.maxtime) and \
(self.maxlength == None or mask_length <= self.maxlength) and \
(self.minlength == None or mask_length >= self.minlength):
(self.maxoccurrence == None or mask_occurrence <= self.maxoccurrence) and \
(self.mincomplexity == None or mask_complexity >= self.mincomplexity) and \
(self.maxcomplexity == None or mask_complexity <= self.maxcomplexity) and \
(self.mintime == None or mask_time >= self.mintime) and \
(self.maxtime == None or mask_time <= self.maxtime) and \
(self.maxlength == None or mask_length <= self.maxlength) and \
(self.minlength == None or mask_length >= self.minlength):
self.masks[mask] = dict()
self.masks[mask]['length'] = mask_length
self.masks[mask]['occurrence'] = mask_occurrence
self.masks[mask]['complexity'] = 1 - mask_complexity
self.masks[mask]['time'] = mask_time
self.masks[mask]['optindex'] = 1 - mask_complexity / mask_occurrence
self.masks[mask]['optindex'] = 1 - \
mask_complexity/mask_occurrence
def generate_masks(self, sorting_mode):
""" Generate optimal password masks sorted by occurrence, complexity or optindex """
@@ -104,16 +127,15 @@ class MaskGen:
# Group by length 1,2,3,4,5,6,7,8,9,10....
# Group by occurrence 10%, 20%, 30%, 40%, 50%....
if self.showmasks: print
"[L:] Mask: [ Occ: ] [ Time: ]"
for mask in sorted(self.masks.keys(), key=lambda mask: self.masks[mask][sorting_mode], reverse=True):
if self.showmasks:
print("[L:] Mask: [ Occ: ] [ Time: ]")
for mask in sorted(list(self.masks.keys()), key=lambda mask: self.masks[mask][sorting_mode], reverse=True):
if self.showmasks:
time_human = ">1 year" if self.masks[mask]['time'] > 60 * 60 * 24 * 365 else str(
time_human = ">1 year" if self.masks[mask]['time'] > 60*60*24*365 else str(
datetime.timedelta(seconds=self.masks[mask]['time']))
print
"[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(self.masks[mask]['length'], mask,
self.masks[mask]['occurrence'], time_human)
print("[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(
self.masks[mask]['length'], mask, self.masks[mask]['occurrence'], time_human))
if self.output_file:
self.output_file.write("%s\n" % mask)
@@ -123,20 +145,16 @@ class MaskGen:
sample_count += 1
if self.target_time and sample_time > self.target_time:
print
"[!] Target time exceeded."
print("[!] Target time exceeded.")
break
print
"[*] Finished generating masks:"
print
" Masks generated: %s" % sample_count
print
" Masks coverage: %d%% (%d/%d)" % (
sample_occurrence * 100 / self.total_occurrence, sample_occurrence, self.total_occurrence)
time_human = ">1 year" if sample_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=sample_time))
print
" Masks runtime: %s" % time_human
print("[*] Finished generating masks:")
print(" Masks generated: %s" % sample_count)
print(" Masks coverage: %d%% (%d/%d)" % (sample_occurrence*100 /
self.total_occurrence, sample_occurrence, self.total_occurrence))
time_human = ">1 year" if sample_time > 60*60*24 * \
365 else str(datetime.timedelta(seconds=sample_time))
print(" Masks runtime: %s" % time_human)
def getmaskscoverage(self, checkmasks):
@@ -145,8 +163,8 @@ class MaskGen:
total_complexity = 0
if self.showmasks: print
"[L:] Mask: [ Occ: ] [ Time: ]"
if self.showmasks:
print("[L:] Mask: [ Occ: ] [ Time: ]")
for mask in checkmasks:
mask = mask.strip()
mask_complexity = self.getcomplexity(mask)
@@ -156,11 +174,10 @@ class MaskGen:
if mask in self.masks:
if self.showmasks:
time_human = ">1 year" if self.masks[mask]['time'] > 60 * 60 * 24 * 365 else str(
time_human = ">1 year" if self.masks[mask]['time'] > 60*60*24*365 else str(
datetime.timedelta(seconds=self.masks[mask]['time']))
print
"[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(self.masks[mask]['length'], mask,
self.masks[mask]['occurrence'], time_human)
print("[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(
self.masks[mask]['length'], mask, self.masks[mask]['occurrence'], time_human))
if self.output_file:
self.output_file.write("%s\n" % mask)
@@ -168,23 +185,19 @@ class MaskGen:
sample_occurrence += self.masks[mask]['occurrence']
sample_count += 1
if self.target_time and total_complexity / self.pps > self.target_time:
print
"[!] Target time exceeded."
if self.target_time and total_complexity/self.pps > self.target_time:
print("[!] Target time exceeded.")
break
# TODO: Something wrong here, complexity and time doesn't match with estimated from policygen
total_time = total_complexity / self.pps
time_human = ">1 year" if total_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=total_time))
print
"[*] Finished matching masks:"
print
" Masks matched: %s" % sample_count
print
" Masks coverage: %d%% (%d/%d)" % (
sample_occurrence * 100 / self.total_occurrence, sample_occurrence, self.total_occurrence)
print
" Masks runtime: %s" % time_human
total_time = total_complexity/self.pps
time_human = ">1 year" if total_time > 60*60*24 * \
365 else str(datetime.timedelta(seconds=total_time))
print("[*] Finished matching masks:")
print(" Masks matched: %s" % sample_count)
print(" Masks coverage: %d%% (%d/%d)" % (sample_occurrence*100 /
self.total_occurrence, sample_occurrence, self.total_occurrence))
print(" Masks runtime: %s" % time_human)
if __name__ == "__main__":
@@ -199,85 +212,127 @@ if __name__ == "__main__":
header += " |_| iphelix@thesprawl.org\n"
header += "\n"
parser = OptionParser("%prog pass0.masks [pass1.masks ...] [options]", version="%prog " + VERSION)
parser = OptionParser(
"%prog pass0.masks [pass1.masks ...] [options]", version="%prog "+VERSION)
parser.add_option("-t", "--targettime", dest="target_time", type="int", metavar="86400",
help="Target time of all masks (seconds)")
parser.add_option("-o", "--outputmasks", dest="output_masks", metavar="masks.hcmask", help="Save masks to a file")
parser.add_option("-t", "--targettime", dest="target_time", type="int",
metavar="86400", help="Target time of all masks (seconds)")
parser.add_option("-o", "--outputmasks", dest="output_masks",
metavar="masks.hcmask", help="Save masks to a file")
filters = OptionGroup(parser, "Individual Mask Filter Options")
filters.add_option("--minlength", dest="minlength", type="int", metavar="8", help="Minimum password length")
filters.add_option("--maxlength", dest="maxlength", type="int", metavar="8", help="Maximum password length")
filters.add_option("--mintime", dest="mintime", type="int", metavar="3600", help="Minimum mask runtime (seconds)")
filters.add_option("--maxtime", dest="maxtime", type="int", metavar="3600", help="Maximum mask runtime (seconds)")
filters.add_option("--mincomplexity", dest="mincomplexity", type="int", metavar="1", help="Minimum complexity")
filters.add_option("--maxcomplexity", dest="maxcomplexity", type="int", metavar="100", help="Maximum complexity")
filters.add_option("--minoccurrence", dest="minoccurrence", type="int", metavar="1", help="Minimum occurrence")
filters.add_option("--maxoccurrence", dest="maxoccurrence", type="int", metavar="100", help="Maximum occurrence")
filters.add_option("--minlength", dest="minlength",
type="int", metavar="8", help="Minimum password length")
filters.add_option("--maxlength", dest="maxlength",
type="int", metavar="8", help="Maximum password length")
filters.add_option("--mintime", dest="mintime", type="int",
metavar="3600", help="Minimum mask runtime (seconds)")
filters.add_option("--maxtime", dest="maxtime", type="int",
metavar="3600", help="Maximum mask runtime (seconds)")
filters.add_option("--mincomplexity", dest="mincomplexity",
type="int", metavar="1", help="Minimum complexity")
filters.add_option("--maxcomplexity", dest="maxcomplexity",
type="int", metavar="100", help="Maximum complexity")
filters.add_option("--minoccurrence", dest="minoccurrence",
type="int", metavar="1", help="Minimum occurrence")
filters.add_option("--maxoccurrence", dest="maxoccurrence",
type="int", metavar="100", help="Maximum occurrence")
parser.add_option_group(filters)
sorting = OptionGroup(parser, "Mask Sorting Options")
sorting.add_option("--optindex", action="store_true", dest="optindex", help="sort by mask optindex (default)",
default=False)
sorting.add_option("--occurrence", action="store_true", dest="occurrence", help="sort by mask occurrence",
default=False)
sorting.add_option("--complexity", action="store_true", dest="complexity", help="sort by mask complexity",
default=False)
sorting.add_option("--optindex", action="store_true", dest="optindex",
help="sort by mask optindex (default)", default=False)
sorting.add_option("--occurrence", action="store_true", dest="occurrence",
help="sort by mask occurrence", default=False)
sorting.add_option("--complexity", action="store_true", dest="complexity",
help="sort by mask complexity", default=False)
parser.add_option_group(sorting)
coverage = OptionGroup(parser, "Check mask coverage")
coverage.add_option("--checkmasks", dest="checkmasks", help="check mask coverage",
metavar="?u?l?l?l?l?l?d,?l?l?l?l?l?d?d")
coverage.add_option("--checkmasksfile", dest="checkmasks_file", help="check mask coverage in a file",
metavar="masks.hcmask")
coverage.add_option("--checkmasks", dest="checkmasks",
help="check mask coverage", metavar="?u?l?l?l?l?l?d,?l?l?l?l?l?d?d")
coverage.add_option("--checkmasksfile", dest="checkmasks_file",
help="check mask coverage in a file", metavar="masks.hcmask")
parser.add_option_group(coverage)
parser.add_option("--showmasks", dest="showmasks", help="Show matching masks", action="store_true", default=False)
parser.add_option("--showmasks", dest="showmasks",
help="Show matching masks", action="store_true", default=False)
custom = OptionGroup(parser, "Custom charater set options")
custom.add_option("--custom-charset1-len", dest="customcharset1len",
type="int", metavar="26", help="Length of cutom character set 1")
custom.add_option("--custom-charset2-len", dest="customcharset2len",
type="int", metavar="26", help="Length of cutom character set 2")
custom.add_option("--custom-charset3-len", dest="customcharset3len",
type="int", metavar="26", help="Length of cutom character set 3")
custom.add_option("--custom-charset4-len", dest="customcharset4len",
type="int", metavar="26", help="Length of cutom character set 4")
parser.add_option_group(custom)
misc = OptionGroup(parser, "Miscellaneous options")
misc.add_option("--pps", dest="pps", help="Passwords per Second", type="int", metavar="1000000000")
misc.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Don't show headers.")
misc.add_option("--pps", dest="pps", help="Passwords per Second",
type="int", metavar="1000000000")
misc.add_option("-q", "--quiet", action="store_true",
dest="quiet", default=False, help="Don't show headers.")
parser.add_option_group(misc)
(options, args) = parser.parse_args()
# Print program header
if not options.quiet:
print
header
print(header)
if len(args) < 1:
parser.error("no masks file specified! Please provide statsgen output.")
parser.error(
"no masks file specified! Please provide statsgen output.")
exit(1)
print
"[*] Analyzing masks in [%s]" % args[0]
print("[*] Analyzing masks in [%s]" % args[0])
maskgen = MaskGen()
# Settings
if options.target_time: maskgen.target_time = options.target_time
if options.target_time:
maskgen.target_time = options.target_time
if options.output_masks:
print
"[*] Saving generated masks to [%s]" % options.output_masks
print("[*] Saving generated masks to [%s]" % options.output_masks)
maskgen.output_file = open(options.output_masks, 'w')
# Filters
if options.minlength: maskgen.minlength = options.minlength
if options.maxlength: maskgen.maxlength = options.maxlength
if options.mintime: maskgen.mintime = options.mintime
if options.maxtime: maskgen.maxtime = options.maxtime
if options.mincomplexity: maskgen.mincomplexity = options.mincomplexity
if options.maxcomplexity: maskgen.maxcomplexity = options.maxcomplexity
if options.minoccurrence: maskgen.minoccurrence = options.minoccurrence
if options.maxoccurrence: maskgen.maxoccurrence = options.maxoccurrence
if options.minlength:
maskgen.minlength = options.minlength
if options.maxlength:
maskgen.maxlength = options.maxlength
if options.mintime:
maskgen.mintime = options.mintime
if options.maxtime:
maskgen.maxtime = options.maxtime
if options.mincomplexity:
maskgen.mincomplexity = options.mincomplexity
if options.maxcomplexity:
maskgen.maxcomplexity = options.maxcomplexity
if options.minoccurrence:
maskgen.minoccurrence = options.minoccurrence
if options.maxoccurrence:
maskgen.maxoccurrence = options.maxoccurrence
# Custom
if options.customcharset1len:
maskgen.customcharset1len = options.customcharset1len
if options.customcharset2len:
maskgen.customcharset2len = options.customcharset2len
if options.customcharset3len:
maskgen.customcharset3len = options.customcharset3len
if options.customcharset4len:
maskgen.customcharset4len = options.customcharset4len
# Misc
if options.pps: maskgen.pps = options.pps
if options.showmasks: maskgen.showmasks = options.showmasks
if options.pps:
maskgen.pps = options.pps
if options.showmasks:
maskgen.showmasks = options.showmasks
print
"[*] Using {:,d} keys/sec for calculations.".format(maskgen.pps)
print("[*] Using {:,d} keys/sec for calculations.".format(maskgen.pps))
# Load masks
for arg in args:
@@ -286,15 +341,15 @@ if __name__ == "__main__":
# Matching masks from the command-line
if options.checkmasks:
checkmasks = [m.strip() for m in options.checkmasks.split(',')]
print
"[*] Checking coverage of the these masks [%s]" % ", ".join(checkmasks)
print("[*] Checking coverage of the these masks [%s]" %
", ".join(checkmasks))
maskgen.getmaskscoverage(checkmasks)
# Matching masks from a file
elif options.checkmasks_file:
checkmasks_file = open(options.checkmasks_file, 'r')
print
"[*] Checking coverage of masks in [%s]" % options.checkmasks_file
print("[*] Checking coverage of masks in [%s]" %
options.checkmasks_file)
maskgen.getmaskscoverage(checkmasks_file)
# Printing masks in a file
@@ -307,6 +362,5 @@ if __name__ == "__main__":
else:
sorting_mode = "optindex"
print
"[*] Sorting masks by their [%s]." % sorting_mode
print("[*] Sorting masks by their [%s]." % sorting_mode)
maskgen.generate_masks(sorting_mode)

130
PACK/policygen.py Normal file → Executable file
View File

@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python3
# PolicyGen - Analyze and Generate password masks according to a password policy
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -10,7 +10,9 @@
#
# Please see the attached LICENSE file for additional licensing information.
import sys, string, random
import sys
import string
import random
import datetime
from optparse import OptionParser, OptionGroup
import itertools
@@ -52,7 +54,7 @@ class PolicyGen:
elif char == "a":
count *= 95
else:
print
print()
"[!] Error, unknown mask ?%s in a mask %s" % (char, mask)
return count
@@ -69,8 +71,8 @@ class PolicyGen:
sample_complexity = 0
# TODO: Randomize or even statistically arrange matching masks
for length in xrange(self.minlength, self.maxlength + 1):
print
for length in range(self.minlength, self.maxlength + 1):
print()
"[*] Generating %d character password masks." % length
total_length_count = 0
sample_length_count = 0
@@ -106,14 +108,14 @@ class PolicyGen:
# Filter according to password policy
# NOTE: Perform exact opposite (XOR) operation if noncompliant
# flag was set when calling the function.
if ((self.minlower == None or lowercount >= self.minlower) and \
(self.maxlower == None or lowercount <= self.maxlower) and \
(self.minupper == None or uppercount >= self.minupper) and \
(self.maxupper == None or uppercount <= self.maxupper) and \
(self.mindigit == None or digitcount >= self.mindigit) and \
(self.maxdigit == None or digitcount <= self.maxdigit) and \
(self.minspecial == None or specialcount >= self.minspecial) and \
(self.maxspecial == None or specialcount <= self.maxspecial)) ^ noncompliant:
if ((self.minlower == None or lowercount >= self.minlower) and
(self.maxlower == None or lowercount <= self.maxlower) and
(self.minupper == None or uppercount >= self.minupper) and
(self.maxupper == None or uppercount <= self.maxupper) and
(self.mindigit == None or digitcount >= self.mindigit) and
(self.maxdigit == None or digitcount <= self.maxdigit) and
(self.minspecial == None or specialcount >= self.minspecial) and
(self.maxspecial == None or specialcount <= self.maxspecial)) ^ noncompliant:
sample_length_count += 1
sample_length_complexity += mask_complexity
@@ -122,10 +124,9 @@ class PolicyGen:
mask_time = mask_complexity / self.pps
time_human = ">1 year" if mask_time > 60 * 60 * 24 * 365 else str(
datetime.timedelta(seconds=mask_time))
print
"[{:>2}] {:<30} [l:{:>2} u:{:>2} d:{:>2} s:{:>2}] [{:>8}] ".format(length, mask, lowercount,
print("[{:>2}] {:<30} [l:{:>2} u:{:>2} d:{:>2} s:{:>2}] [{:>8}] ".format(length, mask, lowercount,
uppercount, digitcount,
specialcount, time_human)
specialcount, time_human))
if self.output_file:
self.output_file.write("%s\n" % mask)
@@ -137,15 +138,14 @@ class PolicyGen:
sample_complexity += sample_length_complexity
total_time = total_complexity / self.pps
total_time_human = ">1 year" if total_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=total_time))
print
"[*] Total Masks: %d Time: %s" % (total_count, total_time_human)
total_time_human = ">1 year" if total_time > 60 * 60 * 24 * \
365 else str(datetime.timedelta(seconds=total_time))
print("[*] Total Masks: %d Time: %s" % (total_count, total_time_human))
sample_time = sample_complexity / self.pps
sample_time_human = ">1 year" if sample_time > 60 * 60 * 24 * 365 else str(
datetime.timedelta(seconds=sample_time))
print
"[*] Policy Masks: %d Time: %s" % (sample_count, sample_time_human)
print("[*] Policy Masks: %d Time: %s" % (sample_count, sample_time_human))
if __name__ == "__main__":
@@ -161,10 +161,14 @@ if __name__ == "__main__":
header += "\n"
# parse command line arguments
parser = OptionParser("%prog [options]\n\nType --help for more options", version="%prog " + VERSION)
parser.add_option("-o", "--outputmasks", dest="output_masks", help="Save masks to a file", metavar="masks.hcmask")
parser.add_option("--pps", dest="pps", help="Passwords per Second", type="int", metavar="1000000000")
parser.add_option("--showmasks", dest="showmasks", help="Show matching masks", action="store_true", default=False)
parser = OptionParser(
"%prog [options]\n\nType --help for more options", version="%prog " + VERSION)
parser.add_option("-o", "--outputmasks", dest="output_masks",
help="Save masks to a file", metavar="masks.hcmask")
parser.add_option("--pps", dest="pps", help="Passwords per Second",
type="int", metavar="1000000000")
parser.add_option("--showmasks", dest="showmasks",
help="Show matching masks", action="store_true", default=False)
parser.add_option("--noncompliant", dest="noncompliant", help="Generate masks for noncompliant passwords",
action="store_true", default=False)
@@ -174,14 +178,16 @@ if __name__ == "__main__":
help="Minimum password length")
group.add_option("--maxlength", dest="maxlength", type="int", metavar="8", default=8,
help="Maximum password length")
group.add_option("--mindigit", dest="mindigit", type="int", metavar="1", help="Minimum number of digits")
group.add_option("--mindigit", dest="mindigit", type="int",
metavar="1", help="Minimum number of digits")
group.add_option("--minlower", dest="minlower", type="int", metavar="1",
help="Minimum number of lower-case characters")
group.add_option("--minupper", dest="minupper", type="int", metavar="1",
help="Minimum number of upper-case characters")
group.add_option("--minspecial", dest="minspecial", type="int", metavar="1",
help="Minimum number of special characters")
group.add_option("--maxdigit", dest="maxdigit", type="int", metavar="3", help="Maximum number of digits")
group.add_option("--maxdigit", dest="maxdigit", type="int",
metavar="3", help="Maximum number of digits")
group.add_option("--maxlower", dest="maxlower", type="int", metavar="3",
help="Maximum number of lower-case characters")
group.add_option("--maxupper", dest="maxupper", type="int", metavar="3",
@@ -190,54 +196,62 @@ if __name__ == "__main__":
help="Maximum number of special characters")
parser.add_option_group(group)
parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Don't show headers.")
parser.add_option("-q", "--quiet", action="store_true",
dest="quiet", default=False, help="Don't show headers.")
(options, args) = parser.parse_args()
# Print program header
if not options.quiet:
print
print()
header
policygen = PolicyGen()
# Settings
# Settings
if options.output_masks:
print
"[*] Saving generated masks to [%s]" % options.output_masks
print("[*] Saving generated masks to [%s]" % options.output_masks)
policygen.output_file = open(options.output_masks, 'w')
# Password policy
if options.minlength != None: policygen.minlength = options.minlength
if options.maxlength != None: policygen.maxlength = options.maxlength
if options.mindigit != None: policygen.mindigit = options.mindigit
if options.minlower != None: policygen.minlower = options.minlower
if options.minupper != None: policygen.minupper = options.minupper
if options.minspecial != None: policygen.minspecial = options.minspecial
if options.maxdigit != None: policygen.maxdigits = options.maxdigit
if options.maxlower != None: policygen.maxlower = options.maxlower
if options.maxupper != None: policygen.maxupper = options.maxupper
if options.maxspecial != None: policygen.maxspecial = options.maxspecial
if options.minlength != None:
policygen.minlength = options.minlength
if options.maxlength != None:
policygen.maxlength = options.maxlength
if options.mindigit != None:
policygen.mindigit = options.mindigit
if options.minlower != None:
policygen.minlower = options.minlower
if options.minupper != None:
policygen.minupper = options.minupper
if options.minspecial != None:
policygen.minspecial = options.minspecial
if options.maxdigit != None:
policygen.maxdigits = options.maxdigit
if options.maxlower != None:
policygen.maxlower = options.maxlower
if options.maxupper != None:
policygen.maxupper = options.maxupper
if options.maxspecial != None:
policygen.maxspecial = options.maxspecial
# Misc
if options.pps: policygen.pps = options.pps
if options.showmasks: policygen.showmasks = options.showmasks
if options.pps:
policygen.pps = options.pps
if options.showmasks:
policygen.showmasks = options.showmasks
print
"[*] Using {:,d} keys/sec for calculations.".format(policygen.pps)
print("[*] Using {:,d} keys/sec for calculations.".format(policygen.pps))
# Print current password policy
print
"[*] Password policy:"
print
" Pass Lengths: min:%d max:%d" % (policygen.minlength, policygen.maxlength)
print
" Min strength: l:%s u:%s d:%s s:%s" % (
policygen.minlower, policygen.minupper, policygen.mindigit, policygen.minspecial)
print
" Max strength: l:%s u:%s d:%s s:%s" % (
policygen.maxlower, policygen.maxupper, policygen.maxdigit, policygen.maxspecial)
print("[*] Password policy:")
print(" Pass Lengths: min:%d max:%d" % (
policygen.minlength, policygen.maxlength))
print(" Min strength: l:%s u:%s d:%s s:%s" % (
policygen.minlower, policygen.minupper, policygen.mindigit, policygen.minspecial))
print(" Max strength: l:%s u:%s d:%s s:%s" % (
policygen.maxlower, policygen.maxupper, policygen.maxdigit, policygen.maxspecial))
print
"[*] Generating [%s] masks." % ("compliant" if not options.noncompliant else "non-compliant")
print("[*] Generating [%s] masks." % (
"compliant" if not options.noncompliant else "non-compliant"))
policygen.generate_masks(options.noncompliant)

File diff suppressed because it is too large Load Diff

239
PACK/statsgen.py Normal file → Executable file
View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# StatsGen - Password Statistical Analysis tool
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -11,7 +11,9 @@
# Please see the attached LICENSE file for additional licensing information.
import sys
import re, operator, string
import re
import operator
import string
from optparse import OptionParser, OptionGroup
import time
@@ -73,26 +75,30 @@ class StatsGen:
if letter in string.digits:
digit += 1
advancedmask_string += "?d"
if not simplemask or not simplemask[-1] == 'digit': simplemask.append('digit')
if not simplemask or not simplemask[-1] == 'digit':
simplemask.append('digit')
elif letter in string.lowercase:
elif letter in string.ascii_lowercase:
lower += 1
advancedmask_string += "?l"
if not simplemask or not simplemask[-1] == 'string': simplemask.append('string')
if not simplemask or not simplemask[-1] == 'string':
simplemask.append('string')
elif letter in string.uppercase:
elif letter in string.ascii_uppercase:
upper += 1
advancedmask_string += "?u"
if not simplemask or not simplemask[-1] == 'string': simplemask.append('string')
if not simplemask or not simplemask[-1] == 'string':
simplemask.append('string')
else:
special += 1
advancedmask_string += "?s"
if not simplemask or not simplemask[-1] == 'special': simplemask.append('special')
if not simplemask or not simplemask[-1] == 'special':
simplemask.append('special')
# String representation of masks
simplemask_string = ''.join(simplemask) if len(simplemask) <= 3 else 'othermask'
simplemask_string = ''.join(simplemask) if len(
simplemask) <= 3 else 'othermask'
# Policy
policy = (digit, lower, upper, special)
@@ -136,106 +142,109 @@ class StatsGen:
def generate_stats(self, filename):
""" Generate password statistics. """
f = open(filename, 'r')
with open(filename, 'r') as f:
for password in f:
password = password.rstrip('\r\n')
for password in f:
password = password.rstrip('\r\n')
if len(password) == 0: continue
if len(password) == 0:
continue
self.total_counter += 1
self.total_counter += 1
(pass_length, characterset, simplemask, advancedmask, policy) = self.analyze_password(password)
(digit, lower, upper, special) = policy
(pass_length, characterset, simplemask, advancedmask,
policy) = self.analyze_password(password)
(digit, lower, upper, special) = policy
if (self.charsets == None or characterset in self.charsets) and \
(self.simplemasks == None or simplemask in self.simplemasks) and \
(self.maxlength == None or pass_length <= self.maxlength) and \
(self.minlength == None or pass_length >= self.minlength):
if (self.charsets == None or characterset in self.charsets) and \
(self.simplemasks == None or simplemask in self.simplemasks) and \
(self.maxlength == None or pass_length <= self.maxlength) and \
(self.minlength == None or pass_length >= self.minlength):
self.filter_counter += 1
self.filter_counter += 1
if self.mindigit == None or digit < self.mindigit: self.mindigit = digit
if self.maxdigit == None or digit > self.maxdigit: self.maxdigit = digit
if self.mindigit == None or digit < self.mindigit:
self.mindigit = digit
if self.maxdigit == None or digit > self.maxdigit:
self.maxdigit = digit
if self.minupper == None or upper < self.minupper: self.minupper = upper
if self.maxupper == None or upper > self.maxupper: self.maxupper = upper
if self.minupper == None or upper < self.minupper:
self.minupper = upper
if self.maxupper == None or upper > self.maxupper:
self.maxupper = upper
if self.minlower == None or lower < self.minlower: self.minlower = lower
if self.maxlower == None or lower > self.maxlower: self.maxlower = lower
if self.minlower == None or lower < self.minlower:
self.minlower = lower
if self.maxlower == None or lower > self.maxlower:
self.maxlower = lower
if self.minspecial == None or special < self.minspecial: self.minspecial = special
if self.maxspecial == None or special > self.maxspecial: self.maxspecial = special
if self.minspecial == None or special < self.minspecial:
self.minspecial = special
if self.maxspecial == None or special > self.maxspecial:
self.maxspecial = special
if pass_length in self.stats_length:
self.stats_length[pass_length] += 1
else:
self.stats_length[pass_length] = 1
if pass_length in self.stats_length:
self.stats_length[pass_length] += 1
else:
self.stats_length[pass_length] = 1
if characterset in self.stats_charactersets:
self.stats_charactersets[characterset] += 1
else:
self.stats_charactersets[characterset] = 1
if characterset in self.stats_charactersets:
self.stats_charactersets[characterset] += 1
else:
self.stats_charactersets[characterset] = 1
if simplemask in self.stats_simplemasks:
self.stats_simplemasks[simplemask] += 1
else:
self.stats_simplemasks[simplemask] = 1
if simplemask in self.stats_simplemasks:
self.stats_simplemasks[simplemask] += 1
else:
self.stats_simplemasks[simplemask] = 1
if advancedmask in self.stats_advancedmasks:
self.stats_advancedmasks[advancedmask] += 1
else:
self.stats_advancedmasks[advancedmask] = 1
f.close()
if advancedmask in self.stats_advancedmasks:
self.stats_advancedmasks[advancedmask] += 1
else:
self.stats_advancedmasks[advancedmask] = 1
def print_stats(self):
""" Print password statistics. """
print
"[+] Analyzing %d%% (%d/%d) of passwords" % (
self.filter_counter * 100 / self.total_counter, self.filter_counter, self.total_counter)
print
" NOTE: Statistics below is relative to the number of analyzed passwords, not total number of passwords"
print
"\n[*] Length:"
for (length, count) in sorted(self.stats_length.iteritems(), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count * 100 / self.filter_counter > 0: continue
print
"[+] %25d: %02d%% (%d)" % (length, count * 100 / self.filter_counter, count)
print("[+] Analyzing %d%% (%d/%d) of passwords" % (self.filter_counter *
100/self.total_counter, self.filter_counter, self.total_counter))
print(" NOTE: Statistics below is relative to the number of analyzed passwords, not total number of passwords")
print("\n[*] Length:")
for (length, count) in sorted(iter(self.stats_length.items()), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count*100/self.filter_counter > 0:
continue
print("[+] %25d: %02d%% (%d)" %
(length, count*100/self.filter_counter, count))
print
"\n[*] Character-set:"
for (char, count) in sorted(self.stats_charactersets.iteritems(), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count * 100 / self.filter_counter > 0: continue
print
"[+] %25s: %02d%% (%d)" % (char, count * 100 / self.filter_counter, count)
print("\n[*] Character-set:")
for (char, count) in sorted(iter(self.stats_charactersets.items()), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count*100/self.filter_counter > 0:
continue
print("[+] %25s: %02d%% (%d)" %
(char, count*100/self.filter_counter, count))
print
"\n[*] Password complexity:"
print
"[+] digit: min(%s) max(%s)" % (self.mindigit, self.maxdigit)
print
"[+] lower: min(%s) max(%s)" % (self.minlower, self.maxlower)
print
"[+] upper: min(%s) max(%s)" % (self.minupper, self.maxupper)
print
"[+] special: min(%s) max(%s)" % (self.minspecial, self.maxspecial)
print("\n[*] Password complexity:")
print("[+] digit: min(%s) max(%s)" %
(self.mindigit, self.maxdigit))
print("[+] lower: min(%s) max(%s)" %
(self.minlower, self.maxlower))
print("[+] upper: min(%s) max(%s)" %
(self.minupper, self.maxupper))
print("[+] special: min(%s) max(%s)" %
(self.minspecial, self.maxspecial))
print
"\n[*] Simple Masks:"
for (simplemask, count) in sorted(self.stats_simplemasks.iteritems(), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count * 100 / self.filter_counter > 0: continue
print
"[+] %25s: %02d%% (%d)" % (simplemask, count * 100 / self.filter_counter, count)
print("\n[*] Simple Masks:")
for (simplemask, count) in sorted(iter(self.stats_simplemasks.items()), key=operator.itemgetter(1), reverse=True):
if self.hiderare and not count*100/self.filter_counter > 0:
continue
print("[+] %25s: %02d%% (%d)" %
(simplemask, count*100/self.filter_counter, count))
print
"\n[*] Advanced Masks:"
for (advancedmask, count) in sorted(self.stats_advancedmasks.iteritems(), key=operator.itemgetter(1),
reverse=True):
if count * 100 / self.filter_counter > 0:
print
"[+] %25s: %02d%% (%d)" % (advancedmask, count * 100 / self.filter_counter, count)
print("\n[*] Advanced Masks:")
for (advancedmask, count) in sorted(iter(self.stats_advancedmasks.items()), key=operator.itemgetter(1), reverse=True):
if count*100/self.filter_counter > 0:
print("[+] %25s: %02d%% (%d)" %
(advancedmask, count*100/self.filter_counter, count))
if self.output_file:
self.output_file.write("%s,%d\n" % (advancedmask, count))
# ---- command-line driver (interior of the `if __name__ == "__main__":`
# ---- block; the banner construction starts above this span).
# NOTE(review): a literal diff hunk marker ("@@ -253,49 +262,57 @@") and
# duplicated option/filter definitions from an incomplete Python 2 -> 3
# merge were removed here.  The marker line is a SyntaxError, and
# registering the same option string twice makes optparse raise
# OptionConflictError at startup.
header += " |_| iphelix@thesprawl.org\n"
header += "\n"

parser = OptionParser(
    "%prog [options] passwords.txt\n\nType --help for more options",
    version="%prog " + VERSION)

filters = OptionGroup(parser, "Password Filters")
filters.add_option("--minlength", dest="minlength",
                   type="int", metavar="8", help="Minimum password length")
filters.add_option("--maxlength", dest="maxlength",
                   type="int", metavar="8", help="Maximum password length")
filters.add_option("--charset", dest="charsets",
                   help="Password charset filter (comma separated)",
                   metavar="loweralpha,numeric")
filters.add_option("--simplemask", dest="simplemasks",
                   help="Password mask filter (comma separated)",
                   metavar="stringdigit,allspecial")
parser.add_option_group(filters)

parser.add_option("-o", "--output", dest="output_file",
                  help="Save masks and stats to a file",
                  metavar="password.masks")
parser.add_option("--hiderare", action="store_true", dest="hiderare",
                  default=False,
                  help="Hide statistics covering less than 1% of the sample")
parser.add_option("-q", "--quiet", action="store_true",
                  dest="quiet", default=False, help="Don't show headers.")

(options, args) = parser.parse_args()

# Print program header
if not options.quiet:
    print(header)

if len(args) != 1:
    parser.error("no passwords file specified")
    exit(1)

print("[*] Analyzing passwords in [%s]" % args[0])

statsgen = StatsGen()

# Copy only the filters the user actually supplied onto the generator.
if options.minlength is not None:
    statsgen.minlength = options.minlength
if options.maxlength is not None:
    statsgen.maxlength = options.maxlength
if options.charsets is not None:
    statsgen.charsets = [x.strip() for x in options.charsets.split(',')]
if options.simplemasks is not None:
    statsgen.simplemasks = [x.strip() for x in options.simplemasks.split(',')]
if options.hiderare:
    statsgen.hiderare = options.hiderare

if options.output_file:
    print("[*] Saving advanced masks and occurrences to [%s]" %
          options.output_file)
    statsgen.output_file = open(options.output_file, 'w')

statsgen.generate_stats(args[0])