Mirror of https://github.com/trustedsec/hate_crack.git (synced 2025-12-05 20:39:59 -08:00)
Updated statsgen.py, maskgen.py, and policygen.py to Python3
Deleted policygen.py (too much work)
@@ -1,907 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2011, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant: Access to the enchant spellchecking library
=====================================================

This module provides several classes for performing spell checking
via the Enchant spellchecking library. For more details on Enchant,
visit the project website:

    http://www.abisource.com/enchant/

Spellchecking is performed using 'Dict' objects, which represent
a language dictionary. Their use is best demonstrated by a quick
example::

    >>> import enchant
    >>> d = enchant.Dict("en_US")  # create dictionary for US English
    >>> d.check("enchant")
    True
    >>> d.check("enchnt")
    False
    >>> d.suggest("enchnt")
    ['enchant', 'enchants', 'enchanter', 'penchant', 'incant', 'enchain', 'enchanted']

Languages are identified by standard string tags such as "en" (English)
and "fr" (French). Specific language dialects can be specified by
including an additional code - for example, "en_AU" refers to Australian
English. The latter form is preferred as it is more widely supported.

To check whether a dictionary exists for a given language, the function
'dict_exists' is available. Dictionaries may also be created using the
function 'request_dict'.

A finer degree of control over the dictionaries and how they are created
can be obtained using one or more 'Broker' objects. These objects are
responsible for locating dictionaries for a specific language.

In Python 2.x, unicode strings are supported transparently in the
standard manner - if a unicode string is given as an argument, the
result will be a unicode string. Note that Enchant works in UTF-8
internally, so passing an ASCII string to a dictionary for a language
requiring Unicode may result in UTF-8 strings being returned.

In Python 3.x unicode strings are expected throughout. Bytestrings
should not be passed into any functions.

Errors that occur in this module are reported by raising subclasses
of 'Error'.

"""
_DOC_ERRORS = ['enchnt', 'enchnt', 'fr']

# Make version info available
__ver_major__ = 1
__ver_minor__ = 6
__ver_patch__ = 6
__ver_sub__ = ""
__version__ = "%d.%d.%d%s" % (__ver_major__, __ver_minor__,
                              __ver_patch__, __ver_sub__)

import os

try:
    from enchant import _enchant as _e
except ImportError:
    if not os.environ.get("PYENCHANT_IGNORE_MISSING_LIB", False):
        raise
    _e = None

from enchant.errors import *
from enchant.utils import EnchantStr, get_default_language
from enchant.pypwl import PyPWL

# Due to the unfortunate name collision between the enchant "tokenize" module
# and the stdlib "tokenize" module, certain values of sys.path can cause
# the former to override the latter and break the "warnings" module.
# This hacks around it by making a dummy "warnings" module.
try:
    import warnings
except ImportError:
    class warnings(object):
        def warn(self, *args, **kwds):
            pass

    warnings = warnings()


class ProviderDesc(object):
    """Simple class describing an Enchant provider.

    Each provider has the following information associated with it:

        * name: Internal provider name (e.g. "aspell")
        * desc: Human-readable description (e.g. "Aspell Provider")
        * file: Location of the library containing the provider

    """
    _DOC_ERRORS = ["desc"]

    def __init__(self, name, desc, file):
        self.name = name
        self.desc = desc
        self.file = file

    def __str__(self):
        return "<Enchant: %s>" % self.desc

    def __repr__(self):
        return str(self)

    def __eq__(self, pd):
        """Equality operator on ProviderDesc objects."""
        return (self.name == pd.name and
                self.desc == pd.desc and
                self.file == pd.file)

    def __hash__(self):
        """Hash operator on ProviderDesc objects."""
        return hash(self.name + self.desc + self.file)


class _EnchantObject(object):
    """Base class for enchant objects.

    This class implements some general functionality for interfacing with
    the '_enchant' C-library in a consistent way. All public objects
    from the 'enchant' module are subclasses of this class.

    All enchant objects have an attribute '_this' which contains the
    pointer to the underlying C-library object. The method '_check_this'
    can be called to ensure that this pointer is not None, raising an
    exception if it is.
    """

    def __init__(self):
        """_EnchantObject constructor."""
        self._this = None
        # To be importable when enchant C lib is missing, we need
        # to create a dummy default broker.
        if _e is not None:
            self._init_this()

    def _check_this(self, msg=None):
        """Check that self._this is set to a pointer, rather than None."""
        if msg is None:
            msg = "%s unusable: the underlying C-library object has been freed."
            msg = msg % (self.__class__.__name__,)
        if self._this is None:
            raise Error(msg)

    def _init_this(self):
        """Initialise the underlying C-library object pointer."""
        raise NotImplementedError

    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Raise an exception based on available error messages.

        This method causes an Error to be raised. Subclasses should
        override it to retrieve an error indication from the underlying
        API if possible. If such a message cannot be retrieved, the
        argument value <default> is used. The class of the exception
        can be specified using the argument <eclass>.
        """
        raise eclass(default)

    _raise_error._DOC_ERRORS = ["eclass"]

    def __getstate__(self):
        """Customize pickling of PyEnchant objects.

        Since it's not safe for multiple objects to share the same C-library
        object, we make sure it's unset when pickling.
        """
        state = self.__dict__.copy()
        state["_this"] = None
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._init_this()


class Broker(_EnchantObject):
    """Broker object for the Enchant spellchecker.

    Broker objects are responsible for locating and managing dictionaries.
    Unless custom functionality is required, there is no need to use Broker
    objects directly. The 'enchant' module provides a default broker object
    so that 'Dict' objects can be created directly.

    The most important methods of this class include:

        * dict_exists: check existence of a specific language dictionary
        * request_dict: obtain a dictionary for a specific language
        * set_ordering: specify which dictionaries to try for a
                        given language.

    """

    def __init__(self):
        """Broker object constructor.

        This method is the constructor for the 'Broker' object. No
        arguments are required.
        """
        _EnchantObject.__init__(self)

    def _init_this(self):
        self._this = _e.broker_init()
        if not self._this:
            raise Error("Could not initialise an enchant broker.")

    def __del__(self):
        """Broker object destructor."""
        if _e is not None:
            self._free()

    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Overrides _EnchantObject._raise_error to check broker errors."""
        err = _e.broker_get_error(self._this)
        if err == "" or err is None:
            raise eclass(default)
        raise eclass(err)

    def _free(self):
        """Free system resources associated with a Broker object.

        This method can be called to free the underlying system resources
        associated with a Broker object. It is called automatically when
        the object is garbage collected. If called explicitly, the
        Broker and any associated Dict objects must no longer be used.
        """
        if self._this is not None:
            _e.broker_free(self._this)
            self._this = None

    def request_dict(self, tag=None):
        """Request a Dict object for the language specified by <tag>.

        This method constructs and returns a Dict object for the
        requested language. 'tag' should be a string of the appropriate
        form for specifying a language, such as "fr" (French) or "en_AU"
        (Australian English). The existence of a specific language can
        be tested using the 'dict_exists' method.

        If <tag> is not given or is None, an attempt is made to determine
        the current language in use. If this cannot be determined, Error
        is raised.

        NOTE: this method is functionally equivalent to calling the Dict()
        constructor and passing in the <broker> argument.

        """
        return Dict(tag, self)

    request_dict._DOC_ERRORS = ["fr"]

    def _request_dict_data(self, tag):
        """Request raw C pointer data for a dictionary.

        This method call passes on the call to the C library, and does
        some internal bookkeeping.
        """
        self._check_this()
        tag = EnchantStr(tag)
        new_dict = _e.broker_request_dict(self._this, tag.encode())
        if new_dict is None:
            eStr = "Dictionary for language '%s' could not be found"
            self._raise_error(eStr % (tag,), DictNotFoundError)
        return new_dict

    def request_pwl_dict(self, pwl):
        """Request a Dict object for a personal word list.

        This method behaves as 'request_dict' but rather than returning
        a dictionary for a specific language, it returns a dictionary
        referencing a personal word list. A personal word list is a file
        of custom dictionary entries, one word per line.
        """
        self._check_this()
        pwl = EnchantStr(pwl)
        new_dict = _e.broker_request_pwl_dict(self._this, pwl.encode())
        if new_dict is None:
            eStr = "Personal Word List file '%s' could not be loaded"
            self._raise_error(eStr % (pwl,))
        d = Dict(False)
        d._switch_this(new_dict, self)
        return d

    def _free_dict(self, dict):
        """Free memory associated with a dictionary.

        This method frees system resources associated with a Dict object.
        It is equivalent to calling the object's 'free' method. Once this
        method has been called on a dictionary, it must not be used again.
        """
        self._check_this()
        _e.broker_free_dict(self._this, dict._this)
        dict._this = None
        dict._broker = None

    def dict_exists(self, tag):
        """Check availability of a dictionary.

        This method checks whether there is a dictionary available for
        the language specified by 'tag'. It returns True if a dictionary
        is available, and False otherwise.
        """
        self._check_this()
        tag = EnchantStr(tag)
        val = _e.broker_dict_exists(self._this, tag.encode())
        return bool(val)

    def set_ordering(self, tag, ordering):
        """Set dictionary preferences for a language.

        The Enchant library supports the use of multiple dictionary programs
        and multiple languages. This method specifies which dictionaries
        the broker should prefer when dealing with a given language. 'tag'
        must be an appropriate language specification and 'ordering' is a
        string listing the dictionaries in order of preference. For example
        a valid ordering might be "aspell,myspell,ispell".
        The value of 'tag' can also be set to "*" to set a default ordering
        for all languages for which one has not been set explicitly.
        """
        self._check_this()
        tag = EnchantStr(tag)
        ordering = EnchantStr(ordering)
        _e.broker_set_ordering(self._this, tag.encode(), ordering.encode())

    def describe(self):
        """Return list of provider descriptions.

        This method returns a list of descriptions of each of the
        dictionary providers available. Each entry in the list is a
        ProviderDesc object.
        """
        self._check_this()
        self.__describe_result = []
        _e.broker_describe(self._this, self.__describe_callback)
        return [ProviderDesc(*r) for r in self.__describe_result]

    def __describe_callback(self, name, desc, file):
        """Collector callback for dictionary description.

        This method is used as a callback into the _enchant function
        'enchant_broker_describe'. It collects the given arguments in
        a tuple and appends them to the list '__describe_result'.
        """
        s = EnchantStr("")
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__describe_result.append((name, desc, file))

    def list_dicts(self):
        """Return list of available dictionaries.

        This method returns a list of dictionaries available to the
        broker. Each entry in the list is a two-tuple of the form:

            (tag, provider)

        where <tag> is the language tag for the dictionary and
        <provider> is a ProviderDesc object describing the provider
        through which that dictionary can be obtained.
        """
        self._check_this()
        self.__list_dicts_result = []
        _e.broker_list_dicts(self._this, self.__list_dicts_callback)
        return [(r[0], ProviderDesc(*r[1])) for r in self.__list_dicts_result]

    def __list_dicts_callback(self, tag, name, desc, file):
        """Collector callback for listing dictionaries.

        This method is used as a callback into the _enchant function
        'enchant_broker_list_dicts'. It collects the given arguments into
        an appropriate tuple and appends them to '__list_dicts_result'.
        """
        s = EnchantStr("")
        tag = s.decode(tag)
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__list_dicts_result.append((tag, (name, desc, file)))

    def list_languages(self):
        """List languages for which dictionaries are available.

        This function returns a list of language tags for which a
        dictionary is available.
        """
        langs = []
        for (tag, prov) in self.list_dicts():
            if tag not in langs:
                langs.append(tag)
        return langs

    def __describe_dict(self, dict_data):
        """Get the description tuple for a dict data object.

        <dict_data> must be a C-library pointer to an enchant dictionary.
        The return value is a tuple of the form:

            (<tag>, <name>, <desc>, <file>)
        """
        # Define local callback function
        cb_result = []

        def cb_func(tag, name, desc, file):
            s = EnchantStr("")
            tag = s.decode(tag)
            name = s.decode(name)
            desc = s.decode(desc)
            file = s.decode(file)
            cb_result.append((tag, name, desc, file))

        # Actually call the describer function
        _e.dict_describe(dict_data, cb_func)
        return cb_result[0]

    __describe_dict._DOC_ERRORS = ["desc"]

    def get_param(self, name):
        """Get the value of a named parameter on this broker.

        Parameters are used to provide runtime information to individual
        provider backends. See the method 'set_param' for more details.
        """
        name = EnchantStr(name)
        return name.decode(_e.broker_get_param(self._this, name.encode()))

    get_param._DOC_ERRORS = ["param"]

    def set_param(self, name, value):
        """Set the value of a named parameter on this broker.

        Parameters are used to provide runtime information to individual
        provider backends. For example, the myspell provider will search
        any directories given in the "enchant.myspell.dictionary.path"
        parameter when looking for its dictionary files.
        """
        name = EnchantStr(name)
        value = EnchantStr(value)
        _e.broker_set_param(self._this, name.encode(), value.encode())


class Dict(_EnchantObject):
    """Dictionary object for the Enchant spellchecker.

    Dictionary objects are responsible for checking the spelling of words
    and suggesting possible corrections. Each dictionary is owned by a
    Broker object, but unless a new Broker has explicitly been created
    then this will be the 'enchant' module default Broker and is of little
    interest.

    The important methods of this class include:

        * check():              check whether a word is spelled correctly
        * suggest():            suggest correct spellings for a word
        * add():                add a word to the user's personal dictionary
        * remove():             add a word to the user's personal exclude list
        * add_to_session():     add a word to the current spellcheck session
        * store_replacement():  indicate a replacement for a given word

    Information about the dictionary is available using the following
    attributes:

        * tag:      the language tag of the dictionary
        * provider: a ProviderDesc object for the dictionary provider

    """

    def __init__(self, tag=None, broker=None):
        """Dict object constructor.

        A dictionary belongs to a specific language, identified by the
        string <tag>. If the tag is not given or is None, an attempt to
        determine the language currently in use is made using the 'locale'
        module. If the current language cannot be determined, Error is raised.

        If <tag> is instead given the value of False, a 'dead' Dict object
        is created without any reference to a language. This is typically
        only useful within PyEnchant itself. Any other non-string value
        for <tag> raises Error.

        Each dictionary must also have an associated Broker object which
        obtains the dictionary information from the underlying system. This
        may be specified using <broker>. If not given, the default broker
        is used.
        """
        # Initialise misc object attributes to None
        self.provider = None
        # If no tag was given, use the default language
        if tag is None:
            tag = get_default_language()
            if tag is None:
                err = "No tag specified and default language could not "
                err = err + "be determined."
                raise Error(err)
        self.tag = tag
        # If no broker was given, use the default broker
        if broker is None:
            broker = _broker
        self._broker = broker
        # Now let the superclass initialise the C-library object
        _EnchantObject.__init__(self)

    def _init_this(self):
        # Create dead object if False was given.
        # Otherwise, use the broker to get C-library pointer data.
        self._this = None
        if self.tag:
            this = self._broker._request_dict_data(self.tag)
            self._switch_this(this, self._broker)

    def __del__(self):
        """Dict object destructor."""
        # Calling free() might fail if python is shutting down
        try:
            self._free()
        except AttributeError:
            pass

    def _switch_this(self, this, broker):
        """Switch the underlying C-library pointer for this object.

        As all useful state for a Dict is stored by the underlying C-library
        pointer, it is very convenient to allow this to be switched at
        run-time. Pass a new dict data object into this method to effect
        the necessary changes. The creating Broker object (at the Python
        level) must also be provided.

        This should *never* *ever* be used by application code. It's
        a convenience for developers only, replacing the clunkier <data>
        parameter to __init__ from earlier versions.
        """
        # Free old dict data
        Dict._free(self)
        # Hook in the new stuff
        self._this = this
        self._broker = broker
        # Update object properties
        desc = self.__describe(check_this=False)
        self.tag = desc[0]
        self.provider = ProviderDesc(*desc[1:])

    _switch_this._DOC_ERRORS = ["init"]

    def _check_this(self, msg=None):
        """Extend _EnchantObject._check_this() to check Broker validity.

        It is possible for the managing Broker object to be freed without
        freeing the Dict. Thus validity checking must take into account
        self._broker._this as well as self._this.
        """
        if self._broker is None or self._broker._this is None:
            self._this = None
        _EnchantObject._check_this(self, msg)

    def _raise_error(self, default="Unspecified Error", eclass=Error):
        """Overrides _EnchantObject._raise_error to check dict errors."""
        err = _e.dict_get_error(self._this)
        if err == "" or err is None:
            raise eclass(default)
        raise eclass(err)

    def _free(self):
        """Free the system resources associated with a Dict object.

        This method frees underlying system resources for a Dict object.
        Once it has been called, the Dict object must no longer be used.
        It is called automatically when the object is garbage collected.
        """
        if self._broker is not None and self._this is not None:
            self._broker._free_dict(self)

    def check(self, word):
        """Check spelling of a word.

        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and False otherwise.
        """
        self._check_this()
        word = EnchantStr(word)
        val = _e.dict_check(self._this, word.encode())
        if val == 0:
            return True
        if val > 0:
            return False
        self._raise_error()

    def suggest(self, word):
        """Suggest possible spellings for a word.

        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        self._check_this()
        word = EnchantStr(word)
        suggs = _e.dict_suggest(self._this, word.encode())
        return [word.decode(w) for w in suggs]

    def add(self, word):
        """Add a word to the user's personal word list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add(self._this, word.encode())

    def remove(self, word):
        """Add a word to the user's personal exclude list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_remove(self._this, word.encode())

    def add_to_pwl(self, word):
        """Add a word to the user's personal word list."""
        warnings.warn("Dict.add_to_pwl is deprecated, please use Dict.add",
                      category=DeprecationWarning, stacklevel=2)
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add_to_pwl(self._this, word.encode())

    def add_to_session(self, word):
        """Add a word to the session personal list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_add_to_session(self._this, word.encode())

    def remove_from_session(self, word):
        """Add a word to the session exclude list."""
        self._check_this()
        word = EnchantStr(word)
        _e.dict_remove_from_session(self._this, word.encode())

    def is_added(self, word):
        """Check whether a word is in the personal word list."""
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_added(self._this, word.encode())

    def is_removed(self, word):
        """Check whether a word is in the personal exclude list."""
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_removed(self._this, word.encode())

    def is_in_session(self, word):
        """Check whether a word is in the session list."""
        warnings.warn("Dict.is_in_session is deprecated, "
                      "please use Dict.is_added",
                      category=DeprecationWarning, stacklevel=2)
        self._check_this()
        word = EnchantStr(word)
        return _e.dict_is_in_session(self._this, word.encode())

    def store_replacement(self, mis, cor):
        """Store a replacement spelling for a misspelled word.

        This method makes a suggestion to the spellchecking engine that the
        misspelled word <mis> is in fact correctly spelled as <cor>. Such
        a suggestion will typically mean that <cor> appears early in the
        list of suggested spellings offered for later instances of <mis>.
        """
        if not mis:
            raise ValueError("can't store replacement for an empty string")
        if not cor:
            raise ValueError("can't store empty string as a replacement")
        self._check_this()
        mis = EnchantStr(mis)
        cor = EnchantStr(cor)
        _e.dict_store_replacement(self._this, mis.encode(), cor.encode())

    store_replacement._DOC_ERRORS = ["mis", "mis"]

    def __describe(self, check_this=True):
        """Return a tuple describing the dictionary.

        This method returns a four-element tuple describing the underlying
        spellchecker system providing the dictionary. It will contain the
        following strings:

            * language tag
            * name of dictionary provider
            * description of dictionary provider
            * dictionary file

        Direct use of this method is not recommended - instead, access this
        information through the 'tag' and 'provider' attributes.
        """
        if check_this:
            self._check_this()
        _e.dict_describe(self._this, self.__describe_callback)
        return self.__describe_result

    def __describe_callback(self, tag, name, desc, file):
        """Collector callback for dictionary description.

        This method is used as a callback into the _enchant function
        'enchant_dict_describe'. It collects the given arguments in
        a tuple and stores them in the attribute '__describe_result'.
        """
        s = EnchantStr("")
        tag = s.decode(tag)
        name = s.decode(name)
        desc = s.decode(desc)
        file = s.decode(file)
        self.__describe_result = (tag, name, desc, file)


class DictWithPWL(Dict):
    """Dictionary with separately-managed personal word list.

    NOTE: As of version 1.4.0, enchant manages a per-user pwl and
    exclude list. This class is now only needed if you want
    to explicitly maintain a separate word list in addition to
    the default one.

    This class behaves as the standard Dict class, but also manages a
    personal word list stored in a separate file. The file must be
    specified at creation time by the 'pwl' argument to the constructor.
    Words added to the dictionary are automatically appended to the pwl file.

    A personal exclude list can also be managed, by passing another filename
    to the constructor in the optional 'pel' argument. If this is not given,
    requests to exclude words are ignored.

    If either 'pwl' or 'pel' are None, an in-memory word list is used.
    This will prevent calls to add() and remove() from affecting the user's
    default word lists.

    The Dict object managing the PWL is available as the 'pwl' attribute.
    The Dict object managing the PEL is available as the 'pel' attribute.

    To create a DictWithPWL from the user's default language, use None
    as the 'tag' argument.
    """
    _DOC_ERRORS = ["pel", "pel", "PEL", "pel"]

    def __init__(self, tag, pwl=None, pel=None, broker=None):
        """DictWithPWL constructor.

        The argument 'pwl', if not None, names a file containing the
        personal word list. If this file does not exist, it is created
        with default permissions.

        The argument 'pel', if not None, names a file containing the personal
        exclude list. If this file does not exist, it is created with
        default permissions.
        """
        Dict.__init__(self, tag, broker)
        if pwl is not None:
            if not os.path.exists(pwl):
                f = open(pwl, "wt")
                f.close()
                del f
            self.pwl = self._broker.request_pwl_dict(pwl)
        else:
            self.pwl = PyPWL()
        if pel is not None:
            if not os.path.exists(pel):
                f = open(pel, "wt")
                f.close()
                del f
            self.pel = self._broker.request_pwl_dict(pel)
        else:
            self.pel = PyPWL()

    def _check_this(self, msg=None):
        """Extend Dict._check_this() to check PWL validity."""
        if self.pwl is None:
            self._free()
        if self.pel is None:
            self._free()
        Dict._check_this(self, msg)
        self.pwl._check_this(msg)
        self.pel._check_this(msg)

    def _free(self):
        """Extend Dict._free() to free the PWL as well."""
        if self.pwl is not None:
            self.pwl._free()
            self.pwl = None
        if self.pel is not None:
            self.pel._free()
            self.pel = None
        Dict._free(self)

    def check(self, word):
        """Check spelling of a word.

        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and False otherwise. It checks
        both the dictionary and the personal word list.
        """
        if self.pel.check(word):
            return False
        if self.pwl.check(word):
            return True
        if Dict.check(self, word):
            return True
        return False

    def suggest(self, word):
        """Suggest possible spellings for a word.

        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        suggs = Dict.suggest(self, word)
        suggs.extend([w for w in self.pwl.suggest(word) if w not in suggs])
        for i in range(len(suggs) - 1, -1, -1):
            if self.pel.check(suggs[i]):
                del suggs[i]
        return suggs

    def add(self, word):
        """Add a word to the associated personal word list.

        This method adds the given word to the personal word list, and
        automatically saves the list to disk.
        """
        self._check_this()
        self.pwl.add(word)
        self.pel.remove(word)

    def remove(self, word):
        """Add a word to the associated exclude list."""
        self._check_this()
        self.pwl.remove(word)
        self.pel.add(word)

    def add_to_pwl(self, word):
        """Add a word to the associated personal word list.

        This method adds the given word to the personal word list, and
        automatically saves the list to disk.
        """
        self._check_this()
        self.pwl.add_to_pwl(word)
        self.pel.remove(word)

    def is_added(self, word):
        """Check whether a word is in the personal word list."""
        self._check_this()
        return self.pwl.is_added(word)

    def is_removed(self, word):
        """Check whether a word is in the personal exclude list."""
        self._check_this()
        return self.pel.is_added(word)


## Create a module-level default broker object, and make its important
## methods available at the module level.
_broker = Broker()
request_dict = _broker.request_dict
request_pwl_dict = _broker.request_pwl_dict
dict_exists = _broker.dict_exists
list_dicts = _broker.list_dicts
list_languages = _broker.list_languages
get_param = _broker.get_param
set_param = _broker.set_param
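
# A minimal usage sketch of the module-level convenience API above
# (assumes an "en_US" dictionary is installed on the system):
#
#     import enchant
#     if enchant.dict_exists("en_US"):
#         d = enchant.request_dict("en_US")
#         assert d.check("hello")
#     print(enchant.list_languages())   # e.g. ['en_AU', 'en_US', ...]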

# Expose the "get_version" function.
def get_enchant_version():
    """Get the version string for the underlying enchant library."""
    return _e.get_version()


# Run unit tests when called from command-line
if __name__ == "__main__":
    import sys
    import enchant.tests

    res = enchant.tests.runtestsuite()
    if len(res.errors) > 0 or len(res.failures) > 0:
        sys.exit(1)
    sys.exit(0)

@@ -1,366 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant._enchant: ctypes-based wrapper for enchant C library

This module implements the low-level interface to the underlying
C library for enchant. The interface is based on ctypes and tries
to do as little as possible while making the higher-level components
easier to write.

The following conveniences are provided that differ from the underlying
C API:

    * the "enchant" prefix has been removed from all functions, since
      python has a proper module system
    * callback functions do not take a user_data argument, since
      python has proper closures that can manage this internally
    * string lengths are not passed into functions such as dict_check,
      since python strings know how long they are

"""

import sys, os, os.path
from ctypes import *
from ctypes.util import find_library

from enchant import utils
from enchant.errors import *
from enchant.utils import unicode

# Locate and load the enchant dll.
# We've got several options based on the host platform.

e = None


def _e_path_possibilities():
    """Generator yielding possible locations of the enchant library."""
    yield os.environ.get("PYENCHANT_LIBRARY_PATH")
    yield find_library("enchant")
    yield find_library("libenchant")
    yield find_library("libenchant-1")
    if sys.platform == 'darwin':
        # enchant lib installed by macports
        yield "/opt/local/lib/libenchant.dylib"


# On win32 we ship a bundled version of the enchant DLLs.
# Use them if they're present.
if sys.platform == "win32":
    e_path = None
    try:
        e_path = utils.get_resource_filename("libenchant.dll")
    except (Error, ImportError):
        try:
            e_path = utils.get_resource_filename("libenchant-1.dll")
        except (Error, ImportError):
            pass
    if e_path is not None:
        # We need to use LoadLibraryEx with LOAD_WITH_ALTERED_SEARCH_PATH so
        # that we don't accidentally suck in other versions of e.g. glib.
        if not isinstance(e_path, unicode):
            e_path = unicode(e_path, sys.getfilesystemencoding())
        LoadLibraryEx = windll.kernel32.LoadLibraryExW
        LOAD_WITH_ALTERED_SEARCH_PATH = 0x00000008
        e_handle = LoadLibraryEx(e_path, None, LOAD_WITH_ALTERED_SEARCH_PATH)
        if not e_handle:
            raise WinError()
        e = CDLL(e_path, handle=e_handle)

# On darwin we ship a bundled version of the enchant DLLs.
# Use them if they're present.
if e is None and sys.platform == "darwin":
    try:
        e_path = utils.get_resource_filename("lib/libenchant.1.dylib")
    except (Error, ImportError):
        pass
    else:
        # Enchant doesn't natively support relocatable binaries on OSX.
        # We fake it by patching the enchant source to expose a char**, which
        # we can write the runtime path into ourselves.
        e = CDLL(e_path)
        try:
            e_dir = os.path.dirname(os.path.dirname(e_path))
            prefix_dir = POINTER(c_char_p).in_dll(e, "enchant_prefix_dir_p")
            prefix_dir.contents = c_char_p(e_dir)
        except AttributeError:
            e = None

# Not found yet, search various standard system locations.
if e is None:
    for e_path in _e_path_possibilities():
        if e_path is not None:
            try:
                e = cdll.LoadLibrary(e_path)
            except OSError:
                pass
            else:
                break

# No usable enchant install was found :-(
if e is None:
    raise ImportError("enchant C library not found")


# Define various callback function types

def CALLBACK(restype, *argtypes):
    """Factory for generating callback function prototypes.

    This is factored into a factory so I can easily change the definition
    for experimentation or debugging.
    """
    return CFUNCTYPE(restype, *argtypes)


t_broker_desc_func = CALLBACK(None, c_char_p, c_char_p, c_char_p, c_void_p)
t_dict_desc_func = CALLBACK(None, c_char_p, c_char_p, c_char_p, c_char_p, c_void_p)

# Simple typedefs for readability

t_broker = c_void_p
t_dict = c_void_p

# Now we can define the types of each function we are going to use

broker_init = e.enchant_broker_init
broker_init.argtypes = []
broker_init.restype = t_broker

broker_free = e.enchant_broker_free
broker_free.argtypes = [t_broker]
broker_free.restype = None

broker_request_dict = e.enchant_broker_request_dict
broker_request_dict.argtypes = [t_broker, c_char_p]
broker_request_dict.restype = t_dict

broker_request_pwl_dict = e.enchant_broker_request_pwl_dict
broker_request_pwl_dict.argtypes = [t_broker, c_char_p]
broker_request_pwl_dict.restype = t_dict

broker_free_dict = e.enchant_broker_free_dict
broker_free_dict.argtypes = [t_broker, t_dict]
broker_free_dict.restype = None

broker_dict_exists = e.enchant_broker_dict_exists
broker_dict_exists.argtypes = [t_broker, c_char_p]
broker_dict_exists.restype = c_int

broker_set_ordering = e.enchant_broker_set_ordering
broker_set_ordering.argtypes = [t_broker, c_char_p, c_char_p]
broker_set_ordering.restype = None

broker_get_error = e.enchant_broker_get_error
broker_get_error.argtypes = [t_broker]
broker_get_error.restype = c_char_p

broker_describe1 = e.enchant_broker_describe
broker_describe1.argtypes = [t_broker, t_broker_desc_func, c_void_p]
broker_describe1.restype = None


def broker_describe(broker, cbfunc):
    def cbfunc1(*args):
        # Strip the trailing user_data argument before calling through
        cbfunc(*args[:-1])

    broker_describe1(broker, t_broker_desc_func(cbfunc1), None)


broker_list_dicts1 = e.enchant_broker_list_dicts
broker_list_dicts1.argtypes = [t_broker, t_dict_desc_func, c_void_p]
broker_list_dicts1.restype = None


def broker_list_dicts(broker, cbfunc):
    def cbfunc1(*args):
        cbfunc(*args[:-1])

    broker_list_dicts1(broker, t_dict_desc_func(cbfunc1), None)


try:
    broker_get_param = e.enchant_broker_get_param
except AttributeError:
    # Make the lookup error occur at runtime
    def broker_get_param(broker, param_name):
        return e.enchant_broker_get_param(broker, param_name)
else:
    broker_get_param.argtypes = [t_broker, c_char_p]
    broker_get_param.restype = c_char_p

try:
    broker_set_param = e.enchant_broker_set_param
except AttributeError:
    # Make the lookup error occur at runtime
    def broker_set_param(broker, param_name, param_value):
        return e.enchant_broker_set_param(broker, param_name, param_value)
else:
    broker_set_param.argtypes = [t_broker, c_char_p, c_char_p]
    broker_set_param.restype = None

try:
    get_version = e.enchant_get_version
except AttributeError:
    # Make the lookup error occur at runtime
    def get_version():
        return e.enchant_get_version()
else:
    get_version.argtypes = []
    get_version.restype = c_char_p

dict_check1 = e.enchant_dict_check
dict_check1.argtypes = [t_dict, c_char_p, c_size_t]
dict_check1.restype = c_int


def dict_check(dict, word):
    return dict_check1(dict, word, len(word))


dict_suggest1 = e.enchant_dict_suggest
dict_suggest1.argtypes = [t_dict, c_char_p, c_size_t, POINTER(c_size_t)]
dict_suggest1.restype = POINTER(c_char_p)


def dict_suggest(dict, word):
    numSuggsP = pointer(c_size_t(0))
    suggs_c = dict_suggest1(dict, word, len(word), numSuggsP)
    suggs = []
    n = 0
    while n < numSuggsP.contents.value:
        suggs.append(suggs_c[n])
        n = n + 1
    if numSuggsP.contents.value > 0:
        dict_free_string_list(dict, suggs_c)
    return suggs


dict_add1 = e.enchant_dict_add
dict_add1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add1.restype = None


def dict_add(dict, word):
    return dict_add1(dict, word, len(word))


dict_add_to_pwl1 = e.enchant_dict_add
dict_add_to_pwl1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add_to_pwl1.restype = None


def dict_add_to_pwl(dict, word):
    return dict_add_to_pwl1(dict, word, len(word))


dict_add_to_session1 = e.enchant_dict_add_to_session
dict_add_to_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_add_to_session1.restype = None


def dict_add_to_session(dict, word):
    return dict_add_to_session1(dict, word, len(word))


dict_remove1 = e.enchant_dict_remove
dict_remove1.argtypes = [t_dict, c_char_p, c_size_t]
dict_remove1.restype = None


def dict_remove(dict, word):
    return dict_remove1(dict, word, len(word))


dict_remove_from_session1 = e.enchant_dict_remove_from_session
dict_remove_from_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_remove_from_session1.restype = c_int


def dict_remove_from_session(dict, word):
    return dict_remove_from_session1(dict, word, len(word))


dict_is_added1 = e.enchant_dict_is_added
dict_is_added1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_added1.restype = c_int


def dict_is_added(dict, word):
    return dict_is_added1(dict, word, len(word))


dict_is_removed1 = e.enchant_dict_is_removed
dict_is_removed1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_removed1.restype = c_int


def dict_is_removed(dict, word):
    return dict_is_removed1(dict, word, len(word))


dict_is_in_session1 = e.enchant_dict_is_in_session
dict_is_in_session1.argtypes = [t_dict, c_char_p, c_size_t]
dict_is_in_session1.restype = c_int


def dict_is_in_session(dict, word):
    return dict_is_in_session1(dict, word, len(word))


dict_store_replacement1 = e.enchant_dict_store_replacement
dict_store_replacement1.argtypes = [t_dict, c_char_p, c_size_t, c_char_p, c_size_t]
dict_store_replacement1.restype = None


def dict_store_replacement(dict, mis, cor):
    return dict_store_replacement1(dict, mis, len(mis), cor, len(cor))


dict_free_string_list = e.enchant_dict_free_string_list
dict_free_string_list.argtypes = [t_dict, POINTER(c_char_p)]
dict_free_string_list.restype = None

dict_get_error = e.enchant_dict_get_error
dict_get_error.argtypes = [t_dict]
dict_get_error.restype = c_char_p

dict_describe1 = e.enchant_dict_describe
dict_describe1.argtypes = [t_dict, t_dict_desc_func, c_void_p]
dict_describe1.restype = None


def dict_describe(dict, cbfunc):
    def cbfunc1(tag, name, desc, file, data):
        cbfunc(tag, name, desc, file)

    dict_describe1(dict, t_dict_desc_func(cbfunc1), None)

@@ -1,203 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.checker.CmdLineChecker: Command-Line spell checker

This module provides the class CmdLineChecker, which interactively
spellchecks a piece of text by interacting with the user on the
command line. It can also be run as a script to spellcheck a file.

"""

import sys

from enchant.checker import SpellChecker
from enchant.utils import printf


class CmdLineChecker:
    """A simple command-line spell checker.

    This class implements a simple command-line spell checker. It must
    be given a SpellChecker instance to operate on, and interacts with
    the user by printing instructions on stdout and reading commands from
    stdin.
    """
    _DOC_ERRORS = ["stdout", "stdin"]

    def __init__(self):
        self._stop = False
        self._checker = None

    def set_checker(self, chkr):
        self._checker = chkr

    def get_checker(self):
        return self._checker

    def run(self):
        """Run the spellchecking loop."""
        self._stop = False
        for err in self._checker:
            self.error = err
            printf(["ERROR:", err.word])
            printf(["HOW ABOUT:", err.suggest()])
            status = self.read_command()
            while not status and not self._stop:
                status = self.read_command()
            if self._stop:
                break
        printf(["DONE"])

    def print_help(self):
        printf(["0..N: replace with the numbered suggestion"])
        printf(["R0..RN: always replace with the numbered suggestion"])
        printf(["i: ignore this word"])
        printf(["I: always ignore this word"])
        printf(["a: add word to personal dictionary"])
        printf(["e: edit the word"])
        printf(["q: quit checking"])
        printf(["h: print this help message"])
        printf(["----------------------------------------------------"])
        printf(["HOW ABOUT:", self.error.suggest()])

    def read_command(self):
        cmd = raw_input(">> ")
        cmd = cmd.strip()

        if cmd.isdigit():
            repl = int(cmd)
            suggs = self.error.suggest()
            if repl >= len(suggs):
                printf(["No suggestion number", repl])
                return False
            printf(["Replacing '%s' with '%s'" % (self.error.word, suggs[repl])])
            self.error.replace(suggs[repl])
            return True

        if cmd[0] == "R":
            if not cmd[1:].isdigit():
                printf(["Badly formatted command (try 'help')"])
                return False
            repl = int(cmd[1:])
            suggs = self.error.suggest()
            if repl >= len(suggs):
                printf(["No suggestion number", repl])
                return False
            self.error.replace_always(suggs[repl])
            return True

        if cmd == "i":
            return True

        if cmd == "I":
            self.error.ignore_always()
            return True

        if cmd == "a":
            self.error.add()
            return True

        if cmd == "e":
            repl = raw_input("New Word: ")
            self.error.replace(repl.strip())
            return True

        if cmd == "q":
            self._stop = True
            return True

        if "help".startswith(cmd.lower()):
            self.print_help()
            return False

        printf(["Badly formatted command (try 'help')"])
        return False

    def run_on_file(self, infile, outfile=None, enc=None):
        """Run spellchecking on the named file.

        This method can be used to run the spellchecker over the named file.
        If <outfile> is not given, the corrected contents replace the contents
        of <infile>. If <outfile> is given, the corrected contents will be
        written to that file. Use "-" to have the contents written to stdout.
        If <enc> is given, it specifies the encoding used to read the
        file's contents into a unicode string. The output will be written
        in the same encoding.
        """
        inStr = "".join(file(infile, "r").readlines())
        if enc is not None:
            inStr = inStr.decode(enc)
        self._checker.set_text(inStr)
        self.run()
        outStr = self._checker.get_text()
        if enc is not None:
            outStr = outStr.encode(enc)
        if outfile is None:
            outF = file(infile, "w")
        elif outfile == "-":
            outF = sys.stdout
        else:
            outF = file(outfile, "w")
        outF.write(outStr)
        outF.close()

    run_on_file._DOC_ERRORS = ["outfile", "infile", "outfile", "stdout"]


def _run_as_script():
    """Run the command-line spellchecker as a script.

    This function allows the spellchecker to be invoked from the command-line
    to check spelling in a file.
    """
    # Check necessary command-line options
    from optparse import OptionParser
    op = OptionParser()
    op.add_option("-o", "--output", dest="outfile", metavar="FILE",
                  help="write changes into FILE")
    op.add_option("-l", "--lang", dest="lang", metavar="TAG", default="en_US",
                  help="use language identified by TAG")
    op.add_option("-e", "--encoding", dest="enc", metavar="ENC",
                  help="file is unicode with encoding ENC")
    (opts, args) = op.parse_args()
    # Sanity check
    if len(args) < 1:
        raise ValueError("Must name a file to check")
    if len(args) > 1:
        raise ValueError("Can only check a single file")
    # Create and run the checker
    chkr = SpellChecker(opts.lang)
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)
    cmdln.run_on_file(args[0], opts.outfile, opts.enc)


if __name__ == "__main__":
    _run_as_script()
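
The class above can also be driven programmatically rather than via _run_as_script; a minimal sketch ("draft.txt" is a hypothetical file name):

    from enchant.checker import SpellChecker
    from enchant.checker.CmdLineChecker import CmdLineChecker

    chkr = SpellChecker("en_US")
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)
    cmdln.run_on_file("draft.txt", outfile="-")   # corrected text to stdout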
|
||||
@@ -1,304 +0,0 @@
# GtkSpellCheckerDialog for pyenchant
#
# Copyright (C) 2004-2005, Fredrik Corneliusson
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#

import gtk
import gobject

from enchant.utils import printf, unicode

# columns
COLUMN_SUGGESTION = 0


def create_list_view(col_label):
    # create list widget
    list_ = gtk.ListStore(str)
    list_view = gtk.TreeView(model=list_)

    list_view.set_rules_hint(True)
    list_view.get_selection().set_mode(gtk.SELECTION_SINGLE)
    # Add Columns
    renderer = gtk.CellRendererText()
    renderer.set_data("column", COLUMN_SUGGESTION)
    column = gtk.TreeViewColumn(col_label, renderer, text=COLUMN_SUGGESTION)
    list_view.append_column(column)
    return list_view


class GtkSpellCheckerDialog(gtk.Window):
    def __init__(self, *args, **kwargs):
        gtk.Window.__init__(self, *args, **kwargs)
        self.set_title('Spell check')
        self.set_default_size(350, 200)

        self._checker = None
        self._numContext = 40

        self.errors = None

        # create accel group
        accel_group = gtk.AccelGroup()
        self.add_accel_group(accel_group)

        # list of widgets to disable if there's no spell error left
        self._conditional_widgets = []
        conditional = self._conditional_widgets.append

        # layout
        mainbox = gtk.VBox(spacing=5)
        hbox = gtk.HBox(spacing=5)
        self.add(mainbox)
        mainbox.pack_start(hbox, padding=5)

        box1 = gtk.VBox(spacing=5)
        hbox.pack_start(box1, padding=5)
        conditional(box1)

        # unrecognized word
        text_view_label = gtk.Label('Unrecognized word')
        text_view_label.set_justify(gtk.JUSTIFY_LEFT)
        box1.pack_start(text_view_label, False, False)

        text_view = gtk.TextView()
        text_view.set_wrap_mode(gtk.WRAP_WORD)
        text_view.set_editable(False)
        text_view.set_cursor_visible(False)
        self.error_text = text_view.get_buffer()
        text_buffer = text_view.get_buffer()
        text_buffer.create_tag("fg_black", foreground="black")
        text_buffer.create_tag("fg_red", foreground="red")

        box1.pack_start(text_view)

        # Change to
        change_to_box = gtk.HBox()
        box1.pack_start(change_to_box, False, False)

        change_to_label = gtk.Label('Change to:')
        self.replace_text = gtk.Entry()
        change_to_label.set_justify(gtk.JUSTIFY_LEFT)
        change_to_box.pack_start(change_to_label, False, False)
        change_to_box.pack_start(self.replace_text)

        # scrolled window
        sw = gtk.ScrolledWindow()
        sw.set_shadow_type(gtk.SHADOW_ETCHED_IN)
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        box1.pack_start(sw)

        self.suggestion_list_view = create_list_view('Suggestions')
        self.suggestion_list_view.connect("button_press_event", self._onButtonPress)
        self.suggestion_list_view.connect("cursor-changed", self._onSuggestionChanged)
        sw.add(self.suggestion_list_view)

        # ---Buttons---#000000#FFFFFF----------------------------------------------------
        button_box = gtk.VButtonBox()
        hbox.pack_start(button_box, False, False)

        # Ignore
        button = gtk.Button("Ignore")
        button.connect("clicked", self._onIgnore)
        button.add_accelerator("activate", accel_group,
                               gtk.keysyms.Return, 0, gtk.ACCEL_VISIBLE)
        button_box.pack_start(button)
        conditional(button)

        # Ignore all
        button = gtk.Button("Ignore All")
        button.connect("clicked", self._onIgnoreAll)
        button_box.pack_start(button)
        conditional(button)

        # Replace
        button = gtk.Button("Replace")
        button.connect("clicked", self._onReplace)
        button_box.pack_start(button)
        conditional(button)

        # Replace all
        button = gtk.Button("Replace All")
        button.connect("clicked", self._onReplaceAll)
        button_box.pack_start(button)
        conditional(button)

        # Add button
        button = gtk.Button("_Add")
        button.connect("clicked", self._onAdd)

        button_box.pack_start(button)
        conditional(button)

        # Close button
        button = gtk.Button(stock=gtk.STOCK_CLOSE)
        button.connect("clicked", self._onClose)
        button.add_accelerator("activate", accel_group,
                               gtk.keysyms.Escape, 0, gtk.ACCEL_VISIBLE)
        button_box.pack_end(button)

        # dictionary label
        self._dict_label = gtk.Label('')
        mainbox.pack_start(self._dict_label, False, False, padding=5)

        mainbox.show_all()

    def _onIgnore(self, w, *args):
        printf(["ignore"])
        self._advance()

    def _onIgnoreAll(self, w, *args):
        printf(["ignore all"])
        self._checker.ignore_always()
        self._advance()

    def _onReplace(self, *args):
        printf(["Replace"])
        repl = self._getRepl()
        self._checker.replace(repl)
        self._advance()

    def _onReplaceAll(self, *args):
        printf(["Replace all"])
        repl = self._getRepl()
        self._checker.replace_always(repl)
        self._advance()

    def _onAdd(self, *args):
        """Callback for the "add" button."""
        self._checker.add()
        self._advance()

    def _onClose(self, w, *args):
        self.emit('delete_event', gtk.gdk.Event(gtk.gdk.BUTTON_PRESS))
        return True

    def _onButtonPress(self, widget, event):
        if event.type == gtk.gdk._2BUTTON_PRESS:
            printf(["Double click!"])
            self._onReplace()

    def _onSuggestionChanged(self, widget, *args):
        selection = self.suggestion_list_view.get_selection()
        model, iter = selection.get_selected()
        if iter:
            suggestion = model.get_value(iter, COLUMN_SUGGESTION)
            self.replace_text.set_text(suggestion)

    def _getRepl(self):
        """Get the chosen replacement string."""
        repl = self.replace_text.get_text()
        repl = self._checker.coerce_string(repl)
        return repl

    def _fillSuggestionList(self, suggestions):
        model = self.suggestion_list_view.get_model()
        model.clear()
        for suggestion in suggestions:
            value = unicode("%s" % (suggestion,))
            model.append([value])

    def setSpellChecker(self, checker):
        assert checker, "checker can't be None"
        self._checker = checker
        self._dict_label.set_text('Dictionary: %s' % (checker.dict.tag,))

    def getSpellChecker(self):
        return self._checker

    def updateUI(self):
        self._advance()

    def _disableButtons(self):
        for w in self._conditional_widgets:
            w.set_sensitive(False)

    def _enableButtons(self):
        for w in self._conditional_widgets:
            w.set_sensitive(True)

    def _advance(self):
        """Advance to the next error.

        This method advances the SpellChecker to the next error, if
        any. It then displays the error and some surrounding context,
        as well as listing the suggested replacements.
        """
        # Disable interaction if no checker
        if self._checker is None:
            self._disableButtons()
            self.emit('check-done')
            return

        # Advance to next error, disable if not available
        try:
            self._checker.next()
        except StopIteration:
            self._disableButtons()
            self.error_text.set_text("")
            self._fillSuggestionList([])
            self.replace_text.set_text("")
            return
        self._enableButtons()

        # Display error context with erroneous word in red
        self.error_text.set_text('')
        iter = self.error_text.get_iter_at_offset(0)
        append = self.error_text.insert_with_tags_by_name

        lContext = self._checker.leading_context(self._numContext)
        tContext = self._checker.trailing_context(self._numContext)
        append(iter, lContext, 'fg_black')
        append(iter, self._checker.word, 'fg_red')
        append(iter, tContext, 'fg_black')

        # Display suggestions in the replacements list
        suggs = self._checker.suggest()
        self._fillSuggestionList(suggs)
        if suggs:
            self.replace_text.set_text(suggs[0])
        else:
            self.replace_text.set_text("")


def _test():
    from enchant.checker import SpellChecker
    text = "This is sme text with a fw speling errors in it. Here are a fw more to tst it ut."
    printf(["BEFORE:", text])
    chk_dlg = GtkSpellCheckerDialog()
    chk_dlg.show()
    chk_dlg.connect('delete_event', gtk.main_quit)

    chkr = SpellChecker("en_US", text)

    chk_dlg.setSpellChecker(chkr)
    chk_dlg.updateUI()
    gtk.main()


if __name__ == "__main__":
    _test()
@@ -1,379 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.checker: High-level spellchecking functionality
========================================================

This package is designed to host higher-level spellchecking functionality
than is available in the base enchant package. It should make writing
applications that follow common usage idioms significantly easier.

The most useful class is SpellChecker, which implements a spellchecking
loop over a block of text. It is capable of modifying the text in-place
if given an array of characters to work with.

This package also contains several interfaces to the SpellChecker class,
such as a wxPython GUI dialog and a command-line interface.

"""

import array
import warnings

import enchant
from enchant.errors import *
from enchant.tokenize import get_tokenizer
from enchant.utils import bytes, unicode, basestring, next
from enchant.utils import get_default_language


class SpellChecker:
    """Class implementing stateful spellchecking behaviour.

    This class is designed to implement a spell-checking loop over
    a block of text, correcting/ignoring/replacing words as required.
    This loop is implemented using an iterator paradigm so it can be
    embedded inside other loops of control.

    The SpellChecker object is stateful, and the appropriate methods
    must be called to alter its state and affect the progress of
    the spell checking session. At any point during the checking
    session, the attribute 'word' will hold the current erroneously
    spelled word under consideration. The action to take on this word
    is determined by calling methods such as 'replace', 'replace_always'
    and 'ignore_always'. Once this is done, calling 'next' advances
    to the next misspelled word.

    As a quick (and rather silly) example, the following code replaces
    each misspelled word with the string "SPAM":

        >>> text = "This is sme text with a fw speling errors in it."
        >>> chkr = SpellChecker("en_US",text)
        >>> for err in chkr:
        ...     err.replace("SPAM")
        ...
        >>> chkr.get_text()
        'This is SPAM text with a SPAM SPAM errors in it.'
        >>>

    Internally, the SpellChecker always works with arrays of (possibly
    unicode) character elements. This allows the in-place modification
    of the string as it is checked, and is the closest thing Python has
    to a mutable string. The text can be set as any of a normal string,
    unicode string, character array or unicode character array. The
    'get_text' method will return the modified array object if an
    array is used, or a new string object if a string is used.

    Words input to the SpellChecker may be either plain strings or
    unicode objects. They will be converted to the same type as the
    text being checked, using python's default encoding/decoding
    settings.

    If using an array of characters with this object and the
    array is modified outside of the spellchecking loop, use the
    'set_offset' method to reposition the internal loop pointer
    to make sure it doesn't skip any words.

    """
    _DOC_ERRORS = ["sme", "fw", "speling", "chkr", "chkr", "chkr"]

    def __init__(self, lang=None, text=None, tokenize=None, chunkers=None, filters=None):
        """Constructor for the SpellChecker class.

        SpellChecker objects can be created in two ways, depending on
        the nature of the first argument. If it is a string, it
        specifies a language tag from which a dictionary is created.
        Otherwise, it must be an enchant Dict object to be used.

        Optional keyword arguments are:

            * text: to set the text to be checked at creation time
            * tokenize: a custom tokenization function to use
            * chunkers: a list of chunkers to apply during tokenization
            * filters: a list of filters to apply during tokenization

        If <tokenize> is not given and the first argument is a Dict,
        its 'tag' attribute must be a language tag so that a tokenization
        function can be created automatically. If this attribute is missing
        the user's default language will be used.
        """
        if lang is None:
            lang = get_default_language()
        if isinstance(lang, basestring):
            dict = enchant.Dict(lang)
        else:
            dict = lang
            try:
                lang = dict.tag
            except AttributeError:
                lang = get_default_language()
        if lang is None:
            raise DefaultLanguageNotFoundError
        self.lang = lang
        self.dict = dict
        if tokenize is None:
            try:
                tokenize = get_tokenizer(lang, chunkers, filters)
            except TokenizerNotFoundError:
                # Fall back to default tokenization if no match for 'lang'
                tokenize = get_tokenizer(None, chunkers, filters)
        self._tokenize = tokenize

        self.word = None
        self.wordpos = None
        self._ignore_words = {}
        self._replace_words = {}
        # Default to the empty string as the text to be checked
        self._text = array.array('u')
        self._use_tostring = False
        self._tokens = iter([])

        if text is not None:
            self.set_text(text)

    def __iter__(self):
        """Each SpellChecker object is its own iterator"""
        return self

    def set_text(self, text):
        """Set the text to be spell-checked.

        This method must be called, or the 'text' argument supplied
        to the constructor, before calling the 'next()' method.
        """
        # Convert to an array object if necessary
        if isinstance(text, basestring):
            if type(text) is unicode:
                self._text = array.array('u', text)
            else:
                self._text = array.array('c', text)
            self._use_tostring = True
        else:
            self._text = text
            self._use_tostring = False
        self._tokens = self._tokenize(self._text)

    def get_text(self):
        """Return the spell-checked text."""
        if self._use_tostring:
            return self._array_to_string(self._text)
        return self._text

    def _array_to_string(self, text):
        """Format an internal array as a standard string."""
        if text.typecode == 'u':
            return text.tounicode()
        return text.tostring()

    def wants_unicode(self):
        """Check whether the checker wants unicode strings.

        This method will return True if the checker wants unicode strings
        as input, False if it wants normal strings. It's important to
        provide the correct type of string to the checker.
        """
        if self._text.typecode == 'u':
            return True
        return False

    def coerce_string(self, text, enc=None):
        """Coerce string into the required type.

        This method can be used to automatically ensure that strings
        are of the correct type required by this checker - either unicode
        or standard. If there is a mismatch, conversion is done using
        python's default encoding unless another encoding is specified.
        """
        if self.wants_unicode():
            if not isinstance(text, unicode):
                if enc is None:
                    return text.decode()
                else:
                    return text.decode(enc)
            return text
        if not isinstance(text, bytes):
            if enc is None:
                return text.encode()
            else:
                return text.encode(enc)
        return text
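
    # Hedged example of the coercion above (values illustrative, Python 2
    # semantics): a checker holding unicode text converts byte strings up,
    # while unicode input passes through unchanged.
    #
    #   chkr = SpellChecker("en_US", u"some unicode text")
    #   chkr.coerce_string("word")                     # -> u"word" (default codec)
    #   chkr.coerce_string(u"caf\xe9", enc="utf-8")    # already unicode, unchanged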

    def __next__(self):
        return self.next()

    def next(self):
        """Process text up to the next spelling error.

        This method is designed to support the iterator protocol.
        Each time it is called, it will advance the 'word' attribute
        to the next spelling error in the text. When no more errors
        are found, it will raise StopIteration.

        The method will always return self, so that it can be used
        sensibly in common idioms such as:

            for err in checker:
                err.do_something()

        """
        # Find the next spelling error.
        # The uncaught StopIteration from next(self._tokens)
        # will provide the StopIteration for this method
        while True:
            (word, pos) = next(self._tokens)
            # decode back to a regular string
            word = self._array_to_string(word)
            if self.dict.check(word):
                continue
            if word in self._ignore_words:
                continue
            self.word = word
            self.wordpos = pos
            if word in self._replace_words:
                self.replace(self._replace_words[word])
                continue
            break
        return self

    def replace(self, repl):
        """Replace the current erroneous word with the given string."""
        repl = self.coerce_string(repl)
        aRepl = array.array(self._text.typecode, repl)
        if repl:
            self.dict.store_replacement(self.word, repl)
        self._text[self.wordpos:self.wordpos + len(self.word)] = aRepl
        incr = len(repl) - len(self.word)
        self._tokens.set_offset(self._tokens.offset + incr, replaced=True)

    def replace_always(self, word, repl=None):
        """Always replace given word with given replacement.

        If a single argument is given, this is used to replace the
        current erroneous word. If two arguments are given, that
        combination is added to the list for future use.
        """
        if repl is None:
            repl = word
            word = self.word
        repl = self.coerce_string(repl)
        word = self.coerce_string(word)
        self._replace_words[word] = repl
        if self.word == word:
            self.replace(repl)

    def ignore_always(self, word=None):
        """Add given word to list of words to ignore.

        If no word is given, the current erroneous word is added.
        """
        if word is None:
            word = self.word
        word = self.coerce_string(word)
        if word not in self._ignore_words:
            self._ignore_words[word] = True

    def add_to_personal(self, word=None):
        """Add given word to the personal word list.

        If no word is given, the current erroneous word is added.
        """
        warnings.warn("SpellChecker.add_to_personal is deprecated, "
                      "please use SpellChecker.add",
                      category=DeprecationWarning, stacklevel=2)
        self.add(word)

    def add(self, word=None):
        """Add given word to the personal word list.

        If no word is given, the current erroneous word is added.
        """
        if word is None:
            word = self.word
        self.dict.add(word)

    def suggest(self, word=None):
        """Return suggested spellings for the given word.

        If no word is given, the current erroneous word is used.
        """
        if word is None:
            word = self.word
        suggs = self.dict.suggest(word)
        return suggs

    def check(self, word):
        """Check correctness of the given word."""
        return self.dict.check(word)

    def set_offset(self, off, whence=0):
        """Set the offset of the tokenization routine.

        For more details on the purpose of the tokenization offset,
        see the documentation of the 'enchant.tokenize' module.
        The optional argument whence indicates the method by
        which to change the offset:

            * 0 (the default) treats <off> as an increment
            * 1 treats <off> as a distance from the start
            * 2 treats <off> as a distance from the end
        """
        if whence == 0:
            self._tokens.set_offset(self._tokens.offset + off)
        elif whence == 1:
            assert (off > 0)
            self._tokens.set_offset(off)
        elif whence == 2:
            assert (off > 0)
            self._tokens.set_offset(len(self._text) - 1 - off)
        else:
            raise ValueError("Invalid value for whence: %s" % (whence,))

    def leading_context(self, chars):
        """Get <chars> characters of leading context.

        This method returns up to <chars> characters of leading
        context - the text that occurs in the string immediately
        before the current erroneous word.
        """
        start = max(self.wordpos - chars, 0)
        context = self._text[start:self.wordpos]
        return self._array_to_string(context)

    def trailing_context(self, chars):
        """Get <chars> characters of trailing context.

        This method returns up to <chars> characters of trailing
        context - the text that occurs in the string immediately
        after the current erroneous word.
        """
        start = self.wordpos + len(self.word)
        end = min(start + chars, len(self._text))
        context = self._text[start:end]
        return self._array_to_string(context)
@@ -1,246 +0,0 @@

# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.checker.tests: Unittests for enchant SpellChecker class

"""

import array
import unittest

import enchant
import enchant.tokenize
from enchant.utils import *
from enchant.errors import *
from enchant.checker import *


class TestChecker(unittest.TestCase):
    """TestCases for checking behaviour of SpellChecker class."""

    def test_basic(self):
        """Test a basic run of the SpellChecker class."""
        text = """This is sme text with a few speling erors in it. Its gret
for checking wheather things are working proprly with the SpellChecker
class. Not gret for much elss though."""
        chkr = SpellChecker("en_US", text=text)
        for n, err in enumerate(chkr):
            if n == 0:
                # Fix up "sme" -> "some" properly
                self.assertEqual(err.word, "sme")
                self.assertEqual(err.wordpos, 8)
                self.assertTrue("some" in err.suggest())
                err.replace("some")
            if n == 1:
                # Ignore "speling"
                self.assertEqual(err.word, "speling")
            if n == 2:
                # Check context around "erors", and replace
                self.assertEqual(err.word, "erors")
                self.assertEqual(err.leading_context(5), "ling ")
                self.assertEqual(err.trailing_context(5), " in i")
                err.replace(raw_unicode("errors"))
            if n == 3:
                # Replace-all on gret as it appears twice
                self.assertEqual(err.word, "gret")
                err.replace_always("great")
            if n == 4:
                # First encounter with "wheather", move offset back
                self.assertEqual(err.word, "wheather")
                err.set_offset(-1 * len(err.word))
            if n == 5:
                # Second encounter, fix up "wheather"
                self.assertEqual(err.word, "wheather")
                err.replace("whether")
            if n == 6:
                # Just replace "proprly", but also add an ignore
                # for "SpellChecker"
                self.assertEqual(err.word, "proprly")
                err.replace("properly")
                err.ignore_always("SpellChecker")
            if n == 7:
                # The second "gret" should have been replaced
                # So it's now on "elss"
                self.assertEqual(err.word, "elss")
                err.replace("else")
            if n > 7:
                self.fail("Extraneous spelling errors were found")
        text2 = """This is some text with a few speling errors in it. Its great
for checking whether things are working properly with the SpellChecker
class. Not great for much else though."""
        self.assertEqual(chkr.get_text(), text2)

    def test_filters(self):
        """Test SpellChecker with the 'filters' argument."""
        text = """I contain WikiWords that ShouldBe skipped by the filters"""
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.WikiWordFilter])
        for err in chkr:
            # There are no errors once the WikiWords are skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)

    def test_chunkers(self):
        """Test SpellChecker with the 'chunkers' argument."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped"""
        chkr = SpellChecker("en_US", text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            # There are no errors when the <html> tag is skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)

    def test_chunkers_and_filters(self):
        """Test SpellChecker with the 'chunkers' and 'filters' arguments."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped
along with a <a href='http://example.com/">link to
http://example.com/</a> that should also be skipped"""
        # There are no errors when things are correctly skipped
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.URLFilter],
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(), text)
        # The "html" is an error when not using HTMLChunker
        chkr = SpellChecker("en_US", text=text,
                            filters=[enchant.tokenize.URLFilter])
        for err in chkr:
            self.assertEqual(err.word, "html")
            break
        self.assertEqual(chkr.get_text(), text)
        # The "http" from the URL is an error when not using URLFilter
        chkr = SpellChecker("en_US", text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.assertEqual(err.word, "http")
            break
        self.assertEqual(chkr.get_text(), text)

    def test_unicode(self):
        """Test SpellChecker with a unicode string."""
        text = raw_unicode("""I am a unicode strng with unicode erors.""")
        chkr = SpellChecker("en_US", text)
        for n, err in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, raw_unicode("unicode"))
                self.assertEqual(err.wordpos, 7)
                chkr.ignore_always()
            if n == 1:
                self.assertEqual(err.word, raw_unicode("strng"))
                chkr.replace_always("string")
                self.assertEqual(chkr._replace_words[raw_unicode("strng")], raw_unicode("string"))
            if n == 2:
                self.assertEqual(err.word, raw_unicode("erors"))
                chkr.replace("erros")
                chkr.set_offset(-6)
            if n == 3:
                self.assertEqual(err.word, raw_unicode("erros"))
                chkr.replace("errors")
        self.assertEqual(n, 3)
        self.assertEqual(chkr.get_text(), raw_unicode("I am a unicode string with unicode errors."))

    def test_chararray(self):
        """Test SpellChecker with a character array as input."""
        # Python 3 does not provide 'c' array type
        if str is unicode:
            atype = 'u'
        else:
            atype = 'c'
        text = "I wll be stord in an aray"
        txtarr = array.array(atype, text)
        chkr = SpellChecker("en_US", txtarr)
        for (n, err) in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, "wll")
                self.assertEqual(err.word.__class__, str)
            if n == 1:
                self.assertEqual(err.word, "stord")
                txtarr[err.wordpos:err.wordpos + len(err.word)] = array.array(atype, "stored")
                chkr.set_offset(-1 * len(err.word))
            if n == 2:
                self.assertEqual(err.word, "aray")
                chkr.replace("array")
        self.assertEqual(n, 2)
        if str is unicode:
            self.assertEqual(txtarr.tounicode(), "I wll be stored in an array")
        else:
            self.assertEqual(txtarr.tostring(), "I wll be stored in an array")

    def test_pwl(self):
        """Test checker loop with PWL."""
        from enchant import DictWithPWL
        d = DictWithPWL("en_US", None, None)
        txt = "I am sme text to be cheked with personal list of cheked words"
        chkr = SpellChecker(d, txt)
        for n, err in enumerate(chkr):
            if n == 0:
                self.assertEqual(err.word, "sme")
            if n == 1:
                self.assertEqual(err.word, "cheked")
                chkr.add()
        self.assertEqual(n, 1)

    def test_bug2785373(self):
        """Testcases for bug #2785373."""
        c = SpellChecker(enchant.Dict("en"), "")
        c.set_text("So, one dey when I wes 17, I left.")
        for err in c:
            pass
        c = SpellChecker(enchant.Dict("en"), "")
        c.set_text(raw_unicode("So, one dey when I wes 17, I left."))
        for err in c:
            pass

    def test_default_language(self):
        lang = get_default_language()
        if lang is None:
            self.assertRaises(DefaultLanguageNotFoundError, SpellChecker)
        else:
            checker = SpellChecker()
            self.assertEqual(checker.lang, lang)

    def test_replace_with_shorter_string(self):
        """Testcase for replacing with a shorter string (bug #10)"""
        text = ". I Bezwaar tegen verguning."
        chkr = SpellChecker("en_US", text)
        for i, err in enumerate(chkr):
            err.replace("SPAM")
            assert i < 3
        self.assertEqual(chkr.get_text(), ". I SPAM SPAM SPAM.")

    def test_replace_with_empty_string(self):
        """Testcase for replacing with an empty string (bug #10)"""
        text = ". I Bezwaar tegen verguning."
        chkr = SpellChecker("en_US", text)
        for i, err in enumerate(chkr):
            err.replace("")
            assert i < 3
        self.assertEqual(chkr.get_text(), ". I .")
@@ -1,272 +0,0 @@

# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
# Major code cleanup and re-write thanks to Phil Mayes, 2007
#
"""

enchant.checker.wxSpellCheckerDialog: wxPython spellchecker interface

This module provides the class wxSpellCheckerDialog, which provides
a wxPython dialog that can be used as an interface to a spell checking
session. Currently it is intended as a proof-of-concept and demonstration
class, but it should be suitable for general-purpose use in a program.

The class must be given an enchant.checker.SpellChecker object with
which to operate. It can (in theory...) be used in modal and non-modal
modes. Use Show() when operating on an array of characters as it will
modify the array in place, meaning other work can be done at the same
time. Use ShowModal() when operating on a static string.

"""
_DOC_ERRORS = ["ShowModal"]

import wx

from enchant.utils import printf


class wxSpellCheckerDialog(wx.Dialog):
    """Simple spellcheck dialog for wxPython

    This class implements a simple spellcheck interface for wxPython,
    in the form of a dialog. It's intended mainly as an example of
    how to do this, although it should be useful for applications that
    just need a simple graphical spellchecker.

    To use, a SpellChecker instance must be created and passed to the
    dialog before it is shown:

        >>> dlg = wxSpellCheckerDialog(None,-1,"")
        >>> chkr = SpellChecker("en_AU",text)
        >>> dlg.SetSpellChecker(chkr)
        >>> dlg.Show()

    This is most useful when the text to be checked is in the form of
    a character array, as it will be modified in place as the user
    interacts with the dialog. For checking strings, the final result
    will need to be obtained from the SpellChecker object:

        >>> dlg = wxSpellCheckerDialog(None,-1,"")
        >>> chkr = SpellChecker("en_AU",text)
        >>> dlg.SetSpellChecker(chkr)
        >>> dlg.ShowModal()
        >>> text = dlg.GetSpellChecker().get_text()

    Currently the checker must deal with strings of the same type as
    returned by wxPython - unicode or normal string depending on the
    underlying system. This needs to be fixed, somehow...
    """
    _DOC_ERRORS = ["dlg", "chkr", "dlg", "SetSpellChecker", "chkr", "dlg",
                   "dlg", "chkr", "dlg", "SetSpellChecker", "chkr", "dlg",
                   "ShowModal", "dlg", "GetSpellChecker"]

    # Remember dialog size across invocations by storing it on the class
    sz = (300, 70)

    def __init__(self, parent=None, id=-1, title="Checking Spelling..."):
        wx.Dialog.__init__(self, parent, id, title, size=wxSpellCheckerDialog.sz,
                           style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER)
        self._numContext = 40
        self._checker = None
        self._buttonsEnabled = True
        self.error_text = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH)
        self.replace_text = wx.TextCtrl(self, -1, "", style=wx.TE_PROCESS_ENTER)
        self.replace_list = wx.ListBox(self, -1, style=wx.LB_SINGLE)
        self.InitLayout()
        wx.EVT_LISTBOX(self, self.replace_list.GetId(), self.OnReplSelect)
        wx.EVT_LISTBOX_DCLICK(self, self.replace_list.GetId(), self.OnReplace)

    def InitLayout(self):
        """Lay out controls and add buttons."""
        sizer = wx.BoxSizer(wx.HORIZONTAL)
        txtSizer = wx.BoxSizer(wx.VERTICAL)
        btnSizer = wx.BoxSizer(wx.VERTICAL)
        replaceSizer = wx.BoxSizer(wx.HORIZONTAL)
        txtSizer.Add(wx.StaticText(self, -1, "Unrecognised Word:"), 0, wx.LEFT | wx.TOP, 5)
        txtSizer.Add(self.error_text, 1, wx.ALL | wx.EXPAND, 5)
        replaceSizer.Add(wx.StaticText(self, -1, "Replace with:"), 0, wx.ALL | wx.ALIGN_CENTER_VERTICAL, 5)
        replaceSizer.Add(self.replace_text, 1, wx.ALL | wx.ALIGN_CENTER_VERTICAL, 5)
        txtSizer.Add(replaceSizer, 0, wx.EXPAND, 0)
        txtSizer.Add(self.replace_list, 2, wx.ALL | wx.EXPAND, 5)
        sizer.Add(txtSizer, 1, wx.EXPAND, 0)
        self.buttons = []
        for label, action, tip in (
                ("Ignore", self.OnIgnore, "Ignore this word and continue"),
                ("Ignore All", self.OnIgnoreAll, "Ignore all instances of this word and continue"),
                ("Replace", self.OnReplace, "Replace this word"),
                ("Replace All", self.OnReplaceAll, "Replace all instances of this word"),
                ("Add", self.OnAdd, "Add this word to the dictionary"),
                ("Done", self.OnDone, "Finish spell-checking and accept changes"),
        ):
            btn = wx.Button(self, -1, label)
            btn.SetToolTip(wx.ToolTip(tip))
            btnSizer.Add(btn, 0, wx.ALIGN_RIGHT | wx.ALL, 4)
            btn.Bind(wx.EVT_BUTTON, action)
            self.buttons.append(btn)
        sizer.Add(btnSizer, 0, wx.ALL | wx.EXPAND, 5)
        self.SetAutoLayout(True)
        self.SetSizer(sizer)
        sizer.Fit(self)

    def Advance(self):
        """Advance to the next error.

        This method advances the SpellChecker to the next error, if
        any. It then displays the error and some surrounding context,
        as well as listing the suggested replacements.
        """
        # Disable interaction if no checker
        if self._checker is None:
            self.EnableButtons(False)
            return False
        # Advance to next error, disable if not available
        try:
            self._checker.next()
        except StopIteration:
            self.EnableButtons(False)
            self.error_text.SetValue("")
            self.replace_list.Clear()
            self.replace_text.SetValue("")
            if self.IsModal():  # test needed for SetSpellChecker call
                # auto-exit when checking complete
                self.EndModal(wx.ID_OK)
            return False
        self.EnableButtons()
        # Display error context with erroneous word in red.
        # Restoring default style was misbehaving under win32, so
        # I am forcing the rest of the text to be black.
        self.error_text.SetValue("")
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.BLACK))
        lContext = self._checker.leading_context(self._numContext)
        self.error_text.AppendText(lContext)
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.RED))
        self.error_text.AppendText(self._checker.word)
        self.error_text.SetDefaultStyle(wx.TextAttr(wx.BLACK))
        tContext = self._checker.trailing_context(self._numContext)
        self.error_text.AppendText(tContext)
        # Display suggestions in the replacements list
        suggs = self._checker.suggest()
        self.replace_list.Set(suggs)
        self.replace_text.SetValue(suggs and suggs[0] or '')
        return True

    def EnableButtons(self, state=True):
        """Enable the checking-related buttons"""
        if state != self._buttonsEnabled:
            for btn in self.buttons[:-1]:
                btn.Enable(state)
            self._buttonsEnabled = state

    def GetRepl(self):
        """Get the chosen replacement string."""
        repl = self.replace_text.GetValue()
        return repl

    def OnAdd(self, evt):
        """Callback for the "add" button."""
        self._checker.add()
        self.Advance()

    def OnDone(self, evt):
        """Callback for the "close" button."""
        wxSpellCheckerDialog.sz = self.error_text.GetSizeTuple()
        if self.IsModal():
            self.EndModal(wx.ID_OK)
        else:
            self.Close()

    def OnIgnore(self, evt):
        """Callback for the "ignore" button.

        This simply advances to the next error.
        """
        self.Advance()

    def OnIgnoreAll(self, evt):
        """Callback for the "ignore all" button."""
        self._checker.ignore_always()
        self.Advance()

    def OnReplace(self, evt):
        """Callback for the "replace" button."""
        repl = self.GetRepl()
        if repl:
            self._checker.replace(repl)
        self.Advance()

    def OnReplaceAll(self, evt):
        """Callback for the "replace all" button."""
        repl = self.GetRepl()
        self._checker.replace_always(repl)
        self.Advance()

    def OnReplSelect(self, evt):
        """Callback when a new replacement option is selected."""
        sel = self.replace_list.GetSelection()
        if sel == -1:
            return
        opt = self.replace_list.GetString(sel)
        self.replace_text.SetValue(opt)

    def GetSpellChecker(self):
        """Get the spell checker object."""
        return self._checker

    def SetSpellChecker(self, chkr):
        """Set the spell checker, advancing to the first error.

        Return True if error(s) to correct, else False.
        """
        self._checker = chkr
        return self.Advance()


def _test():
    class TestDialog(wxSpellCheckerDialog):
        def __init__(self, *args):
            wxSpellCheckerDialog.__init__(self, *args)
            wx.EVT_CLOSE(self, self.OnClose)

        def OnClose(self, evnt):
            chkr = dlg.GetSpellChecker()
            if chkr is not None:
                printf(["AFTER:", chkr.get_text()])
            self.Destroy()

    from enchant.checker import SpellChecker
    text = "This is sme text with a fw speling errors in it. Here are a fw more to tst it ut."
    printf(["BEFORE:", text])
    app = wx.PySimpleApp()
    dlg = TestDialog()
    chkr = SpellChecker("en_US", text)
    dlg.SetSpellChecker(chkr)
    dlg.Show()
    app.MainLoop()


if __name__ == "__main__":
    _test()
@@ -1,57 +0,0 @@

# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""
enchant.errors: Error class definitions for the enchant library
================================================================

All error classes are defined in this separate sub-module, so that they
can safely be imported without causing circular dependencies.

"""


class Error(Exception):
    """Base exception class for the enchant module."""
    pass


class DictNotFoundError(Error):
    """Exception raised when a requested dictionary could not be found."""
    pass


class TokenizerNotFoundError(Error):
    """Exception raised when a requested tokenizer could not be found."""
    pass


class DefaultLanguageNotFoundError(Error):
    """Exception raised when a default language could not be found."""
    pass
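
# Illustrative guard (a sketch, not from the original module; "xx_YY" is a
# deliberately bogus language tag): requesting a missing dictionary raises
# DictNotFoundError rather than returning None.
#
#   import enchant
#   try:
#       d = enchant.Dict("xx_YY")
#   except enchant.errors.DictNotFoundError:
#       d = None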
@@ -1,4 +0,0 @@

This directory contains the plugin DLLs for enchant when installed on
a Microsoft Windows system.
@@ -1,285 +0,0 @@

# pyenchant
#
# Copyright (C) 2004-2011 Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

pypwl: pure-python personal word list in the style of Enchant
==============================================================

This module provides a pure-python version of the personal word list
functionality found in the spellchecking package Enchant. While the
same effect can be achieved (with better performance) using the python
bindings for Enchant, it requires a C extension.

This pure-python implementation uses the same algorithm but without any
external dependencies or C code (in fact, it was the author's original
prototype for the C version found in Enchant).

"""

from __future__ import generators

import os
import warnings


class Trie:
    """Class implementing a trie-based dictionary of words.

    A Trie is a recursive data structure storing words by their prefix.
    "Fuzzy matching" can be done by allowing a certain number of missteps
    when traversing the Trie.
    """

    def __init__(self, words=()):
        self._eos = False   # whether I am the end of a word
        self._keys = {}     # letters at this level of the trie
        for w in words:
            self.insert(w)

    def insert(self, word):
        if word == "":
            self._eos = True
        else:
            key = word[0]
            try:
                subtrie = self[key]
            except KeyError:
                subtrie = Trie()
                self[key] = subtrie
            subtrie.insert(word[1:])

    def remove(self, word):
        if word == "":
            self._eos = False
        else:
            key = word[0]
            try:
                subtrie = self[key]
            except KeyError:
                pass
            else:
                subtrie.remove(word[1:])

    def search(self, word, nerrs=0):
        """Search for the given word, possibly making errors.

        This method searches the trie for the given <word>, making
        precisely <nerrs> errors. It returns a list of words found.
        """
        res = []
        # Terminate if we've run out of errors
        if nerrs < 0:
            return res
        # Precise match at the end of the word
        if nerrs == 0 and word == "":
            if self._eos:
                res.append("")
        # Precisely match word[0]
        try:
            subtrie = self[word[0]]
            subres = subtrie.search(word[1:], nerrs)
            for w in subres:
                w2 = word[0] + w
                if w2 not in res:
                    res.append(w2)
        except (IndexError, KeyError):
            pass
        # match with deletion of word[0]
        try:
            subres = self.search(word[1:], nerrs - 1)
            for w in subres:
                if w not in res:
                    res.append(w)
        except (IndexError,):
            pass
        # match with insertion before word[0]
        try:
            for k in self._keys:
                subres = self[k].search(word, nerrs - 1)
                for w in subres:
                    w2 = k + w
                    if w2 not in res:
                        res.append(w2)
        except (IndexError, KeyError):
            pass
        # match on substitution of word[0]
        try:
            for k in self._keys:
                subres = self[k].search(word[1:], nerrs - 1)
                for w in subres:
                    w2 = k + w
                    if w2 not in res:
                        res.append(w2)
        except (IndexError, KeyError):
            pass
        # All done!
        return res

    search._DOC_ERRORS = ["nerrs"]
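
    # A quick sketch of the fuzzy lookup above (the words are illustrative):
    #
    #   t = Trie(["hello", "help"])
    #   t.search("help")        # -> ["help"]             exact match, 0 errors
    #   t.search("helo", 1)     # -> "hello" and "help"   (exactly one edit each)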

    def __getitem__(self, key):
        return self._keys[key]

    def __setitem__(self, key, val):
        self._keys[key] = val

    def __iter__(self):
        if self._eos:
            yield ""
        for k in self._keys:
            for w2 in self._keys[k]:
                yield k + w2


class PyPWL:
    """Pure-python implementation of Personal Word List dictionary.

    This class emulates the PWL objects provided by PyEnchant, but
    implemented purely in python.
    """

    def __init__(self, pwl=None):
        """PyPWL constructor.

        This method takes as its only argument the name of a file
        containing the personal word list, one word per line. Entries
        will be read from this file, and new entries will be written to
        it automatically.

        If <pwl> is not specified or None, the list is maintained in
        memory only.
        """
        self.provider = None
        self._words = Trie()
        if pwl is not None:
            self.pwl = os.path.abspath(pwl)
            self.tag = self.pwl
            pwlF = file(pwl)
            for ln in pwlF:
                word = ln.strip()
                self.add_to_session(word)
            pwlF.close()
        else:
            self.pwl = None
            self.tag = "PyPWL"

    def check(self, word):
        """Check spelling of a word.

        This method takes a word in the dictionary language and returns
        True if it is correctly spelled, and False otherwise.
        """
        res = self._words.search(word)
        return bool(res)

    def suggest(self, word):
        """Suggest possible spellings for a word.

        This method tries to guess the correct spelling for a given
        word, returning the possibilities in a list.
        """
        limit = 10
        maxdepth = 5
        # Iterative deepening until we get enough matches
        depth = 0
        res = self._words.search(word, depth)
        while len(res) < limit and depth < maxdepth:
            depth += 1
            for w in self._words.search(word, depth):
                if w not in res:
                    res.append(w)
        # Limit number of suggestions
        return res[:limit]
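
    # Worked example of the deepening loop above (values illustrative):
    # each pass allows one more edit, so near matches surface before
    # distant ones.
    #
    #   pwl = PyPWL()            # in-memory list, no backing file
    #   pwl.add("hello")         # no pwl file, so session-only
    #   pwl.suggest("helo")      # -> ["hello"], found at depth 1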
|
||||
|
||||
def add(self, word):
|
||||
"""Add a word to the user's personal dictionary.
|
||||
For a PWL, this means appending it to the file.
|
||||
"""
|
||||
if self.pwl is not None:
|
||||
pwlF = file(self.pwl, "a")
|
||||
pwlF.write("%s\n" % (word.strip(),))
|
||||
pwlF.close()
|
||||
self.add_to_session(word)
|
||||
|
||||
def add_to_pwl(self, word):
|
||||
"""Add a word to the user's personal dictionary.
|
||||
For a PWL, this means appending it to the file.
|
||||
"""
|
||||
warnings.warn("PyPWL.add_to_pwl is deprecated, please use PyPWL.add",
|
||||
category=DeprecationWarning, stacklevel=2)
|
||||
self.add(word)
|
||||
|
||||
def remove(self, word):
|
||||
"""Add a word to the user's personal exclude list."""
|
||||
# There's no exclude list for a stand-alone PWL.
|
||||
# Just remove it from the list.
|
||||
self._words.remove(word)
|
||||
if self.pwl is not None:
|
||||
pwlF = file(self.pwl, "wt")
|
||||
for w in self._words:
|
||||
pwlF.write("%s\n" % (w.strip(),))
|
||||
pwlF.close()
|
||||
|
||||
def add_to_session(self, word):
|
||||
"""Add a word to the session list."""
|
||||
self._words.insert(word)
|
||||
|
||||
def is_in_session(self, word):
|
||||
"""Check whether a word is in the session list."""
|
||||
warnings.warn("PyPWL.is_in_session is deprecated, please use PyPWL.is_added", category=DeprecationWarning)
|
||||
# Consider all words to be in the session list
|
||||
return self.check(word)
|
||||
|
||||
def store_replacement(self, mis, cor):
|
||||
"""Store a replacement spelling for a miss-spelled word.
|
||||
|
||||
This method makes a suggestion to the spellchecking engine that the
|
||||
miss-spelled word <mis> is in fact correctly spelled as <cor>. Such
|
||||
a suggestion will typically mean that <cor> appears early in the
|
||||
list of suggested spellings offered for later instances of <mis>.
|
||||
"""
|
||||
# Too much work for this simple spellchecker
|
||||
pass
|
||||
|
||||
store_replacement._DOC_ERRORS = ["mis", "mis"]
|
||||
|
||||
def is_added(self, word):
|
||||
"""Check whether a word is in the personal word list."""
|
||||
return self.check(word)
|
||||
|
||||
def is_removed(self, word):
|
||||
"""Check whether a word is in the personal exclude list."""
|
||||
return False
|
||||
|
||||
    # No-op methods to support internal use as a Dict() replacement

    def _check_this(self, msg):
        pass

    def _free(self):
        pass
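
# A minimal usage sketch (added for illustration; not part of the original
# module), exercising the pure-python PWL the way the test suite does.
if __name__ == "__main__":
    d = PyPWL()
    d.add_to_session("Esquilax")
    print(d.check("Esquilax"))    # expected: True
    print(d.check("Esquilam"))    # expected: False
    # suggest() deepens the trie search iteratively, so a near miss
    # should bring the session word back:
    print(d.suggest("Esquilaxx"))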
@@ -1,4 +0,0 @@
|
||||
|
||||
This directory contains dictionary files for Enchant when installed on a
|
||||
Microsoft Windows system. Each subdirectory contains dictionaries for
|
||||
a particular spellchecking system.
|
||||
@@ -1,3 +0,0 @@
|
||||
|
||||
This directory contains dictionaries for the myspell backend to enchant.
|
||||
|
||||
@@ -1,616 +0,0 @@
|
||||
# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.tests: testcases for pyenchant

"""
import os
import sys
import unittest
import pickle

try:
    import subprocess
except ImportError:
    subprocess = None

import enchant
from enchant import *
from enchant import _enchant as _e
from enchant.utils import unicode, raw_unicode, printf, trim_suggestions


def runcmd(cmd):
    if subprocess is not None:
        kwds = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        p = subprocess.Popen(cmd, **kwds)
        (stdout, stderr) = p.communicate()
        if p.returncode:
            if sys.version_info[0] >= 3:
                stderr = stderr.decode(sys.getdefaultencoding(), "replace")
            sys.stderr.write(stderr)
        return p.returncode
    else:
        return os.system(cmd)


class TestBroker(unittest.TestCase):
    """Test cases for the proper functioning of Broker objects.

    These tests assume that there is at least one working provider
    with a dictionary for the "en_US" language.
    """

    def setUp(self):
        self.broker = Broker()

    def tearDown(self):
        del self.broker

    def test_HasENUS(self):
        """Test that the en_US language is available."""
        self.assertTrue(self.broker.dict_exists("en_US"))

    def test_LangsAreAvail(self):
        """Test whether all advertised languages are in fact available."""
        for lang in self.broker.list_languages():
            if not self.broker.dict_exists(lang):
                assert False, "language '" + lang + "' advertised but non-existent"

    def test_ProvsAreAvail(self):
        """Test whether all advertised providers are in fact available."""
        for (lang, prov) in self.broker.list_dicts():
            self.assertTrue(self.broker.dict_exists(lang))
            if not self.broker.dict_exists(lang):
                assert False, "language '" + lang + "' advertised but non-existent"
            if prov not in self.broker.describe():
                assert False, "provider '" + str(prov) + "' advertised but non-existent"

    def test_ProvOrdering(self):
        """Test that provider ordering works correctly."""
        langs = {}
        provs = []
        # Find the providers for each language, and a list of all providers
        for (tag, prov) in self.broker.list_dicts():
            # Skip hyphenation dictionaries installed by OOo
            if tag.startswith("hyph_") and prov.name == "myspell":
                continue
            # Canonicalize separators
            tag = tag.replace("-", "_")
            langs[tag] = []
            # NOTE: we are excluding Zemberek here as it appears to return
            # a broker for any language, even nonexistent ones
            if prov not in provs and prov.name != "zemberek":
                provs.append(prov)
        for prov in provs:
            for tag in langs:
                b2 = Broker()
                b2.set_ordering(tag, prov.name)
                try:
                    d = b2.request_dict(tag)
                    if d.provider != prov:
                        raise ValueError()
                    langs[tag].append(prov)
                except:
                    pass
        # Check availability using a single entry in the ordering
        for tag in langs:
            for prov in langs[tag]:
                b2 = Broker()
                b2.set_ordering(tag, prov.name)
                d = b2.request_dict(tag)
                self.assertEqual((d.provider, tag), (prov, tag))
                del d
                del b2
        # Place providers that don't have the language in the ordering
        for tag in langs:
            for prov in langs[tag]:
                order = prov.name
                for prov2 in provs:
                    if prov2 not in langs[tag]:
                        order = prov2.name + "," + order
                b2 = Broker()
                b2.set_ordering(tag, order)
                d = b2.request_dict(tag)
                self.assertEqual((d.provider, tag, order), (prov, tag, order))
                del d
                del b2

    def test_UnicodeTag(self):
        """Test that unicode language tags are accepted."""
        d1 = self.broker._request_dict_data(raw_unicode("en_US"))
        self.assertTrue(d1)
        _e.broker_free_dict(self.broker._this, d1)
        d1 = Dict(raw_unicode("en_US"))
        self.assertTrue(d1)

    def test_GetSetParam(self):
        try:
            self.broker.get_param("pyenchant.unittest")
        except AttributeError:
            return
        self.assertEqual(self.broker.get_param("pyenchant.unittest"), None)
        self.broker.set_param("pyenchant.unittest", "testing")
        self.assertEqual(self.broker.get_param("pyenchant.unittest"), "testing")
        self.assertEqual(Broker().get_param("pyenchant.unittest"), None)


class TestDict(unittest.TestCase):
    """Test cases for the proper functioning of Dict objects.

    These tests assume that there is at least one working provider
    with a dictionary for the "en_US" language.
    """

    def setUp(self):
        self.dict = Dict("en_US")

    def tearDown(self):
        del self.dict

    def test_HasENUS(self):
        """Test that the en_US language is available through the default broker."""
        self.assertTrue(dict_exists("en_US"))

    def test_check(self):
        """Test that check() works on some common words."""
        self.assertTrue(self.dict.check("hello"))
        self.assertTrue(self.dict.check("test"))
        self.assertFalse(self.dict.check("helo"))
        self.assertFalse(self.dict.check("testt"))

    def test_broker(self):
        """Test that the dict's broker is set correctly."""
        self.assertTrue(self.dict._broker is enchant._broker)

    def test_tag(self):
        """Test that the dict's tag is set correctly."""
        self.assertEqual(self.dict.tag, "en_US")

    def test_suggest(self):
        """Test that suggest() gets simple suggestions right."""
        self.assertTrue(self.dict.check("hello"))
        self.assertTrue("hello" in self.dict.suggest("helo"))

    def test_suggestHang1(self):
        """Test whether suggest() hangs on some inputs (Bug #1404196)."""
        self.assertTrue(len(self.dict.suggest("Thiis")) >= 0)
        self.assertTrue(len(self.dict.suggest("Thiiis")) >= 0)
        self.assertTrue(len(self.dict.suggest("Thiiiis")) >= 0)

    def test_unicode1(self):
        """Test checking/suggesting for unicode strings."""
        # TODO: find something that actually returns suggestions
        us1 = raw_unicode(r"he\u2149lo")
        self.assertTrue(type(us1) is unicode)
        self.assertFalse(self.dict.check(us1))
        for s in self.dict.suggest(us1):
            self.assertTrue(type(s) is unicode)

    def test_session(self):
        """Test that adding words to the session works as required."""
        self.assertFalse(self.dict.check("Lozz"))
        self.assertFalse(self.dict.is_added("Lozz"))
        self.dict.add_to_session("Lozz")
        self.assertTrue(self.dict.is_added("Lozz"))
        self.assertTrue(self.dict.check("Lozz"))
        self.dict.remove_from_session("Lozz")
        self.assertFalse(self.dict.check("Lozz"))
        self.assertFalse(self.dict.is_added("Lozz"))
        self.dict.remove_from_session("hello")
        self.assertFalse(self.dict.check("hello"))
        self.assertTrue(self.dict.is_removed("hello"))
        self.dict.add_to_session("hello")

    def test_AddRemove(self):
        """Test adding/removing from the default user dictionary."""
        nonsense = "kxhjsddsi"
        self.assertFalse(self.dict.check(nonsense))
        self.dict.add(nonsense)
        self.assertTrue(self.dict.is_added(nonsense))
        self.assertTrue(self.dict.check(nonsense))
        self.dict.remove(nonsense)
        self.assertFalse(self.dict.is_added(nonsense))
        self.assertFalse(self.dict.check(nonsense))
        self.dict.remove("pineapple")
        self.assertFalse(self.dict.check("pineapple"))
        self.assertTrue(self.dict.is_removed("pineapple"))
        self.assertFalse(self.dict.is_added("pineapple"))
        self.dict.add("pineapple")
        self.assertTrue(self.dict.check("pineapple"))

    def test_DefaultLang(self):
        """Test behaviour of default language selection."""
        defLang = utils.get_default_language()
        if defLang is None:
            # If there is no default language, this shouldn't work
            self.assertRaises(Error, Dict)
        else:
            # If there is a default language, it should be used.
            # Of course, there's no need for the dict to actually exist.
            try:
                d = Dict()
                self.assertEqual(d.tag, defLang)
            except DictNotFoundError:
                pass

    def test_pickling(self):
        """Test that pickling doesn't corrupt internal state."""
        d1 = Dict("en")
        self.assertTrue(d1.check("hello"))
        d2 = pickle.loads(pickle.dumps(d1))
        self.assertTrue(d1.check("hello"))
        self.assertTrue(d2.check("hello"))
        d1._free()
        self.assertTrue(d2.check("hello"))


class TestPWL(unittest.TestCase):
    """Test cases for the proper functioning of PWLs and DictWithPWL objects.

    These tests assume that there is at least one working provider
    with a dictionary for the "en_US" language.
    """

    def setUp(self):
        self._tempDir = self._mkdtemp()
        self._fileName = "pwl.txt"

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()

    def _path(self, nm=None):
        if nm is None:
            nm = self._fileName
        nm = os.path.join(self._tempDir, nm)
        if not os.path.exists(nm):
            open(nm, 'w').close()
        return nm

    def setPWLContents(self, contents):
        """Set the contents of the PWL file."""
        pwlFile = open(self._path(), "w")
        for ln in contents:
            pwlFile.write(ln)
            pwlFile.write("\n")
        pwlFile.flush()
        pwlFile.close()

    def getPWLContents(self):
        """Retrieve the contents of the PWL file."""
        pwlFile = open(self._path(), "r")
        contents = pwlFile.readlines()
        pwlFile.close()
        return [c.strip() for c in contents]

    def test_check(self):
        """Test that basic checking works for PWLs."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = request_pwl_dict(self._path())
        self.assertTrue(d.check("Sazz"))
        self.assertTrue(d.check("Lozz"))
        self.assertFalse(d.check("hello"))

    def test_UnicodeFN(self):
        """Test that unicode PWL filenames are accepted."""
        d = request_pwl_dict(unicode(self._path()))
        self.assertTrue(d)

    def test_add(self):
        """Test that adding words to a PWL works correctly."""
        d = request_pwl_dict(self._path())
        self.assertFalse(d.check("Flagen"))
        d.add("Esquilax")
        d.add("Esquilam")
        self.assertTrue(d.check("Esquilax"))
        self.assertTrue("Esquilax" in self.getPWLContents())
        self.assertTrue(d.is_added("Esquilax"))

    def test_suggestions(self):
        """Test getting suggestions from a PWL."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = request_pwl_dict(self._path())
        self.assertTrue("Sazz" in d.suggest("Saz"))
        self.assertTrue("Lozz" in d.suggest("laz"))
        self.assertTrue("Sazz" in d.suggest("laz"))
        d.add("Flagen")
        self.assertTrue("Flagen" in d.suggest("Flags"))
        self.assertFalse("sazz" in d.suggest("Flags"))

    def test_DWPWL(self):
        """Test functionality of DictWithPWL."""
        self.setPWLContents(["Sazz", "Lozz"])
        d = DictWithPWL("en_US", self._path(), self._path("pel.txt"))
        self.assertTrue(d.check("Sazz"))
        self.assertTrue(d.check("Lozz"))
        self.assertTrue(d.check("hello"))
        self.assertFalse(d.check("helo"))
        self.assertFalse(d.check("Flagen"))
        d.add("Flagen")
        self.assertTrue(d.check("Flagen"))
        self.assertTrue("Flagen" in self.getPWLContents())
        self.assertTrue("Flagen" in d.suggest("Flagn"))
        self.assertTrue("hello" in d.suggest("helo"))
        d.remove("hello")
        self.assertFalse(d.check("hello"))
        self.assertTrue("hello" not in d.suggest("helo"))
        d.remove("Lozz")
        self.assertFalse(d.check("Lozz"))

    def test_DWPWL_empty(self):
        """Test functionality of DictWithPWL using transient dicts."""
        d = DictWithPWL("en_US", None, None)
        self.assertTrue(d.check("hello"))
        self.assertFalse(d.check("helo"))
        self.assertFalse(d.check("Flagen"))
        d.add("Flagen")
        self.assertTrue(d.check("Flagen"))
        d.remove("hello")
        self.assertFalse(d.check("hello"))
        d.add("hello")
        self.assertTrue(d.check("hello"))

    def test_PyPWL(self):
        """Test our pure-python PWL implementation."""
        d = PyPWL()
        self.assertTrue(list(d._words) == [])
        d.add("hello")
        d.add("there")
        d.add("duck")
        ws = list(d._words)
        self.assertTrue(len(ws) == 3)
        self.assertTrue("hello" in ws)
        self.assertTrue("there" in ws)
        self.assertTrue("duck" in ws)
        d.remove("duck")
        d.remove("notinthere")
        ws = list(d._words)
        self.assertTrue(len(ws) == 2)
        self.assertTrue("hello" in ws)
        self.assertTrue("there" in ws)

    def test_UnicodeCharsInPath(self):
        """Test that unicode chars in PWL paths are accepted."""
        self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
        d = request_pwl_dict(self._path())
        self.assertTrue(d)


class TestUtils(unittest.TestCase):
    """Test cases for various utility functions."""

    def test_trim_suggestions(self):
        word = "gud"
        suggs = ["good", "god", "bad+"]
        self.assertEquals(trim_suggestions(word, suggs, 40), ["god", "good", "bad+"])
        self.assertEquals(trim_suggestions(word, suggs, 4), ["god", "good", "bad+"])
        self.assertEquals(trim_suggestions(word, suggs, 3), ["god", "good", "bad+"])
        self.assertEquals(trim_suggestions(word, suggs, 2), ["god", "good"])
        self.assertEquals(trim_suggestions(word, suggs, 1), ["god"])
        self.assertEquals(trim_suggestions(word, suggs, 0), [])


class TestDocStrings(unittest.TestCase):
    """Test the spelling on all docstrings we can find in this module.

    This serves two purposes - to provide a lot of test data for the
    checker routines, and to make sure we don't suffer the embarrassment
    of having spelling errors in a spellchecking package!
    """

    WORDS = ["spellchecking", "utf", "dict", "unicode", "bytestring", "bytestrings",
             "str", "pyenchant", "ascii", "utils", "setup", "distutils", "pkg",
             "filename", "tokenization", "tuple", "tuples", "tokenizer",
             "tokenizers", "testcase", "testcases", "whitespace", "wxpython",
             "spellchecker", "dialog", "urls", "wikiwords", "enchantobject",
             "providerdesc", "spellcheck", "pwl", "aspell", "myspell",
             "docstring", "docstrings", "stopiteration", "pwls", "pypwl",
             "dictwithpwl", "skippable", "dicts", "dict's", "filenames",
             "trie", "api", "ctypes", "wxspellcheckerdialog", "stateful",
             "cmdlinechecker", "spellchecks", "callback", "clunkier", "iterator",
             "ispell", "cor", "backends"]

    def test_docstrings(self):
        """Test that all our docstrings are error-free."""
        import enchant
        import enchant.utils
        import enchant.pypwl
        import enchant.tokenize
        import enchant.tokenize.en
        import enchant.checker
        import enchant.checker.CmdLineChecker
        try:
            import enchant.checker.GtkSpellCheckerDialog
        except ImportError:
            pass
        try:
            import enchant.checker.wxSpellCheckerDialog
        except ImportError:
            pass
        errors = []
        # Naive recursion here would blow the stack; instead we
        # simulate it with our own stack.
        tocheck = [enchant]
        checked = []
        while tocheck:
            obj = tocheck.pop()
            checked.append(obj)
            newobjs = list(self._check_docstrings(obj, errors))
            tocheck.extend([obj for obj in newobjs if obj not in checked])
        self.assertEqual(len(errors), 0)

    def _check_docstrings(self, obj, errors):
        import enchant
        if hasattr(obj, "__doc__"):
            skip_errors = [w for w in getattr(obj, "_DOC_ERRORS", [])]
            chkr = enchant.checker.SpellChecker("en_AU", obj.__doc__, filters=[enchant.tokenize.URLFilter])
            for err in chkr:
                if len(err.word) == 1:
                    continue
                if err.word.lower() in self.WORDS:
                    continue
                if skip_errors and skip_errors[0] == err.word:
                    skip_errors.pop(0)
                    continue
                errors.append((obj, err.word, err.wordpos))
                msg = "\nDOCSTRING SPELLING ERROR: %s %s %d %s\n" % (obj, err.word, err.wordpos, chkr.suggest())
                printf([msg], file=sys.stderr)
        # Find and yield all child objects that should be checked
        for name in dir(obj):
            if name.startswith("__"):
                continue
            child = getattr(obj, name)
            if hasattr(child, "__file__"):
                if not hasattr(globals(), "__file__"):
                    continue
                if not child.__file__.startswith(os.path.dirname(__file__)):
                    continue
            else:
                cmod = getattr(child, "__module__", None)
                if not cmod:
                    cclass = getattr(child, "__class__", None)
                    cmod = getattr(cclass, "__module__", None)
                if cmod and not cmod.startswith("enchant"):
                    continue
            yield child


class TestInstallEnv(unittest.TestCase):
    """Run all testcases in a variety of install environments."""

    def setUp(self):
        self._tempDir = self._mkdtemp()
        self._insDir = "build"

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()

    def install(self):
        import os, sys, shutil
        insdir = os.path.join(self._tempDir, self._insDir)
        os.makedirs(insdir)
        shutil.copytree("enchant", os.path.join(insdir, "enchant"))

    def runtests(self):
        import os, sys
        insdir = os.path.join(self._tempDir, self._insDir)
        if str is not unicode and isinstance(insdir, unicode):
            insdir = insdir.encode(sys.getfilesystemencoding())
        os.environ["PYTHONPATH"] = insdir
        script = os.path.join(insdir, "enchant", "__init__.py")
        res = runcmd("\"%s\" %s" % (sys.executable, script,))
        self.assertEqual(res, 0)

    def test_basic(self):
        """Test proper functioning of the TestInstallEnv suite."""
        self.install()
        self.runtests()

    test_basic._DOC_ERRORS = ["TestInstallEnv"]

    def test_UnicodeInstallPath(self):
        """Test installation in a path containing unicode chars."""
        self._insDir = raw_unicode(r'test_\xe5\xe4\xf6_ing')
        self.install()
        self.runtests()


class TestPy2exe(unittest.TestCase):
    """Run all testcases inside a py2exe executable."""
    _DOC_ERRORS = ["py", "exe"]

    def setUp(self):
        self._tempDir = self._mkdtemp()

    def tearDown(self):
        import shutil
        shutil.rmtree(self._tempDir)

    def test_py2exe(self):
        """Test pyenchant running inside a py2exe executable."""
        import os, sys, shutil
        from os import path
        from os.path import dirname
        try:
            import py2exe
        except ImportError:
            return
        os.environ["PYTHONPATH"] = dirname(dirname(__file__))
        setup_py = path.join(dirname(__file__), "..", "tools", "setup.py2exe.py")
        if not path.exists(setup_py):
            return
        buildCmd = '%s %s -q py2exe --dist-dir="%s"'
        buildCmd = buildCmd % (sys.executable, setup_py, self._tempDir)
        res = runcmd(buildCmd)
        self.assertEqual(res, 0)
        testCmd = self._tempDir + "\\test_pyenchant.exe"
        self.assertTrue(os.path.exists(testCmd))
        res = runcmd(testCmd)
        self.assertEqual(res, 0)

    test_py2exe._DOC_ERRORS = ["py", "exe"]

    def _mkdtemp(self):
        import tempfile
        return tempfile.mkdtemp()


def buildtestsuite(recurse=True):
    from enchant.checker.tests import TestChecker
    from enchant.tokenize.tests import TestTokenization, TestFilters
    from enchant.tokenize.tests import TestTokenizeEN
    suite = unittest.TestSuite()
    if recurse:
        suite.addTest(unittest.makeSuite(TestInstallEnv))
        suite.addTest(unittest.makeSuite(TestPy2exe))
    suite.addTest(unittest.makeSuite(TestBroker))
    suite.addTest(unittest.makeSuite(TestDict))
    suite.addTest(unittest.makeSuite(TestPWL))
    suite.addTest(unittest.makeSuite(TestUtils))
    suite.addTest(unittest.makeSuite(TestDocStrings))
    suite.addTest(unittest.makeSuite(TestChecker))
    suite.addTest(unittest.makeSuite(TestTokenization))
    suite.addTest(unittest.makeSuite(TestTokenizeEN))
    suite.addTest(unittest.makeSuite(TestFilters))
    return suite


def runtestsuite(recurse=False):
    return unittest.TextTestRunner(verbosity=0).run(buildtestsuite(recurse=recurse))
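
# A minimal sketch (added for illustration; not in the original file) for
# running the suite directly. Assumes an Enchant provider with an "en_US"
# dictionary is installed, as the testcases above require.
if __name__ == "__main__":
    runtestsuite(recurse=False)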
@@ -1,536 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2009, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.tokenize: String tokenization functions for PyEnchant
================================================================

An important task in spellchecking is breaking up large bodies of
text into their constituent words, each of which is then checked
for correctness. This package provides Python functions to split
strings into words according to the rules of a particular language.

Each tokenization function accepts a string as its only positional
argument, and returns an iterator that yields tuples of the following
form, one for each word found::

    (<word>,<pos>)

The meanings of these fields should be clear: <word> is the word
that was found and <pos> is the position within the text at which
the word began (zero indexed, of course). The function will work
on any string-like object that supports array-slicing; in particular
character-array objects from the 'array' module may be used.

The iterator also provides the attribute 'offset' which gives the current
position of the tokenizer inside the string being split, and the method
'set_offset' for manually adjusting this position. This can be used for
example if the string's contents have changed during the tokenization
process.

To obtain an appropriate tokenization function for the language
identified by <tag>, use the function 'get_tokenizer(tag)'::

    tknzr = get_tokenizer("en_US")
    for (word,pos) in tknzr("text to be tokenized goes here"):
        do_something(word)

This library is designed to be easily extendible by third-party
authors. To register a tokenization function for the language
<tag>, implement it as the function 'tokenize' within the
module enchant.tokenize.<tag>. The 'get_tokenizer' function
will automatically detect it. Note that the underscore must be
used as the tag component separator in this case, in order to
form a valid python module name. (e.g. "en_US" rather than "en-US")

Currently, a tokenizer has only been implemented for the English
language. Based on the author's limited experience, this should
be at least partially suitable for other languages.

This module also provides various implementations of "Chunkers" and
"Filters". These classes are designed to make it easy to work with
text in a variety of common formats, by detecting and excluding parts
of the text that don't need to be checked.

A Chunker is a class designed to break a body of text into large chunks
of checkable content; for example the HTMLChunker class extracts the
text content from all HTML tags but excludes the tags themselves.
A Filter is a class designed to skip individual words during the checking
process; for example the URLFilter class skips over any words that
have the format of a URL.

For example, to spellcheck an HTML document it is necessary to split the
text into chunks based on HTML tags, and to filter out common word forms
such as URLs and WikiWords. This would look something like the following::

    tknzr = get_tokenizer("en_US", (HTMLChunker,), (URLFilter, WikiWordFilter))

    text = "<html><body>the url is http://example.com</body></html>"
    for (word,pos) in tknzr(text):
        ...check each word and react accordingly...

"""
_DOC_ERRORS = ["pos", "pos", "tknzr", "URLFilter", "WikiWordFilter",
               "tkns", "tknzr", "pos", "tkns"]

import re
import warnings

import enchant
from enchant.utils import next, xrange
from enchant.errors import *

# For backwards-compatibility. This will eventually be removed, but how
# does one mark a module-level constant as deprecated?
Error = TokenizerNotFoundError


class tokenize:
    """Base class for all tokenizer objects.

    Each tokenizer must be an iterator and provide the 'offset'
    attribute as described in the documentation for this module.

    While tokenizers are in fact classes, they should be treated
    like functions, and so are named using lower_case rather than
    the CamelCase more traditional of class names.
    """
    _DOC_ERRORS = ["CamelCase"]

    def __init__(self, text):
        self._text = text
        self._offset = 0

    def __next__(self):
        return self.next()

    def next(self):
        raise NotImplementedError()

    def __iter__(self):
        return self

    def set_offset(self, offset, replaced=False):
        self._offset = offset

    def _get_offset(self):
        return self._offset

    def _set_offset(self, offset):
        msg = "changing a tokenizer's 'offset' attribute is deprecated;" \
              " use the 'set_offset' method"
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
        self.set_offset(offset)

    offset = property(_get_offset, _set_offset)


def get_tokenizer(tag=None, chunkers=None, filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an appropriate
    module. Modules tried are named after the language tag, tried in the
    following order:
        * the entire tag (e.g. "en_AU.py")
        * the base language code of the tag (e.g. "en.py")

    If the language tag is None, a default tokenizer (actually the English
    one) is returned. It's unicode aware and should work OK for most
    latin-derived languages.

    If a suitable function cannot be found, raises TokenizerNotFoundError.

    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively. These will be applied to the
    tokenizer during creation.
    """
    if tag is None:
        tag = "en"
    # "filters" used to be the second argument. Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0], Filter)
            except TypeError:
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument " \
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' is used as the separator
    tag = tag.replace("-", "_")
    # First try the whole tag
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Try just the base
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag,)
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, we now build up the
    # end result as follows:
    #    * chunk the text using any given chunkers in turn
    #    * begin with basic whitespace tokenization
    #    * apply each of the given filters in turn
    #    * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        for i in xrange(len(chunkers) - 1, -1, -1):
            tokenizer = wrap_tokenizer(chunkers[i], tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer, tkFunc)
    return tokenizer


get_tokenizer._DOC_ERRORS = ["py", "py"]


class empty_tokenize(tokenize):
    """Tokenizer class that yields no elements."""
    _DOC_ERRORS = []

    def __init__(self):
        tokenize.__init__(self, "")

    def next(self):
        raise StopIteration()


class unit_tokenize(tokenize):
    """Tokenizer class that yields the text as a single token."""
    _DOC_ERRORS = []

    def __init__(self, text):
        tokenize.__init__(self, text)
        self._done = False

    def next(self):
        if self._done:
            raise StopIteration()
        self._done = True
        return (self._text, 0)


class basic_tokenize(tokenize):
    """Tokenizer class that performs very basic word-finding.

    This tokenizer does the most basic thing that could work - it splits
    text into words based on whitespace boundaries, and removes basic
    punctuation symbols from the start and end of each word.
    """
    _DOC_ERRORS = []

    # Chars to remove from the start/end of words
    strip_from_start = '"' + "'`(["
    strip_from_end = '"' + "'`]).!,?;:"

    def next(self):
        text = self._text
        offset = self._offset
        while True:
            if offset >= len(text):
                break
            # Find the start of the next word
            while offset < len(text) and text[offset].isspace():
                offset += 1
            sPos = offset
            # Find the end of the word
            while offset < len(text) and not text[offset].isspace():
                offset += 1
            ePos = offset
            self._offset = offset
            # Strip chars from the front/end of the word
            while sPos < len(text) and text[sPos] in self.strip_from_start:
                sPos += 1
            while 0 < ePos and text[ePos - 1] in self.strip_from_end:
                ePos -= 1
            # Return if the word isn't empty
            if (sPos < ePos):
                return (text[sPos:ePos], sPos)
        raise StopIteration()


def _try_tokenizer(modName):
    """Look for a tokenizer in the named module.

    Returns the function if found, None otherwise.
    """
    modBase = "enchant.tokenize."
    funcName = "tokenize"
    modName = modBase + modName
    try:
        mod = __import__(modName, globals(), {}, funcName)
        return getattr(mod, funcName)
    except ImportError:
        return None


def wrap_tokenizer(tk1, tk2):
    """Wrap one tokenizer inside another.

    This function takes two tokenizer functions 'tk1' and 'tk2',
    and returns a new tokenizer function that passes the output
    of tk1 through tk2 before yielding it to the calling code.
    """
    # This logic is already implemented in the Filter class.
    # We simply use tk2 as the _split() method for a filter
    # around tk1.
    tkW = Filter(tk1)
    tkW._split = tk2
    return tkW


wrap_tokenizer._DOC_ERRORS = ["tk", "tk", "tk", "tk"]


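# A small usage sketch (added for illustration; not in the original module),
# mirroring what the test suite does: split on whitespace first, then apply
# the English rules to each chunk.
def _wrap_tokenizer_demo(text):
    from enchant.tokenize import en  # deferred import to avoid a cycle
    tknzr = wrap_tokenizer(basic_tokenize, en.tokenize)
    return [(word, pos) for (word, pos) in tknzr(text)]

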
class Chunker(tokenize):
    """Base class for text chunking functions.

    A chunker is designed to chunk text into large blocks of tokens. It
    has the same interface as a tokenizer but is for a different purpose.
    """
    pass


class Filter(object):
    """Base class for token filtering functions.

    A filter is designed to wrap a tokenizer (or another filter) and do
    two things:

      * skip over tokens
      * split tokens into sub-tokens

    Subclasses have two basic options for customising their behaviour. The
    method _skip(word) may be overridden to return True for words that
    should be skipped, and False otherwise. The method _split(word) may
    be overridden as a tokenization function that will be applied to further
    tokenize any words that aren't skipped.
    """

    def __init__(self, tokenizer):
        """Filter class constructor."""
        self._tokenizer = tokenizer

    def __call__(self, *args, **kwds):
        tkn = self._tokenizer(*args, **kwds)
        return self._TokenFilter(tkn, self._skip, self._split)

    def _skip(self, word):
        """Filter method for identifying skippable tokens.

        If this method returns True, the given word will be skipped by
        the filter. This should be overridden in subclasses to produce the
        desired functionality. The default behaviour is not to skip any words.
        """
        return False

    def _split(self, word):
        """Filter method for sub-tokenization of tokens.

        This method must be a tokenization function that will split the
        given word into sub-tokens according to the needs of the filter.
        The default behaviour is not to split any words.
        """
        return unit_tokenize(word)

    class _TokenFilter(object):
        """Private inner class implementing the tokenizer-wrapping logic.

        This might seem convoluted, but we're trying to create something
        akin to a meta-class - when Filter(tknzr) is called it must return
        a *callable* that can then be applied to a particular string to
        perform the tokenization. Since we need to manage a lot of state
        during tokenization, returning a class is the best option.
        """
        _DOC_ERRORS = ["tknzr"]

        def __init__(self, tokenizer, skip, split):
            self._skip = skip
            self._split = split
            self._tokenizer = tokenizer
            # for managing the state of sub-tokenization
            self._curtok = empty_tokenize()
            self._curword = ""
            self._curpos = 0

        def __iter__(self):
            return self

        def __next__(self):
            return self.next()

        def next(self):
            # Try to get the next sub-token from the word currently being split.
            # If unavailable, move on to the next word and try again.
            try:
                (word, pos) = next(self._curtok)
                return (word, pos + self._curpos)
            except StopIteration:
                (word, pos) = next(self._tokenizer)
                while self._skip(word):
                    (word, pos) = next(self._tokenizer)
                self._curword = word
                self._curpos = pos
                self._curtok = self._split(word)
                return self.next()

        # Pass on access to 'offset' to the underlying tokenizer.
        def _get_offset(self):
            return self._tokenizer.offset

        def _set_offset(self, offset):
            msg = "changing a tokenizer's 'offset' attribute is deprecated;" \
                  " use the 'set_offset' method"
            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
            self.set_offset(offset)

        offset = property(_get_offset, _set_offset)

        def set_offset(self, val, replaced=False):
            self._tokenizer.set_offset(val, replaced=replaced)
            # If we stay within the current word, also set the offset on _curtok;
            # otherwise, throw away _curtok and replace it with an empty iterator.
            subval = val - self._curpos
            if subval >= 0 and subval < len(self._curword) and not replaced:
                self._curtok.set_offset(subval)
            else:
                self._curtok = empty_tokenize()
                self._curword = ""
                self._curpos = 0


# Pre-defined chunkers and filters start here

class URLFilter(Filter):
    """Filter skipping over URLs.
    This filter skips any words matching the following regular expression:

        ^[a-zA-z]+:\/\/[^\s].*

    That is, any words that are URLs.
    """
    _DOC_ERRORS = ["zA"]
    _pattern = re.compile(r"^[a-zA-z]+:\/\/[^\s].*")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class WikiWordFilter(Filter):
    """Filter skipping over WikiWords.
    This filter skips any words matching the following regular expression:

        ^([A-Z]\w+[A-Z]+\w+)

    That is, any words that are WikiWords.
    """
    _pattern = re.compile(r"^([A-Z]\w+[A-Z]+\w+)")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class EmailFilter(Filter):
    """Filter skipping over email addresses.
    This filter skips any words matching the following regular expression:

        ^.+@[^\.].*\.[a-z]{2,}$

    That is, any words that resemble email addresses.
    """
    _pattern = re.compile(r"^.+@[^\.].*\.[a-z]{2,}$")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False


class HTMLChunker(Chunker):
    """Chunker for breaking up HTML documents into chunks of checkable text.

    The operation of this chunker is very simple - anything between a "<"
    and a ">" will be ignored. Later versions may improve the algorithm
    slightly.
    """

    def next(self):
        text = self._text
        offset = self.offset
        while True:
            if offset >= len(text):
                break
            # Skip to the end of the current tag, if any.
            if text[offset] == "<":
                maybeTag = offset
                if self._is_tag(text, offset):
                    while text[offset] != ">":
                        offset += 1
                        if offset == len(text):
                            offset = maybeTag + 1
                            break
                    else:
                        offset += 1
                else:
                    offset = maybeTag + 1
            sPos = offset
            # Find the start of the next tag.
            while offset < len(text) and text[offset] != "<":
                offset += 1
            ePos = offset
            self._offset = offset
            # Return if the chunk isn't empty
            if (sPos < offset):
                return (text[sPos:offset], sPos)
        raise StopIteration()

    def _is_tag(self, text, offset):
        if offset + 1 < len(text):
            if text[offset + 1].isalpha():
                return True
            if text[offset + 1] == "/":
                return True
        return False


# TODO: LaTeXChunker
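
# A short usage sketch (added for illustration; not part of the original
# module). It shows the combined chunker/filter pipeline described in the
# module docstring, plus a minimal custom Filter subclass; _HexFilter and
# _pipeline_demo are hypothetical names introduced here.
class _HexFilter(Filter):
    """Hypothetical example filter: skip bare hexadecimal constants."""
    def _skip(self, word):
        return word.startswith("0x")

def _pipeline_demo():
    tknzr = get_tokenizer("en_US", chunkers=(HTMLChunker,),
                          filters=(URLFilter, _HexFilter))
    text = "<html><body>see http://example.com and 0xdeadbeef here</body></html>"
    # Expected to yield only the plain words: "see", "and", "here"
    return [word for (word, pos) in tknzr(text)]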
@@ -1,172 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.tokenize.en: Tokenizer for the English language

This module implements a PyEnchant text tokenizer for the English
language, based on very simple rules.

"""
import unicodedata

import enchant.tokenize
from enchant.utils import unicode


class tokenize(enchant.tokenize.tokenize):
    """Iterator splitting text into words, reporting position.

    This iterator takes a text string as input, and yields tuples
    representing each distinct word found in the text. The tuples
    take the form:

        (<word>,<pos>)

    Where <word> is the word string found and <pos> is the position
    of the start of the word within the text.

    The optional argument <valid_chars> may be used to specify a
    list of additional characters that can form part of a word.
    By default, this list contains only the apostrophe ('). Note that
    these characters cannot appear at the start or end of a word.
    """

    _DOC_ERRORS = ["pos", "pos"]

    def __init__(self, text, valid_chars=("'",)):
        self._valid_chars = valid_chars
        self._text = text
        self._offset = 0
        # Select the proper implementation of self._consume_alpha.
        # 'text' isn't necessarily a string (it could be e.g. a mutable array)
        # so we can't use isinstance(text, unicode) to detect unicode.
        # Instead we typetest the first character of the text.
        # If there are no characters then it doesn't matter what implementation
        # we use since it won't be called anyway.
        try:
            char1 = text[0]
        except IndexError:
            self._consume_alpha = self._consume_alpha_b
        else:
            if isinstance(char1, unicode):
                self._consume_alpha = self._consume_alpha_u
            else:
                self._consume_alpha = self._consume_alpha_b

    def _consume_alpha_b(self, text, offset):
        """Consume an alphabetic character from the given bytestring.

        Given a bytestring and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string. Non-ASCII bytes are interpreted as utf-8 and can
        result in multiple characters being consumed.
        """
        assert offset < len(text)
        if text[offset].isalpha():
            return 1
        elif text[offset] >= "\x80":
            return self._consume_alpha_utf8(text, offset)
        return 0

    def _consume_alpha_utf8(self, text, offset):
        """Consume a sequence of utf8 bytes forming an alphabetic character."""
        incr = 2
        u = ""
        while not u and incr <= 4:
            try:
                try:
                    # In the common case this will be a string
                    u = text[offset:offset + incr].decode("utf8")
                except AttributeError:
                    # Looks like it was e.g. a mutable char array.
                    try:
                        s = text[offset:offset + incr].tostring()
                    except AttributeError:
                        s = "".join([c for c in text[offset:offset + incr]])
                    u = s.decode("utf8")
            except UnicodeDecodeError:
                incr += 1
        if not u:
            return 0
        if u.isalpha():
            return incr
        if unicodedata.category(u)[0] == "M":
            return incr
        return 0

    def _consume_alpha_u(self, text, offset):
        """Consume an alphabetic character from the given unicode string.

        Given a unicode string and the current offset, this method returns
        the number of characters occupied by the next alphabetic character
        in the string. Trailing combining characters are consumed as a
        single letter.
        """
        assert offset < len(text)
        incr = 0
        if text[offset].isalpha():
            incr = 1
            while offset + incr < len(text):
                if unicodedata.category(text[offset + incr])[0] != "M":
                    break
                incr += 1
        return incr

    def next(self):
        text = self._text
        offset = self._offset
        while offset < len(text):
            # Find the start of the next word (must be alpha)
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if incr:
                    break
                offset += 1
            curPos = offset
            # Find the end of the word, allowing valid_chars
            while offset < len(text):
                incr = self._consume_alpha(text, offset)
                if not incr:
                    if text[offset] in self._valid_chars:
                        incr = 1
                    else:
                        break
                offset += incr
            # Return if the word isn't empty
            if (curPos != offset):
                # Make sure the word doesn't end with a valid_char
                while text[offset - 1] in self._valid_chars:
                    offset = offset - 1
                self._offset = offset
                return (text[curPos:offset], curPos)
        self._offset = offset
        raise StopIteration()
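
# A brief usage sketch (added for illustration; not in the original file):
# the valid_chars argument lets hyphens join words that the default rules
# would otherwise split.
if __name__ == "__main__":
    for (word, pos) in tokenize("it's a spell-checker", valid_chars=("'", "-")):
        print((word, pos))  # ("it's", 0), ("a", 5), ("spell-checker", 7)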
@@ -1,326 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008, Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.tokenize.tests: unittests for enchant tokenization functions.

"""
import unittest
import array

from enchant.tokenize import *
from enchant.tokenize.en import tokenize as tokenize_en
from enchant.utils import raw_unicode, unicode, bytes


class TestTokenization(unittest.TestCase):
    """TestCases for testing the basic tokenization functionality."""

    def test_basic_tokenize(self):
        """Simple regression test for basic white-space tokenization."""
        input = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the (handling) of 'quoted' words."""
        output = [
            ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10), ("It's", 22),
            ("not", 27), ("very", 31), ("special", 36), ("but", 45), ("it's", 49),
            ("designed", 54), ("2", 63), ("show", 65), ("how", 70), ("the", 74),
            ("splitter", 78), ("works", 87), ("with", 93), ("many-different", 98),
            ("combos", 113), ("of", 120), ("words", 123),
            ("Also", 130), ("need", 135),
            ("to", 140), ("test", 144), ("the", 150), ("handling", 155),
            ("of", 165), ("quoted", 169), ("words", 177)
        ]
        self.assertEqual(output, [i for i in basic_tokenize(input)])
        for (itmO, itmV) in zip(output, basic_tokenize(input)):
            self.assertEqual(itmO, itmV)

    def test_tokenize_strip(self):
        """Test special-char-stripping edge-cases in basic_tokenize."""
        input = "((' <this> \"\" 'text' has (lots) of (special chars} >>]"
        output = [("<this>", 4), ("text", 15), ("has", 21), ("lots", 26), ("of", 32),
                  ("special", 36), ("chars}", 44), (">>", 51)]
        self.assertEqual(output, [i for i in basic_tokenize(input)])
        for (itmO, itmV) in zip(output, basic_tokenize(input)):
            self.assertEqual(itmO, itmV)

    def test_wrap_tokenizer(self):
        """Test wrapping of one tokenizer with another."""
        input = "this-string will be split@according to diff'rnt rules"
        from enchant.tokenize import en
        tknzr = wrap_tokenizer(basic_tokenize, en.tokenize)
        tknzr = tknzr(input)
        self.assertEqual(tknzr._tokenizer.__class__, basic_tokenize)
        self.assertEqual(tknzr._tokenizer.offset, 0)
        for (n, (word, pos)) in enumerate(tknzr):
            if n == 0:
                self.assertEqual(pos, 0)
                self.assertEqual(word, "this")
            if n == 1:
                self.assertEqual(pos, 5)
                self.assertEqual(word, "string")
            if n == 2:
                self.assertEqual(pos, 12)
                self.assertEqual(word, "will")
                # Test setting the offset to a previous token
                tknzr.set_offset(5)
                self.assertEqual(tknzr.offset, 5)
                self.assertEqual(tknzr._tokenizer.offset, 5)
                self.assertEqual(tknzr._curtok.__class__, empty_tokenize)
            if n == 3:
                self.assertEqual(word, "string")
                self.assertEqual(pos, 5)
            if n == 4:
                self.assertEqual(pos, 12)
                self.assertEqual(word, "will")
            if n == 5:
                self.assertEqual(pos, 17)
                self.assertEqual(word, "be")
                # Test setting the offset past the current token
                tknzr.set_offset(20)
                self.assertEqual(tknzr.offset, 20)
                self.assertEqual(tknzr._tokenizer.offset, 20)
                self.assertEqual(tknzr._curtok.__class__, empty_tokenize)
            if n == 6:
                self.assertEqual(pos, 20)
                self.assertEqual(word, "split")
            if n == 7:
                self.assertEqual(pos, 26)
                self.assertEqual(word, "according")
                # Test setting the offset to the middle of the current token
                tknzr.set_offset(23)
                self.assertEqual(tknzr.offset, 23)
                self.assertEqual(tknzr._tokenizer.offset, 23)
                self.assertEqual(tknzr._curtok.offset, 3)
            if n == 8:
                self.assertEqual(pos, 23)
                self.assertEqual(word, "it")
                # OK, I'm pretty happy with the behaviour, no need to
                # continue testing the rest of the string


class TestFilters(unittest.TestCase):
    """TestCases for the various Filter subclasses."""

    text = """this text with http://url.com and SomeLinksLike
              ftp://my.site.com.au/some/file AndOthers not:/quite.a.url
              with-an@aemail.address as well"""

    def setUp(self):
        pass

    def test_URLFilter(self):
        """Test filtering of URLs."""
        tkns = get_tokenizer("en_US", filters=(URLFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("and", 30),
               ("SomeLinksLike", 34), ("AndOthers", 93), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116), ("with", 134), ("an", 139), ("aemail", 142),
               ("address", 149), ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

    def test_WikiWordFilter(self):
        """Test filtering of WikiWords."""
        tkns = get_tokenizer("en_US", filters=(WikiWordFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22), ("com", 26),
               ("and", 30), ("ftp", 62), ("my", 68), ("site", 71), ("com", 76), ("au", 80),
               ("some", 83), ("file", 88), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116), ("with", 134), ("an", 139), ("aemail", 142),
               ("address", 149), ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

    def test_EmailFilter(self):
        """Test filtering of email addresses."""
        tkns = get_tokenizer("en_US", filters=(EmailFilter,))(self.text)
        out = [t for t in tkns]
        exp = [("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22), ("com", 26),
               ("and", 30), ("SomeLinksLike", 34),
               ("ftp", 62), ("my", 68), ("site", 71), ("com", 76), ("au", 80),
               ("some", 83), ("file", 88), ("AndOthers", 93), ("not", 103), ("quite", 108),
               ("a", 114), ("url", 116),
               ("as", 157), ("well", 160)]
        self.assertEqual(out, exp)

def test_CombinedFilter(self):
|
||||
"""Test several filters combined"""
|
||||
tkns = get_tokenizer("en_US", filters=(URLFilter, WikiWordFilter, EmailFilter))(self.text)
|
||||
out = [t for t in tkns]
|
||||
exp = [("this", 0), ("text", 5), ("with", 10),
|
||||
("and", 30), ("not", 103), ("quite", 108),
|
||||
("a", 114), ("url", 116),
|
||||
("as", 157), ("well", 160)]
|
||||
self.assertEqual(out, exp)
|
||||
|
||||
|
||||
class TestChunkers(unittest.TestCase):
    """TestCases for the various Chunker subclasses."""

    def test_HTMLChunker(self):
        """Test chunking of HTML"""
        text = """hello<html><head><title>my title</title></head><body>this is a
<b>simple</b> HTML document for <p> test<i>ing</i> purposes</p>.
It < contains > various <-- special characters.
"""
        tkns = get_tokenizer("en_US", chunkers=(HTMLChunker,))(text)
        out = [t for t in tkns]
        exp = [("hello", 0), ("my", 24), ("title", 27), ("this", 53), ("is", 58),
               ("a", 61), ("simple", 82), ("HTML", 93), ("document", 98), ("for", 107),
               ("test", 115), ("ing", 122), ("purposes", 130), ("It", 160),
               ("contains", 165), ("various", 176), ("special", 188),
               ("characters", 196)]
        self.assertEqual(out, exp)
        for (word, pos) in out:
            self.assertEqual(text[pos:pos + len(word)], word)


class TestTokenizeEN(unittest.TestCase):
    """TestCases for checking behaviour of English tokenization."""

    def test_tokenize_en(self):
        """Simple regression test for English tokenization."""
        input = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the handling of 'quoted' words."""
        output = [
            ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10), ("It's", 22),
            ("not", 27), ("very", 31), ("special", 36), ("but", 45), ("it's", 49),
            ("designed", 54), ("show", 65), ("how", 70), ("the", 74),
            ("splitter", 78), ("works", 87), ("with", 93), ("many", 98),
            ("different", 103), ("combos", 113), ("of", 120), ("words", 123),
            ("Also", 130), ("need", 135),
            ("to", 140), ("test", 144), ("the", 150), ("handling", 154),
            ("of", 163), ("quoted", 167), ("words", 175)
        ]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV)

    def test_unicodeBasic(self):
        """Test tokenization of a basic unicode string."""
        input = raw_unicode(
            r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
        output = input.split(" ")
        output[8] = output[8][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_unicodeCombining(self):
        """Test tokenization with unicode combining symbols."""
        input = raw_unicode(
            r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
        output = input.split(" ")
        output[8] = output[8][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_utf8_bytes(self):
        """Test tokenization of UTF8-encoded bytes (bug #2500184)."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])
            self.assertTrue(input[itmV[1]:].startswith(itmO))

    def test_utf8_bytes_at_end(self):
        """Test tokenization of UTF8-encoded bytes at end of word."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV[0])

    def test_utf8_bytes_in_an_array(self):
        """Test tokenization of UTF8-encoded bytes stored in an array."""
        # Python3 doesn't support bytestrings, don't run this test
        if str is unicode:
            return
        input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
        output = input.split(" ")
        output[1] = output[1][0:-1]
        input = array.array('c', input)
        output = [array.array('c', w) for w in output]
        for (itmO, itmV) in zip(output, tokenize_en(array.array('c', input))):
            self.assertEqual(itmO, itmV[0])
            self.assertEqual(input[itmV[1]:itmV[1] + len(itmV[0])], itmO)

    def test_bug1591450(self):
        """Check for tokenization regressions identified in bug #1591450."""
        input = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
        output = [
            ("Testing", 0), ("i", 9), ("markup", 11), ("i", 19), ("and", 22),
            ("y", 27), ("i", 29), ("so", 31), ("forth", 34), ("leading", 42),
            ("dots", 50), ("and", 55), ("trail", 59), ("well", 68),
            ("you", 74), ("get", 78), ("the", 82), ("point", 86),
            ("Also", 93), ("check", 98), ("numbers", 104), ("Done", 134),
        ]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV)

    def test_bug2785373(self):
        """Testcases for bug #2785373"""
        input = "So, one dey when I wes 17, I left."
        for _ in tokenize_en(input):
            pass
        input = raw_unicode("So, one dey when I wes 17, I left.")
        for _ in tokenize_en(input):
            pass

    def test_finnish_text(self):
        """Test tokenizing some Finnish text.

        This really should work since there are no special rules to apply,
        just lots of non-ascii characters.
        """
        inputT = raw_unicode(
            'T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 \'tarkistaa\' sanat jotka "lainausmerkeiss\\xe4". Heittomerkki ja vaa\'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.')
        outputT = [
            (raw_unicode('T\\xe4m\\xe4'), 0), (raw_unicode('on'), 5), (raw_unicode('kappale'), 8),
            (raw_unicode('Eip\\xe4'), 17), (raw_unicode('ole'), 22), (raw_unicode('kovin'), 26),
            (raw_unicode('nen'), 34), (raw_unicode('mutta'), 39), (raw_unicode('tarkoitus'), 45),
            (raw_unicode('on'), 55), (raw_unicode('n\\xe4ytt\\xe4\\xe4'), 58), (raw_unicode('miten'), 66),
            (raw_unicode('sanastaja'), 72), (raw_unicode('toimii'), 83), (raw_unicode('useiden'), 90),
            (raw_unicode('erilaisten'), 98), (raw_unicode('sanarypp\\xe4iden'), 109), (raw_unicode('kimpussa'), 123),
            (raw_unicode('Pit\\xe4\\xe4p\\xe4'), 133), (raw_unicode('viel\\xe4'), 141), (raw_unicode('tarkistaa'), 148),
            (raw_unicode('sanat'), 159), (raw_unicode('jotka'), 165), (raw_unicode('lainausmerkeiss\\xe4'), 172),
            (raw_unicode('Heittomerkki'), 191), (raw_unicode('ja'), 204), (raw_unicode("vaa'an"), 207),
            (raw_unicode('Ulkomaisia'), 215), (raw_unicode('sanoja'), 226), (raw_unicode('s\\xfcss'), 233),
            (raw_unicode('spa\\xdf'), 239), ]
        for (itmO, itmV) in zip(outputT, tokenize_en(inputT)):
            self.assertEqual(itmO, itmV)
@@ -1,354 +0,0 @@
# pyenchant
#
# Copyright (C) 2004-2008 Ryan Kelly
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# In addition, as a special exception, you are
# given permission to link the code of this program with
# non-LGPL Spelling Provider libraries (eg: a MSFT Office
# spell checker backend) and distribute linked combinations including
# the two. You must obey the GNU Lesser General Public License in all
# respects for all of the code used other than said providers. If you modify
# this file, you may extend this exception to your version of the
# file, but you are not obligated to do so. If you do not wish to
# do so, delete this exception statement from your version.
#
"""

enchant.utils: Misc utilities for the enchant package
========================================================

This module provides miscellaneous utilities for use with the
enchant spellchecking package. Currently available functionality
includes:

* string/unicode compatibility wrappers
* functions for dealing with locale/language settings
* ability to list supporting data files (win32 only)
* functions for bundling supporting data files from a build

"""

import os
import sys
import codecs

from enchant.errors import *

# Attempt to access local language information
try:
    import locale
except ImportError:
    locale = None

#
# Unicode/Bytes compatibility wrappers.
#
# These allow us to support both Python 2.x and Python 3.x from
# the same codebase.
#
# We provide explicit type objects "bytes" and "unicode" that can be
# used to construct instances of the appropriate type. The class
# "EnchantStr" derives from the default "str" type and implements the
# necessary logic for encoding/decoding as strings are passed into
# the underlying C library (where they must always be utf-8 encoded
# byte strings).
#

try:
    unicode = unicode
except NameError:
    str = str
    unicode = str
    bytes = bytes
    basestring = (str, bytes)
else:
    str = str
    unicode = unicode
    bytes = str
    basestring = basestring


def raw_unicode(raw):
    """Make a unicode string from a raw string.

    This function takes a string containing unicode escape characters,
    and returns the corresponding unicode string. Useful for writing
    unicode string literals in your python source while being upwards-
    compatible with Python 3. For example, instead of doing this:

        s = u"hello\u2149"  # syntax error in Python 3

    Or this:

        s = "hello\u2149"  # not what you want in Python 2.x

    You can do this:

        s = raw_unicode(r"hello\u2149")  # works everywhere!

    """
    return raw.encode("utf8").decode("unicode-escape")


def raw_bytes(raw):
    """Make a bytes object out of a raw string.

    This is analogous to raw_unicode, but processes byte escape characters
    to produce a bytes object.
    """
    return codecs.escape_decode(raw)[0]


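# A minimal usage sketch for the two helpers above (added for illustration,
# not part of the original module; the sample strings are arbitrary):
#
#     >>> raw_unicode(r"Cura\u00E7ao")
#     u'Cura\xe7ao'
#     >>> raw_bytes(r"caf\xc3\xa9")
#     'caf\xc3\xa9'
#
# raw_bytes() leans on codecs.escape_decode(), an undocumented CPython helper,
# which is why it is wrapped here rather than called directly by users.

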
class EnchantStr(str):
    """String subclass for interfacing with enchant C library.

    This class encapsulates the logic for interfacing between python native
    string/unicode objects and the underlying enchant library, which expects
    all strings to be UTF-8 character arrays. It is a subclass of the
    default string class 'str' - on Python 2.x that makes it an ascii string,
    on Python 3.x it is a unicode object.

    Initialise it with a string or unicode object, and use the encode() method
    to obtain an object suitable for passing to the underlying C library.
    When strings are read back into python, use decode(s) to translate them
    back into the appropriate python-level string type.

    This allows us to follow the common Python 2.x idiom of returning
    unicode when unicode is passed in, and byte strings otherwise. It also
    lets the interface be upwards-compatible with Python 3, in which string
    objects are unicode by default.
    """

    def __new__(cls, value):
        """EnchantStr data constructor.

        This method records whether the initial string was unicode, then
        simply passes it along to the default string constructor.
        """
        if type(value) is unicode:
            was_unicode = True
            if str is not unicode:
                value = value.encode("utf-8")
        else:
            was_unicode = False
            if str is not bytes:
                raise Error("Don't pass bytestrings to pyenchant")
        self = str.__new__(cls, value)
        self._was_unicode = was_unicode
        return self

    def encode(self):
        """Encode this string into a form usable by the enchant C library."""
        if str is unicode:
            return str.encode(self, "utf-8")
        else:
            return self

    def decode(self, value):
        """Decode a string returned by the enchant C library."""
        if self._was_unicode:
            if str is unicode:
                # On some python3 versions, ctypes converts c_char_p
                # to str() rather than bytes()
                if isinstance(value, str):
                    value = value.encode()
                return value.decode("utf-8")
            else:
                return value.decode("utf-8")
        else:
            return value


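# A short round-trip sketch of how EnchantStr is meant to be used (added for
# illustration; the sample value "Hello" is arbitrary):
#
#     s = EnchantStr(u"Hello")
#     data = s.encode()      # utf-8 encoded form, safe to pass to the C library
#     word = s.decode(data)  # translated back to the type the caller passed in
#     assert word == u"Hello"

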
def printf(values, sep=" ", end="\n", file=None):
    """Compatibility wrapper for the print statement/function.

    This function is a simple Python2/Python3 compatibility wrapper
    for printing to stdout.
    """
    if file is None:
        file = sys.stdout
    file.write(sep.join(map(str, values)))
    file.write(end)


try:
    next = next
except NameError:
    def next(iter):
        """Compatibility wrapper for advancing an iterator."""
        return iter.next()

try:
    xrange = xrange
except NameError:
    xrange = range


#
# Other useful functions.
#


def levenshtein(s1, s2):
    """Calculate the Levenshtein distance between two strings.

    This is straight from Wikipedia.
    """
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1:
        return len(s2)

    previous_row = xrange(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


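# Quick sanity checks for the implementation above (illustrative values):
#
#     >>> levenshtein("enchant", "enchnt")
#     1
#     >>> levenshtein("kitten", "sitting")
#     3

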
def trim_suggestions(word, suggs, maxlen, calcdist=None):
    """Trim a list of suggestions to a maximum length.

    If the list of suggested words is too long, you can use this function
    to trim it down to a maximum length. It tries to keep the "best"
    suggestions based on similarity to the original word.

    If the optional "calcdist" argument is provided, it must be a callable
    taking two words and returning the distance between them. It will be
    used to determine which words to retain in the list. The default is
    a simple Levenshtein distance.
    """
    if calcdist is None:
        calcdist = levenshtein
    decorated = [(calcdist(word, s), s) for s in suggs]
    decorated.sort()
    return [s for (l, s) in decorated[:maxlen]]


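# Example of trimming a suggestion list with the default distance function
# (illustrative words only):
#
#     >>> suggs = ["enchant", "enchants", "enchanter", "penchant", "incant"]
#     >>> trim_suggestions("enchnt", suggs, 2)
#     ['enchant', 'enchants']

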
def get_default_language(default=None):
    """Determine the user's default language, if possible.

    This function uses the 'locale' module to try to determine
    the user's preferred language. The return value is as
    follows:

    * if a locale is available for the LC_MESSAGES category,
      that language is used
    * if a default locale is available, that language is used
    * if the keyword argument <default> is given, it is used
    * if nothing else works, None is returned

    Note that determining the user's language is in general only
    possible if they have set the necessary environment variables
    on their system.
    """
    try:
        import locale
        tag = locale.getlocale()[0]
        if tag is None:
            tag = locale.getdefaultlocale()[0]
        if tag is None:
            raise Error("No default language available")
        return tag
    except Exception:
        pass
    return default


get_default_language._DOC_ERRORS = ["LC"]


def get_resource_filename(resname):
    """Get the absolute path to the named resource file.

    This serves largely the same purpose as pkg_resources.resource_filename(),
    but tries to avoid loading pkg_resources unless we're actually in
    an egg.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(path, resname)
    if os.path.exists(path):
        return path
    if hasattr(sys, "frozen"):
        exe_path = unicode(sys.executable, sys.getfilesystemencoding())
        exe_dir = os.path.dirname(exe_path)
        path = os.path.join(exe_dir, resname)
        if os.path.exists(path):
            return path
    else:
        import pkg_resources
        try:
            path = pkg_resources.resource_filename("enchant", resname)
        except KeyError:
            pass
        else:
            path = os.path.abspath(path)
            if os.path.exists(path):
                return path
    raise Error("Could not locate resource '%s'" % (resname,))


def win32_data_files():
    """Get list of supporting data files, for use with setup.py

    This function returns a list of the supporting data files available
    to the running version of PyEnchant. This is in the format expected
    by the data_files argument of the distutils setup function. It's
    very useful, for example, for including the data files in an executable
    produced by py2exe.

    Only really tested on the win32 platform (it's the only platform for
    which we ship our own supporting data files)
    """
    # Include the main enchant DLL
    try:
        libEnchant = get_resource_filename("libenchant.dll")
    except Error:
        libEnchant = get_resource_filename("libenchant-1.dll")
    mainDir = os.path.dirname(libEnchant)
    dataFiles = [('', [libEnchant])]
    # And some specific supporting DLLs
    for dll in os.listdir(mainDir):
        if not dll.endswith(".dll"):
            continue
        for prefix in ("iconv", "intl", "libglib", "libgmodule"):
            if dll.startswith(prefix):
                break
        else:
            continue
        dataFiles[0][1].append(os.path.join(mainDir, dll))
    # And anything found in the supporting data directories
    dataDirs = ("share/enchant/myspell", "share/enchant/ispell", "lib/enchant")
    for dataDir in dataDirs:
        files = []
        fullDir = os.path.join(mainDir, os.path.normpath(dataDir))
        for fn in os.listdir(fullDir):
            fullFn = os.path.join(fullDir, fn)
            if os.path.isfile(fullFn):
                files.append(fullFn)
        dataFiles.append((dataDir, files))
    return dataFiles


win32_data_files._DOC_ERRORS = ["py", "py", "exe"]
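# For reference, the structure returned above has the shape distutils expects
# for its data_files argument, e.g. (paths purely illustrative):
#
#     [('', ['C:\\enchant\\libenchant.dll', 'C:\\enchant\\iconv.dll']),
#      ('share/enchant/myspell', ['C:\\enchant\\share\\enchant\\myspell\\en_US.dic'])]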
264
PACK/maskgen.py
Normal file → Executable file
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/python3
# MaskGen - Generate Password Masks
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -36,6 +36,11 @@ class MaskGen:
        self.minoccurrence = None
        self.maxoccurrence = None

        self.customcharset1len = None
        self.customcharset2len = None
        self.customcharset3len = None
        self.customcharset4len = None

        # PPS (Passwords per Second) Cracking Speed
        self.pps = 1000000000
        self.showmasks = False
@@ -57,42 +62,60 @@ class MaskGen:
                count *= 33
            elif char == "a":
                count *= 95
            elif char == "b":
                count *= 256
            elif char == "h":
                count *= 16
            elif char == "H":
                count *= 16
            elif char == "1" and self.customcharset1len:
                count *= self.customcharset1len
            elif char == "2" and self.customcharset2len:
                count *= self.customcharset2len
            elif char == "3" and self.customcharset3len:
                count *= self.customcharset3len
            elif char == "4" and self.customcharset4len:
                count *= self.customcharset4len
            else:
                print
                "[!] Error, unknown mask ?%s in a mask %s" % (char, mask)
                print("[!] Error, unknown mask ?%s in a mask %s" %
                      (char, mask))

        return count

    def loadmasks(self, filename):
        """ Load masks and apply filters. """
        maskReader = csv.reader(open(args[0], 'r'), delimiter=',', quotechar='"')
        maskReader = csv.reader(
            open(args[0], 'r'), delimiter=',', quotechar='"')

        for (mask, occurrence) in maskReader:

            if mask == "": continue
            if mask == "":
                continue

            mask_occurrence = int(occurrence)
            mask_length = len(mask) / 2
            mask_length = len(mask)/2
            mask_complexity = self.getcomplexity(mask)
            mask_time = mask_complexity / self.pps
            mask_time = mask_complexity/self.pps

            self.total_occurrence += mask_occurrence

            # Apply filters based on occurrence, length, complexity and time
            if (self.minoccurrence == None or mask_occurrence >= self.minoccurrence) and \
               (self.maxoccurrence == None or mask_occurrence <= self.maxoccurrence) and \
               (self.mincomplexity == None or mask_complexity <= self.mincomplexity) and \
               (self.maxcomplexity == None or mask_complexity <= self.maxcomplexity) and \
               (self.mintime == None or mask_time <= self.mintime) and \
               (self.maxtime == None or mask_time <= self.maxtime) and \
               (self.maxlength == None or mask_length <= self.maxlength) and \
               (self.minlength == None or mask_length >= self.minlength):
               (self.maxoccurrence == None or mask_occurrence <= self.maxoccurrence) and \
               (self.mincomplexity == None or mask_complexity >= self.mincomplexity) and \
               (self.maxcomplexity == None or mask_complexity <= self.maxcomplexity) and \
               (self.mintime == None or mask_time >= self.mintime) and \
               (self.maxtime == None or mask_time <= self.maxtime) and \
               (self.maxlength == None or mask_length <= self.maxlength) and \
               (self.minlength == None or mask_length >= self.minlength):

                self.masks[mask] = dict()
                self.masks[mask]['length'] = mask_length
                self.masks[mask]['occurrence'] = mask_occurrence
                self.masks[mask]['complexity'] = 1 - mask_complexity
                self.masks[mask]['time'] = mask_time
                self.masks[mask]['optindex'] = 1 - mask_complexity / mask_occurrence
                self.masks[mask]['optindex'] = 1 - \
                    mask_complexity/mask_occurrence

    def generate_masks(self, sorting_mode):
        """ Generate optimal password masks sorted by occurrence, complexity or optindex """
@@ -104,16 +127,15 @@ class MaskGen:
        # Group by length 1,2,3,4,5,6,7,8,9,10....
        # Group by occurrence 10%, 20%, 30%, 40%, 50%....

        if self.showmasks: print
        "[L:] Mask: [ Occ: ] [ Time: ]"
        for mask in sorted(self.masks.keys(), key=lambda mask: self.masks[mask][sorting_mode], reverse=True):
        if self.showmasks:
            print("[L:] Mask: [ Occ: ] [ Time: ]")
        for mask in sorted(list(self.masks.keys()), key=lambda mask: self.masks[mask][sorting_mode], reverse=True):

            if self.showmasks:
                time_human = ">1 year" if self.masks[mask]['time'] > 60 * 60 * 24 * 365 else str(
                time_human = ">1 year" if self.masks[mask]['time'] > 60*60*24*365 else str(
                    datetime.timedelta(seconds=self.masks[mask]['time']))
                print
                "[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(self.masks[mask]['length'], mask,
                                                         self.masks[mask]['occurrence'], time_human)
                print("[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(
                    self.masks[mask]['length'], mask, self.masks[mask]['occurrence'], time_human))

            if self.output_file:
                self.output_file.write("%s\n" % mask)
|
||||
sample_count += 1
|
||||
|
||||
if self.target_time and sample_time > self.target_time:
|
||||
print
|
||||
"[!] Target time exceeded."
|
||||
print("[!] Target time exceeded.")
|
||||
break
|
||||
|
||||
print
|
||||
"[*] Finished generating masks:"
|
||||
print
|
||||
" Masks generated: %s" % sample_count
|
||||
print
|
||||
" Masks coverage: %d%% (%d/%d)" % (
|
||||
sample_occurrence * 100 / self.total_occurrence, sample_occurrence, self.total_occurrence)
|
||||
time_human = ">1 year" if sample_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=sample_time))
|
||||
print
|
||||
" Masks runtime: %s" % time_human
|
||||
print("[*] Finished generating masks:")
|
||||
print(" Masks generated: %s" % sample_count)
|
||||
print(" Masks coverage: %d%% (%d/%d)" % (sample_occurrence*100 /
|
||||
self.total_occurrence, sample_occurrence, self.total_occurrence))
|
||||
time_human = ">1 year" if sample_time > 60*60*24 * \
|
||||
365 else str(datetime.timedelta(seconds=sample_time))
|
||||
print(" Masks runtime: %s" % time_human)
|
||||
|
||||
    def getmaskscoverage(self, checkmasks):

@@ -145,8 +163,8 @@ class MaskGen:

        total_complexity = 0

        if self.showmasks: print
        "[L:] Mask: [ Occ: ] [ Time: ]"
        if self.showmasks:
            print("[L:] Mask: [ Occ: ] [ Time: ]")
        for mask in checkmasks:
            mask = mask.strip()
            mask_complexity = self.getcomplexity(mask)
@@ -156,11 +174,10 @@ class MaskGen:
            if mask in self.masks:

                if self.showmasks:
                    time_human = ">1 year" if self.masks[mask]['time'] > 60 * 60 * 24 * 365 else str(
                    time_human = ">1 year" if self.masks[mask]['time'] > 60*60*24*365 else str(
                        datetime.timedelta(seconds=self.masks[mask]['time']))
                    print
                    "[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(self.masks[mask]['length'], mask,
                                                             self.masks[mask]['occurrence'], time_human)
                    print("[{:>2}] {:<30} [{:<7}] [{:>8}] ".format(
                        self.masks[mask]['length'], mask, self.masks[mask]['occurrence'], time_human))

                if self.output_file:
                    self.output_file.write("%s\n" % mask)
@@ -168,23 +185,19 @@ class MaskGen:
                    sample_occurrence += self.masks[mask]['occurrence']
                    sample_count += 1

            if self.target_time and total_complexity / self.pps > self.target_time:
                print
                "[!] Target time exceeded."
            if self.target_time and total_complexity/self.pps > self.target_time:
                print("[!] Target time exceeded.")
                break

        # TODO: Something wrong here, complexity and time doesn't match with estimated from policygen
        total_time = total_complexity / self.pps
        time_human = ">1 year" if total_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=total_time))
        print
        "[*] Finished matching masks:"
        print
        " Masks matched: %s" % sample_count
        print
        " Masks coverage: %d%% (%d/%d)" % (
            sample_occurrence * 100 / self.total_occurrence, sample_occurrence, self.total_occurrence)
        print
        " Masks runtime: %s" % time_human
        total_time = total_complexity/self.pps
        time_human = ">1 year" if total_time > 60*60*24 * \
            365 else str(datetime.timedelta(seconds=total_time))
        print("[*] Finished matching masks:")
        print(" Masks matched: %s" % sample_count)
        print(" Masks coverage: %d%% (%d/%d)" % (sample_occurrence*100 /
                                                 self.total_occurrence, sample_occurrence, self.total_occurrence))
        print(" Masks runtime: %s" % time_human)


if __name__ == "__main__":
@@ -199,85 +212,127 @@ if __name__ == "__main__":
    header += " |_| iphelix@thesprawl.org\n"
    header += "\n"

    parser = OptionParser("%prog pass0.masks [pass1.masks ...] [options]", version="%prog " + VERSION)
    parser = OptionParser(
        "%prog pass0.masks [pass1.masks ...] [options]", version="%prog "+VERSION)

    parser.add_option("-t", "--targettime", dest="target_time", type="int", metavar="86400",
                      help="Target time of all masks (seconds)")
    parser.add_option("-o", "--outputmasks", dest="output_masks", metavar="masks.hcmask", help="Save masks to a file")
    parser.add_option("-t", "--targettime", dest="target_time", type="int",
                      metavar="86400", help="Target time of all masks (seconds)")
    parser.add_option("-o", "--outputmasks", dest="output_masks",
                      metavar="masks.hcmask", help="Save masks to a file")

    filters = OptionGroup(parser, "Individual Mask Filter Options")
    filters.add_option("--minlength", dest="minlength", type="int", metavar="8", help="Minimum password length")
    filters.add_option("--maxlength", dest="maxlength", type="int", metavar="8", help="Maximum password length")
    filters.add_option("--mintime", dest="mintime", type="int", metavar="3600", help="Minimum mask runtime (seconds)")
    filters.add_option("--maxtime", dest="maxtime", type="int", metavar="3600", help="Maximum mask runtime (seconds)")
    filters.add_option("--mincomplexity", dest="mincomplexity", type="int", metavar="1", help="Minimum complexity")
    filters.add_option("--maxcomplexity", dest="maxcomplexity", type="int", metavar="100", help="Maximum complexity")
    filters.add_option("--minoccurrence", dest="minoccurrence", type="int", metavar="1", help="Minimum occurrence")
    filters.add_option("--maxoccurrence", dest="maxoccurrence", type="int", metavar="100", help="Maximum occurrence")
    filters.add_option("--minlength", dest="minlength",
                       type="int", metavar="8", help="Minimum password length")
    filters.add_option("--maxlength", dest="maxlength",
                       type="int", metavar="8", help="Maximum password length")
    filters.add_option("--mintime", dest="mintime", type="int",
                       metavar="3600", help="Minimum mask runtime (seconds)")
    filters.add_option("--maxtime", dest="maxtime", type="int",
                       metavar="3600", help="Maximum mask runtime (seconds)")
    filters.add_option("--mincomplexity", dest="mincomplexity",
                       type="int", metavar="1", help="Minimum complexity")
    filters.add_option("--maxcomplexity", dest="maxcomplexity",
                       type="int", metavar="100", help="Maximum complexity")
    filters.add_option("--minoccurrence", dest="minoccurrence",
                       type="int", metavar="1", help="Minimum occurrence")
    filters.add_option("--maxoccurrence", dest="maxoccurrence",
                       type="int", metavar="100", help="Maximum occurrence")
    parser.add_option_group(filters)

    sorting = OptionGroup(parser, "Mask Sorting Options")
    sorting.add_option("--optindex", action="store_true", dest="optindex", help="sort by mask optindex (default)",
                       default=False)
    sorting.add_option("--occurrence", action="store_true", dest="occurrence", help="sort by mask occurrence",
                       default=False)
    sorting.add_option("--complexity", action="store_true", dest="complexity", help="sort by mask complexity",
                       default=False)
    sorting.add_option("--optindex", action="store_true", dest="optindex",
                       help="sort by mask optindex (default)", default=False)
    sorting.add_option("--occurrence", action="store_true", dest="occurrence",
                       help="sort by mask occurrence", default=False)
    sorting.add_option("--complexity", action="store_true", dest="complexity",
                       help="sort by mask complexity", default=False)
    parser.add_option_group(sorting)

    coverage = OptionGroup(parser, "Check mask coverage")
    coverage.add_option("--checkmasks", dest="checkmasks", help="check mask coverage",
                        metavar="?u?l?l?l?l?l?d,?l?l?l?l?l?d?d")
    coverage.add_option("--checkmasksfile", dest="checkmasks_file", help="check mask coverage in a file",
                        metavar="masks.hcmask")
    coverage.add_option("--checkmasks", dest="checkmasks",
                        help="check mask coverage", metavar="?u?l?l?l?l?l?d,?l?l?l?l?l?d?d")
    coverage.add_option("--checkmasksfile", dest="checkmasks_file",
                        help="check mask coverage in a file", metavar="masks.hcmask")
    parser.add_option_group(coverage)

    parser.add_option("--showmasks", dest="showmasks", help="Show matching masks", action="store_true", default=False)
    parser.add_option("--showmasks", dest="showmasks",
                      help="Show matching masks", action="store_true", default=False)

    custom = OptionGroup(parser, "Custom character set options")
    custom.add_option("--custom-charset1-len", dest="customcharset1len",
                      type="int", metavar="26", help="Length of custom character set 1")
    custom.add_option("--custom-charset2-len", dest="customcharset2len",
                      type="int", metavar="26", help="Length of custom character set 2")
    custom.add_option("--custom-charset3-len", dest="customcharset3len",
                      type="int", metavar="26", help="Length of custom character set 3")
    custom.add_option("--custom-charset4-len", dest="customcharset4len",
                      type="int", metavar="26", help="Length of custom character set 4")
    parser.add_option_group(custom)

    misc = OptionGroup(parser, "Miscellaneous options")
    misc.add_option("--pps", dest="pps", help="Passwords per Second", type="int", metavar="1000000000")
    misc.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Don't show headers.")
    misc.add_option("--pps", dest="pps", help="Passwords per Second",
                    type="int", metavar="1000000000")
    misc.add_option("-q", "--quiet", action="store_true",
                    dest="quiet", default=False, help="Don't show headers.")
    parser.add_option_group(misc)

    (options, args) = parser.parse_args()

    # Print program header
    if not options.quiet:
        print
        header
        print(header)

    if len(args) < 1:
        parser.error("no masks file specified! Please provide statsgen output.")
        parser.error(
            "no masks file specified! Please provide statsgen output.")
        exit(1)

    print
    "[*] Analyzing masks in [%s]" % args[0]
    print("[*] Analyzing masks in [%s]" % args[0])

    maskgen = MaskGen()

    # Settings
    if options.target_time: maskgen.target_time = options.target_time
    if options.target_time:
        maskgen.target_time = options.target_time
    if options.output_masks:
        print
        "[*] Saving generated masks to [%s]" % options.output_masks
        print("[*] Saving generated masks to [%s]" % options.output_masks)
        maskgen.output_file = open(options.output_masks, 'w')

    # Filters
    if options.minlength: maskgen.minlength = options.minlength
    if options.maxlength: maskgen.maxlength = options.maxlength
    if options.mintime: maskgen.mintime = options.mintime
    if options.maxtime: maskgen.maxtime = options.maxtime
    if options.mincomplexity: maskgen.mincomplexity = options.mincomplexity
    if options.maxcomplexity: maskgen.maxcomplexity = options.maxcomplexity
    if options.minoccurrence: maskgen.minoccurrence = options.minoccurrence
    if options.maxoccurrence: maskgen.maxoccurrence = options.maxoccurrence
    if options.minlength:
        maskgen.minlength = options.minlength
    if options.maxlength:
        maskgen.maxlength = options.maxlength
    if options.mintime:
        maskgen.mintime = options.mintime
    if options.maxtime:
        maskgen.maxtime = options.maxtime
    if options.mincomplexity:
        maskgen.mincomplexity = options.mincomplexity
    if options.maxcomplexity:
        maskgen.maxcomplexity = options.maxcomplexity
    if options.minoccurrence:
        maskgen.minoccurrence = options.minoccurrence
    if options.maxoccurrence:
        maskgen.maxoccurrence = options.maxoccurrence

    # Custom
    if options.customcharset1len:
        maskgen.customcharset1len = options.customcharset1len
    if options.customcharset2len:
        maskgen.customcharset2len = options.customcharset2len
    if options.customcharset3len:
        maskgen.customcharset3len = options.customcharset3len
    if options.customcharset4len:
        maskgen.customcharset4len = options.customcharset4len

    # Misc
    if options.pps: maskgen.pps = options.pps
    if options.showmasks: maskgen.showmasks = options.showmasks
    if options.pps:
        maskgen.pps = options.pps
    if options.showmasks:
        maskgen.showmasks = options.showmasks

    print
    "[*] Using {:,d} keys/sec for calculations.".format(maskgen.pps)
    print("[*] Using {:,d} keys/sec for calculations.".format(maskgen.pps))

    # Load masks
    for arg in args:
@@ -286,15 +341,15 @@ if __name__ == "__main__":
    # Matching masks from the command-line
    if options.checkmasks:
        checkmasks = [m.strip() for m in options.checkmasks.split(',')]
        print
        "[*] Checking coverage of the these masks [%s]" % ", ".join(checkmasks)
        print("[*] Checking coverage of these masks [%s]" %
              ", ".join(checkmasks))
        maskgen.getmaskscoverage(checkmasks)

    # Matching masks from a file
    elif options.checkmasks_file:
        checkmasks_file = open(options.checkmasks_file, 'r')
        print
        "[*] Checking coverage of masks in [%s]" % options.checkmasks_file
        print("[*] Checking coverage of masks in [%s]" %
              options.checkmasks_file)
        maskgen.getmaskscoverage(checkmasks_file)

    # Printing masks in a file
@@ -307,6 +362,5 @@ if __name__ == "__main__":
    else:
        sorting_mode = "optindex"

    print
    "[*] Sorting masks by their [%s]." % sorting_mode
    print("[*] Sorting masks by their [%s]." % sorting_mode)
    maskgen.generate_masks(sorting_mode)

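A note on the arithmetic behind these masks: MaskGen.getcomplexity() multiplies
the keyspace of each "?x" placeholder, and the runtime estimate is simply that
keyspace divided by the configured pps rate. A standalone sketch of the same
calculation, with an illustrative mask and the tool's default rate (an
editorial example, not part of the diff):

    # Keyspace per placeholder, mirroring MaskGen.getcomplexity():
    # ?l/?u = 26, ?d = 10, ?s = 33, ?a = 95, ?b = 256, ?h/?H = 16
    CHARSETS = {"l": 26, "u": 26, "d": 10, "s": 33, "a": 95, "b": 256, "h": 16, "H": 16}

    def keyspace(mask):
        count = 1
        for char in mask[1:].split("?"):
            count *= CHARSETS[char]
        return count

    # "?u?l?l?l?l?d?d" -> 26 * 26**4 * 10**2 = 1,188,137,600 candidates,
    # i.e. roughly 1.2 seconds at the default 1,000,000,000 passwords/second.
    print(keyspace("?u?l?l?l?l?d?d") / 1000000000.0)
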
130
PACK/policygen.py
Normal file → Executable file
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python3
# PolicyGen - Analyze and Generate password masks according to a password policy
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -10,7 +10,9 @@
#
# Please see the attached LICENSE file for additional licensing information.

import sys, string, random
import sys
import string
import random
import datetime
from optparse import OptionParser, OptionGroup
import itertools
@@ -52,7 +54,7 @@ class PolicyGen:
            elif char == "a":
                count *= 95
            else:
                print
                "[!] Error, unknown mask ?%s in a mask %s" % (char, mask)
                print("[!] Error, unknown mask ?%s in a mask %s" % (char, mask))

        return count
@@ -69,8 +71,8 @@ class PolicyGen:
        sample_complexity = 0

        # TODO: Randomize or even statistically arrange matching masks
        for length in xrange(self.minlength, self.maxlength + 1):
            print
            "[*] Generating %d character password masks." % length
        for length in range(self.minlength, self.maxlength + 1):
            print("[*] Generating %d character password masks." % length)
            total_length_count = 0
            sample_length_count = 0
@@ -106,14 +108,14 @@ class PolicyGen:
                # Filter according to password policy
                # NOTE: Perform exact opposite (XOR) operation if noncompliant
                #       flag was set when calling the function.
                if ((self.minlower == None or lowercount >= self.minlower) and \
                    (self.maxlower == None or lowercount <= self.maxlower) and \
                    (self.minupper == None or uppercount >= self.minupper) and \
                    (self.maxupper == None or uppercount <= self.maxupper) and \
                    (self.mindigit == None or digitcount >= self.mindigit) and \
                    (self.maxdigit == None or digitcount <= self.maxdigit) and \
                    (self.minspecial == None or specialcount >= self.minspecial) and \
                    (self.maxspecial == None or specialcount <= self.maxspecial)) ^ noncompliant:
                if ((self.minlower == None or lowercount >= self.minlower) and
                    (self.maxlower == None or lowercount <= self.maxlower) and
                    (self.minupper == None or uppercount >= self.minupper) and
                    (self.maxupper == None or uppercount <= self.maxupper) and
                    (self.mindigit == None or digitcount >= self.mindigit) and
                    (self.maxdigit == None or digitcount <= self.maxdigit) and
                    (self.minspecial == None or specialcount >= self.minspecial) and
                        (self.maxspecial == None or specialcount <= self.maxspecial)) ^ noncompliant:

                    sample_length_count += 1
                    sample_length_complexity += mask_complexity
@@ -122,10 +124,9 @@ class PolicyGen:
                        mask_time = mask_complexity / self.pps
                        time_human = ">1 year" if mask_time > 60 * 60 * 24 * 365 else str(
                            datetime.timedelta(seconds=mask_time))
                        print
                        "[{:>2}] {:<30} [l:{:>2} u:{:>2} d:{:>2} s:{:>2}] [{:>8}] ".format(length, mask, lowercount,
                        print("[{:>2}] {:<30} [l:{:>2} u:{:>2} d:{:>2} s:{:>2}] [{:>8}] ".format(length, mask, lowercount,
                                                                                                 uppercount, digitcount,
                                                                                                 specialcount, time_human)
                                                                                                 specialcount, time_human))

                    if self.output_file:
                        self.output_file.write("%s\n" % mask)
@@ -137,15 +138,14 @@ class PolicyGen:
            sample_complexity += sample_length_complexity

        total_time = total_complexity / self.pps
        total_time_human = ">1 year" if total_time > 60 * 60 * 24 * 365 else str(datetime.timedelta(seconds=total_time))
        print
        "[*] Total Masks: %d Time: %s" % (total_count, total_time_human)
        total_time_human = ">1 year" if total_time > 60 * 60 * 24 * \
            365 else str(datetime.timedelta(seconds=total_time))
        print("[*] Total Masks: %d Time: %s" % (total_count, total_time_human))

        sample_time = sample_complexity / self.pps
        sample_time_human = ">1 year" if sample_time > 60 * 60 * 24 * 365 else str(
            datetime.timedelta(seconds=sample_time))
        print
        "[*] Policy Masks: %d Time: %s" % (sample_count, sample_time_human)
        print("[*] Policy Masks: %d Time: %s" % (sample_count, sample_time_human))


if __name__ == "__main__":
@@ -161,10 +161,14 @@ if __name__ == "__main__":
    header += "\n"

    # parse command line arguments
    parser = OptionParser("%prog [options]\n\nType --help for more options", version="%prog " + VERSION)
    parser.add_option("-o", "--outputmasks", dest="output_masks", help="Save masks to a file", metavar="masks.hcmask")
    parser.add_option("--pps", dest="pps", help="Passwords per Second", type="int", metavar="1000000000")
    parser.add_option("--showmasks", dest="showmasks", help="Show matching masks", action="store_true", default=False)
    parser = OptionParser(
        "%prog [options]\n\nType --help for more options", version="%prog " + VERSION)
    parser.add_option("-o", "--outputmasks", dest="output_masks",
                      help="Save masks to a file", metavar="masks.hcmask")
    parser.add_option("--pps", dest="pps", help="Passwords per Second",
                      type="int", metavar="1000000000")
    parser.add_option("--showmasks", dest="showmasks",
                      help="Show matching masks", action="store_true", default=False)
    parser.add_option("--noncompliant", dest="noncompliant", help="Generate masks for noncompliant passwords",
                      action="store_true", default=False)

@@ -174,14 +178,16 @@ if __name__ == "__main__":
                     help="Minimum password length")
    group.add_option("--maxlength", dest="maxlength", type="int", metavar="8", default=8,
                     help="Maximum password length")
    group.add_option("--mindigit", dest="mindigit", type="int", metavar="1", help="Minimum number of digits")
    group.add_option("--mindigit", dest="mindigit", type="int",
                     metavar="1", help="Minimum number of digits")
    group.add_option("--minlower", dest="minlower", type="int", metavar="1",
                     help="Minimum number of lower-case characters")
    group.add_option("--minupper", dest="minupper", type="int", metavar="1",
                     help="Minimum number of upper-case characters")
    group.add_option("--minspecial", dest="minspecial", type="int", metavar="1",
                     help="Minimum number of special characters")
    group.add_option("--maxdigit", dest="maxdigit", type="int", metavar="3", help="Maximum number of digits")
    group.add_option("--maxdigit", dest="maxdigit", type="int",
                     metavar="3", help="Maximum number of digits")
    group.add_option("--maxlower", dest="maxlower", type="int", metavar="3",
                     help="Maximum number of lower-case characters")
    group.add_option("--maxupper", dest="maxupper", type="int", metavar="3",
@@ -190,54 +196,62 @@ if __name__ == "__main__":
                     help="Maximum number of special characters")
    parser.add_option_group(group)

    parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Don't show headers.")
    parser.add_option("-q", "--quiet", action="store_true",
                      dest="quiet", default=False, help="Don't show headers.")

    (options, args) = parser.parse_args()

    # Print program header
    if not options.quiet:
        print
        header
        print(header)

    policygen = PolicyGen()

    # Settings
    if options.output_masks:
        print
        "[*] Saving generated masks to [%s]" % options.output_masks
        print("[*] Saving generated masks to [%s]" % options.output_masks)
        policygen.output_file = open(options.output_masks, 'w')

    # Password policy
    if options.minlength != None: policygen.minlength = options.minlength
    if options.maxlength != None: policygen.maxlength = options.maxlength
    if options.mindigit != None: policygen.mindigit = options.mindigit
    if options.minlower != None: policygen.minlower = options.minlower
    if options.minupper != None: policygen.minupper = options.minupper
    if options.minspecial != None: policygen.minspecial = options.minspecial
    if options.maxdigit != None: policygen.maxdigits = options.maxdigit
    if options.maxlower != None: policygen.maxlower = options.maxlower
    if options.maxupper != None: policygen.maxupper = options.maxupper
    if options.maxspecial != None: policygen.maxspecial = options.maxspecial
    if options.minlength != None:
        policygen.minlength = options.minlength
    if options.maxlength != None:
        policygen.maxlength = options.maxlength
    if options.mindigit != None:
        policygen.mindigit = options.mindigit
    if options.minlower != None:
        policygen.minlower = options.minlower
    if options.minupper != None:
        policygen.minupper = options.minupper
    if options.minspecial != None:
        policygen.minspecial = options.minspecial
    if options.maxdigit != None:
        policygen.maxdigit = options.maxdigit
    if options.maxlower != None:
        policygen.maxlower = options.maxlower
    if options.maxupper != None:
        policygen.maxupper = options.maxupper
    if options.maxspecial != None:
        policygen.maxspecial = options.maxspecial

    # Misc
    if options.pps: policygen.pps = options.pps
    if options.showmasks: policygen.showmasks = options.showmasks
    if options.pps:
        policygen.pps = options.pps
    if options.showmasks:
        policygen.showmasks = options.showmasks

    print
    "[*] Using {:,d} keys/sec for calculations.".format(policygen.pps)
    print("[*] Using {:,d} keys/sec for calculations.".format(policygen.pps))

    # Print current password policy
    print
    "[*] Password policy:"
    print
    " Pass Lengths: min:%d max:%d" % (policygen.minlength, policygen.maxlength)
    print
    " Min strength: l:%s u:%s d:%s s:%s" % (
        policygen.minlower, policygen.minupper, policygen.mindigit, policygen.minspecial)
    print
    " Max strength: l:%s u:%s d:%s s:%s" % (
        policygen.maxlower, policygen.maxupper, policygen.maxdigit, policygen.maxspecial)
    print("[*] Password policy:")
    print(" Pass Lengths: min:%d max:%d" % (
        policygen.minlength, policygen.maxlength))
    print(" Min strength: l:%s u:%s d:%s s:%s" % (
        policygen.minlower, policygen.minupper, policygen.mindigit, policygen.minspecial))
    print(" Max strength: l:%s u:%s d:%s s:%s" % (
        policygen.maxlower, policygen.maxupper, policygen.maxdigit, policygen.maxspecial))

    print
    "[*] Generating [%s] masks." % ("compliant" if not options.noncompliant else "non-compliant")
    print("[*] Generating [%s] masks." % (
        "compliant" if not options.noncompliant else "non-compliant"))
    policygen.generate_masks(options.noncompliant)

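To make the compliance test in the hunk above concrete: a mask is kept when
every min/max bound that is set holds for its per-class character counts, and
the trailing "^ noncompliant" inverts the whole predicate when non-compliant
masks were requested. A standalone sketch of that logic (bounds and counts are
illustrative, not taken from the diff):

    def complies(lowercount, uppercount, digitcount, specialcount,
                 minlower=1, minupper=1, mindigit=1, minspecial=None,
                 noncompliant=False):
        ok = ((minlower is None or lowercount >= minlower) and
              (minupper is None or uppercount >= minupper) and
              (mindigit is None or digitcount >= mindigit) and
              (minspecial is None or specialcount >= minspecial))
        return ok ^ noncompliant

    # "?u?l?l?l?l?d?d?s" has u=1, l=4, d=2, s=1:
    print(complies(4, 1, 2, 1))                      # True  (policy-compliant)
    print(complies(4, 1, 2, 1, noncompliant=True))   # False (non-compliant run)
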
1191
PACK/rulegen.py
File diff suppressed because it is too large
239
PACK/statsgen.py
Normal file → Executable file
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# StatsGen - Password Statistical Analysis tool
#
# This tool is part of PACK (Password Analysis and Cracking Kit)
@@ -11,7 +11,9 @@
# Please see the attached LICENSE file for additional licensing information.

import sys
import re, operator, string
import re
import operator
import string
from optparse import OptionParser, OptionGroup
import time

@@ -73,26 +75,30 @@ class StatsGen:
            if letter in string.digits:
                digit += 1
                advancedmask_string += "?d"
                if not simplemask or not simplemask[-1] == 'digit': simplemask.append('digit')
                if not simplemask or not simplemask[-1] == 'digit':
                    simplemask.append('digit')

            elif letter in string.lowercase:
            elif letter in string.ascii_lowercase:
                lower += 1
                advancedmask_string += "?l"
                if not simplemask or not simplemask[-1] == 'string': simplemask.append('string')
                if not simplemask or not simplemask[-1] == 'string':
                    simplemask.append('string')


            elif letter in string.uppercase:
            elif letter in string.ascii_uppercase:
                upper += 1
                advancedmask_string += "?u"
                if not simplemask or not simplemask[-1] == 'string': simplemask.append('string')
                if not simplemask or not simplemask[-1] == 'string':
                    simplemask.append('string')

            else:
                special += 1
                advancedmask_string += "?s"
                if not simplemask or not simplemask[-1] == 'special': simplemask.append('special')
                if not simplemask or not simplemask[-1] == 'special':
                    simplemask.append('special')

        # String representation of masks
        simplemask_string = ''.join(simplemask) if len(simplemask) <= 3 else 'othermask'
        simplemask_string = ''.join(simplemask) if len(
            simplemask) <= 3 else 'othermask'

        # Policy
        policy = (digit, lower, upper, special)
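The classification above, now ported to string.ascii_lowercase and
string.ascii_uppercase for Python 3, gives each character one "?d", "?l", "?u"
or "?s" in the advanced mask, while the simple mask only records runs of the
same class. A condensed sketch of the advanced-mask half (the sample password
is illustrative):

    import string

    def advanced_mask(password):
        out = ""
        for letter in password:
            if letter in string.digits:
                out += "?d"
            elif letter in string.ascii_lowercase:
                out += "?l"
            elif letter in string.ascii_uppercase:
                out += "?u"
            else:
                out += "?s"
        return out

    # "Password1!" -> "?u?l?l?l?l?l?l?l?d?s"; its simple mask would be
    # "stringdigitspecial" (runs: string, digit, special).
    print(advanced_mask("Password1!"))
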
@@ -136,106 +142,109 @@ class StatsGen:
     def generate_stats(self, filename):
         """ Generate password statistics. """

-        f = open(filename, 'r')
+        with open(filename, 'r') as f:

-        for password in f:
-            password = password.rstrip('\r\n')
+            for password in f:
+                password = password.rstrip('\r\n')

-            if len(password) == 0: continue
+                if len(password) == 0:
+                    continue

-            self.total_counter += 1
+                self.total_counter += 1

-            (pass_length, characterset, simplemask, advancedmask, policy) = self.analyze_password(password)
-            (digit, lower, upper, special) = policy
+                (pass_length, characterset, simplemask, advancedmask,
+                 policy) = self.analyze_password(password)
+                (digit, lower, upper, special) = policy

-            if (self.charsets == None or characterset in self.charsets) and \
-                    (self.simplemasks == None or simplemask in self.simplemasks) and \
-                    (self.maxlength == None or pass_length <= self.maxlength) and \
-                    (self.minlength == None or pass_length >= self.minlength):
+                if (self.charsets == None or characterset in self.charsets) and \
+                   (self.simplemasks == None or simplemask in self.simplemasks) and \
+                   (self.maxlength == None or pass_length <= self.maxlength) and \
+                   (self.minlength == None or pass_length >= self.minlength):

-                self.filter_counter += 1
+                    self.filter_counter += 1

-                if self.mindigit == None or digit < self.mindigit: self.mindigit = digit
-                if self.maxdigit == None or digit > self.maxdigit: self.maxdigit = digit
+                    if self.mindigit == None or digit < self.mindigit:
+                        self.mindigit = digit
+                    if self.maxdigit == None or digit > self.maxdigit:
+                        self.maxdigit = digit

-                if self.minupper == None or upper < self.minupper: self.minupper = upper
-                if self.maxupper == None or upper > self.maxupper: self.maxupper = upper
+                    if self.minupper == None or upper < self.minupper:
+                        self.minupper = upper
+                    if self.maxupper == None or upper > self.maxupper:
+                        self.maxupper = upper

-                if self.minlower == None or lower < self.minlower: self.minlower = lower
-                if self.maxlower == None or lower > self.maxlower: self.maxlower = lower
+                    if self.minlower == None or lower < self.minlower:
+                        self.minlower = lower
+                    if self.maxlower == None or lower > self.maxlower:
+                        self.maxlower = lower

-                if self.minspecial == None or special < self.minspecial: self.minspecial = special
-                if self.maxspecial == None or special > self.maxspecial: self.maxspecial = special
+                    if self.minspecial == None or special < self.minspecial:
+                        self.minspecial = special
+                    if self.maxspecial == None or special > self.maxspecial:
+                        self.maxspecial = special

-                if pass_length in self.stats_length:
-                    self.stats_length[pass_length] += 1
-                else:
-                    self.stats_length[pass_length] = 1
+                    if pass_length in self.stats_length:
+                        self.stats_length[pass_length] += 1
+                    else:
+                        self.stats_length[pass_length] = 1

-                if characterset in self.stats_charactersets:
-                    self.stats_charactersets[characterset] += 1
-                else:
-                    self.stats_charactersets[characterset] = 1
+                    if characterset in self.stats_charactersets:
+                        self.stats_charactersets[characterset] += 1
+                    else:
+                        self.stats_charactersets[characterset] = 1

-                if simplemask in self.stats_simplemasks:
-                    self.stats_simplemasks[simplemask] += 1
-                else:
-                    self.stats_simplemasks[simplemask] = 1
+                    if simplemask in self.stats_simplemasks:
+                        self.stats_simplemasks[simplemask] += 1
+                    else:
+                        self.stats_simplemasks[simplemask] = 1

-                if advancedmask in self.stats_advancedmasks:
-                    self.stats_advancedmasks[advancedmask] += 1
-                else:
-                    self.stats_advancedmasks[advancedmask] = 1
-
-        f.close()
+                    if advancedmask in self.stats_advancedmasks:
+                        self.stats_advancedmasks[advancedmask] += 1
+                    else:
+                        self.stats_advancedmasks[advancedmask] = 1

     def print_stats(self):
         """ Print password statistics. """

-        print
-        "[+] Analyzing %d%% (%d/%d) of passwords" % (
-            self.filter_counter * 100 / self.total_counter, self.filter_counter, self.total_counter)
-        print
-        " NOTE: Statistics below is relative to the number of analyzed passwords, not total number of passwords"
-        print
-        "\n[*] Length:"
-        for (length, count) in sorted(self.stats_length.iteritems(), key=operator.itemgetter(1), reverse=True):
-            if self.hiderare and not count * 100 / self.filter_counter > 0: continue
-            print
-            "[+] %25d: %02d%% (%d)" % (length, count * 100 / self.filter_counter, count)
+        print("[+] Analyzing %d%% (%d/%d) of passwords" % (self.filter_counter *
+              100 // self.total_counter, self.filter_counter, self.total_counter))
+        print(" NOTE: Statistics below are relative to the number of analyzed passwords, not the total number of passwords")
+        print("\n[*] Length:")
+        for (length, count) in sorted(self.stats_length.items(), key=operator.itemgetter(1), reverse=True):
+            if self.hiderare and not count * 100 // self.filter_counter > 0:
+                continue
+            print("[+] %25d: %02d%% (%d)" %
+                  (length, count * 100 // self.filter_counter, count))

-        print
-        "\n[*] Character-set:"
-        for (char, count) in sorted(self.stats_charactersets.iteritems(), key=operator.itemgetter(1), reverse=True):
-            if self.hiderare and not count * 100 / self.filter_counter > 0: continue
-            print
-            "[+] %25s: %02d%% (%d)" % (char, count * 100 / self.filter_counter, count)
+        print("\n[*] Character-set:")
+        for (char, count) in sorted(self.stats_charactersets.items(), key=operator.itemgetter(1), reverse=True):
+            if self.hiderare and not count * 100 // self.filter_counter > 0:
+                continue
+            print("[+] %25s: %02d%% (%d)" %
+                  (char, count * 100 // self.filter_counter, count))

-        print
-        "\n[*] Password complexity:"
-        print
-        "[+] digit: min(%s) max(%s)" % (self.mindigit, self.maxdigit)
-        print
-        "[+] lower: min(%s) max(%s)" % (self.minlower, self.maxlower)
-        print
-        "[+] upper: min(%s) max(%s)" % (self.minupper, self.maxupper)
-        print
-        "[+] special: min(%s) max(%s)" % (self.minspecial, self.maxspecial)
+        print("\n[*] Password complexity:")
+        print("[+] digit: min(%s) max(%s)" %
+              (self.mindigit, self.maxdigit))
+        print("[+] lower: min(%s) max(%s)" %
+              (self.minlower, self.maxlower))
+        print("[+] upper: min(%s) max(%s)" %
+              (self.minupper, self.maxupper))
+        print("[+] special: min(%s) max(%s)" %
+              (self.minspecial, self.maxspecial))

-        print
-        "\n[*] Simple Masks:"
-        for (simplemask, count) in sorted(self.stats_simplemasks.iteritems(), key=operator.itemgetter(1), reverse=True):
-            if self.hiderare and not count * 100 / self.filter_counter > 0: continue
-            print
-            "[+] %25s: %02d%% (%d)" % (simplemask, count * 100 / self.filter_counter, count)
+        print("\n[*] Simple Masks:")
+        for (simplemask, count) in sorted(self.stats_simplemasks.items(), key=operator.itemgetter(1), reverse=True):
+            if self.hiderare and not count * 100 // self.filter_counter > 0:
+                continue
+            print("[+] %25s: %02d%% (%d)" %
+                  (simplemask, count * 100 // self.filter_counter, count))

-        print
-        "\n[*] Advanced Masks:"
-        for (advancedmask, count) in sorted(self.stats_advancedmasks.iteritems(), key=operator.itemgetter(1),
-                                            reverse=True):
-            if count * 100 / self.filter_counter > 0:
-                print
-                "[+] %25s: %02d%% (%d)" % (advancedmask, count * 100 / self.filter_counter, count)
+        print("\n[*] Advanced Masks:")
+        for (advancedmask, count) in sorted(self.stats_advancedmasks.items(), key=operator.itemgetter(1), reverse=True):
+            if count * 100 // self.filter_counter > 0:
+                print("[+] %25s: %02d%% (%d)" %
+                      (advancedmask, count * 100 // self.filter_counter, count))

             if self.output_file:
                 self.output_file.write("%s,%d\n" % (advancedmask, count))
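The hunk above carries the bulk of the port: open()/f.close() becomes a with-block that closes the file automatically, dict.iteritems() becomes dict.items(), print statements become print() calls, and the one-line if statements are expanded. One subtlety is that / is true division in Python 3, so the percentage math needs floor division (//) to keep its old integer behavior (otherwise --hiderare would never hide anything, since any nonzero count divided by the total is > 0). A minimal standalone sketch of the same idioms, assuming a hypothetical tally_lengths() helper and an example wordlist path (illustrative only, not part of the commit):

    import operator

    def tally_lengths(path):
        """Count password lengths in a wordlist, Python 3 style."""
        counts = {}
        total = 0
        with open(path, 'r') as f:  # closed automatically, no f.close() needed
            for line in f:
                word = line.rstrip('\r\n')
                if len(word) == 0:
                    continue
                total += 1
                counts[len(word)] = counts.get(len(word), 0) + 1
        if total == 0:
            return
        # dict.items() replaces Python 2's dict.iteritems()
        for length, count in sorted(counts.items(), key=operator.itemgetter(1), reverse=True):
            # floor division keeps the Python 2 integer-percentage behavior
            print("[+] %25d: %02d%% (%d)" % (length, count * 100 // total, count))

    tally_lengths('wordlist.txt')  # example path, not a file in this repo
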
@@ -253,49 +262,57 @@ if __name__ == "__main__":
     header += " |_| iphelix@thesprawl.org\n"
     header += "\n"

-    parser = OptionParser("%prog [options] passwords.txt\n\nType --help for more options", version="%prog " + VERSION)
+    parser = OptionParser(
+        "%prog [options] passwords.txt\n\nType --help for more options", version="%prog "+VERSION)

     filters = OptionGroup(parser, "Password Filters")
-    filters.add_option("--minlength", dest="minlength", type="int", metavar="8", help="Minimum password length")
-    filters.add_option("--maxlength", dest="maxlength", type="int", metavar="8", help="Maximum password length")
-    filters.add_option("--charset", dest="charsets", help="Password charset filter (comma separated)",
-                       metavar="loweralpha,numeric")
-    filters.add_option("--simplemask", dest="simplemasks", help="Password mask filter (comma separated)",
-                       metavar="stringdigit,allspecial")
+    filters.add_option("--minlength", dest="minlength",
+                       type="int", metavar="8", help="Minimum password length")
+    filters.add_option("--maxlength", dest="maxlength",
+                       type="int", metavar="8", help="Maximum password length")
+    filters.add_option("--charset", dest="charsets",
+                       help="Password charset filter (comma separated)", metavar="loweralpha,numeric")
+    filters.add_option("--simplemask", dest="simplemasks",
+                       help="Password mask filter (comma separated)", metavar="stringdigit,allspecial")
     parser.add_option_group(filters)

-    parser.add_option("-o", "--output", dest="output_file", help="Save masks and stats to a file",
-                      metavar="password.masks")
-    parser.add_option("--hiderare", action="store_true", dest="hiderare", default=False,
-                      help="Hide statistics covering less than 1% of the sample")
+    parser.add_option("-o", "--output", dest="output_file",
+                      help="Save masks and stats to a file", metavar="password.masks")
+    parser.add_option("--hiderare", action="store_true", dest="hiderare",
+                      default=False, help="Hide statistics covering less than 1% of the sample")

-    parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Don't show headers.")
+    parser.add_option("-q", "--quiet", action="store_true",
+                      dest="quiet", default=False, help="Don't show headers.")
     (options, args) = parser.parse_args()

     # Print program header
     if not options.quiet:
-        print
-        header
+        print(header)

     if len(args) != 1:
         parser.error("no passwords file specified")
         exit(1)

-    print
-    "[*] Analyzing passwords in [%s]" % args[0]
+    print("[*] Analyzing passwords in [%s]" % args[0])

     statsgen = StatsGen()

-    if not options.minlength == None: statsgen.minlength = options.minlength
-    if not options.maxlength == None: statsgen.maxlength = options.maxlength
-    if not options.charsets == None: statsgen.charsets = [x.strip() for x in options.charsets.split(',')]
-    if not options.simplemasks == None: statsgen.simplemasks = [x.strip() for x in options.simplemasks.split(',')]
+    if not options.minlength == None:
+        statsgen.minlength = options.minlength
+    if not options.maxlength == None:
+        statsgen.maxlength = options.maxlength
+    if not options.charsets == None:
+        statsgen.charsets = [x.strip() for x in options.charsets.split(',')]
+    if not options.simplemasks == None:
+        statsgen.simplemasks = [x.strip()
+                                for x in options.simplemasks.split(',')]

-    if options.hiderare: statsgen.hiderare = options.hiderare
+    if options.hiderare:
+        statsgen.hiderare = options.hiderare

     if options.output_file:
-        print
-        "[*] Saving advanced masks and occurrences to [%s]" % options.output_file
+        print("[*] Saving advanced masks and occurrences to [%s]" %
+              options.output_file)
         statsgen.output_file = open(options.output_file, 'w')

     statsgen.generate_stats(args[0])
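For readers unfamiliar with optparse (long since superseded by argparse, but still what this script uses), a small self-contained sketch of how the flags defined above parse; the option subset and argument values here are hypothetical:

    from optparse import OptionParser, OptionGroup

    parser = OptionParser("%prog [options] passwords.txt")
    filters = OptionGroup(parser, "Password Filters")
    filters.add_option("--minlength", dest="minlength", type="int")
    filters.add_option("--maxlength", dest="maxlength", type="int")
    parser.add_option_group(filters)
    parser.add_option("--hiderare", action="store_true", dest="hiderare", default=False)

    # Parse a hypothetical command line instead of sys.argv
    (options, args) = parser.parse_args(["--minlength", "8", "rockyou.txt"])
    print(options.minlength, options.hiderare, args)  # -> 8 False ['rockyou.txt']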