Mirror of https://github.com/morpheus65535/bazarr.git
commit c5fa0f56e4 (parent 645952c61a)

WIP
libs/backports/configparser2/__init__.py  (new file, 1340 lines)
File diff suppressed because it is too large.
libs/backports/configparser2/helpers.py  (new file, 171 lines)

@@ -0,0 +1,171 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import MutableMapping
try:
    from collections import UserDict
except ImportError:
    from UserDict import UserDict

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

from io import open
import sys
try:
    from thread import get_ident
except ImportError:
    try:
        from _thread import get_ident
    except ImportError:
        from _dummy_thread import get_ident


PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

str = type('str')


def from_none(exc):
    """raise from_none(ValueError('a')) == raise ValueError('a') from None"""
    exc.__cause__ = None
    exc.__suppress_context__ = True
    return exc


# from reprlib 3.2.1
def recursive_repr(fillvalue='...'):
    'Decorator to make a repr function return fillvalue for a recursive call'

    def decorating_function(user_function):
        repr_running = set()

        def wrapper(self):
            key = id(self), get_ident()
            if key in repr_running:
                return fillvalue
            repr_running.add(key)
            try:
                result = user_function(self)
            finally:
                repr_running.discard(key)
            return result

        # Can't use functools.wraps() here because of bootstrap issues
        wrapper.__module__ = getattr(user_function, '__module__')
        wrapper.__doc__ = getattr(user_function, '__doc__')
        wrapper.__name__ = getattr(user_function, '__name__')
        wrapper.__annotations__ = getattr(user_function, '__annotations__', {})
        return wrapper

    return decorating_function


# from collections 3.2.1
class _ChainMap(MutableMapping):
    ''' A ChainMap groups multiple dicts (or other mappings) together
    to create a single, updateable view.

    The underlying mappings are stored in a list. That list is public and can
    be accessed or updated using the *maps* attribute. There is no other state.

    Lookups search the underlying mappings successively until a key is found.
    In contrast, writes, updates, and deletions only operate on the first
    mapping.

    '''

    def __init__(self, *maps):
        '''Initialize a ChainMap by setting *maps* to the given mappings.
        If no mappings are provided, a single empty dictionary is used.

        '''
        self.maps = list(maps) or [{}]  # always at least one map

    def __missing__(self, key):
        raise KeyError(key)

    def __getitem__(self, key):
        for mapping in self.maps:
            try:
                return mapping[key]  # can't use 'key in mapping' with defaultdict
            except KeyError:
                pass
        return self.__missing__(key)  # support subclasses that define __missing__

    def get(self, key, default=None):
        return self[key] if key in self else default

    def __len__(self):
        return len(set().union(*self.maps))  # reuses stored hash values if possible

    def __iter__(self):
        return iter(set().union(*self.maps))

    def __contains__(self, key):
        return any(key in m for m in self.maps)

    @recursive_repr()
    def __repr__(self):
        return '{0.__class__.__name__}({1})'.format(
            self, ', '.join(map(repr, self.maps)))

    @classmethod
    def fromkeys(cls, iterable, *args):
        'Create a ChainMap with a single dict created from the iterable.'
        return cls(dict.fromkeys(iterable, *args))

    def copy(self):
        'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]'
        return self.__class__(self.maps[0].copy(), *self.maps[1:])

    __copy__ = copy

    def new_child(self):  # like Django's Context.push()
        'New ChainMap with a new dict followed by all previous maps.'
        return self.__class__({}, *self.maps)

    @property
    def parents(self):  # like Django's Context.pop()
        'New ChainMap from maps[1:].'
        return self.__class__(*self.maps[1:])

    def __setitem__(self, key, value):
        self.maps[0][key] = value

    def __delitem__(self, key):
        try:
            del self.maps[0][key]
        except KeyError:
            raise KeyError('Key not found in the first mapping: {!r}'.format(key))

    def popitem(self):
        'Remove and return an item pair from maps[0]. Raise KeyError if maps[0] is empty.'
        try:
            return self.maps[0].popitem()
        except KeyError:
            raise KeyError('No keys found in the first mapping.')

    def pop(self, key, *args):
        'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].'
        try:
            return self.maps[0].pop(key, *args)
        except KeyError:
            raise KeyError('Key not found in the first mapping: {!r}'.format(key))

    def clear(self):
        'Clear maps[0], leaving maps[1:] intact.'
        self.maps[0].clear()


try:
    from collections import ChainMap
except ImportError:
    ChainMap = _ChainMap
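As a quick illustration of the backport above (not part of this commit): the layered-lookup behaviour is the same whether ChainMap comes from the stdlib or from _ChainMap. The import path below is an assumption, valid only if this libs/ tree is importable as backports.configparser2.

 >>> from backports.configparser2.helpers import ChainMap
 >>> overrides = {'verbose': 'yes'}
 >>> defaults = {'verbose': 'no', 'timeout': '30'}
 >>> settings = ChainMap(overrides, defaults)
 >>> settings['verbose']          # found in the first mapping
 'yes'
 >>> settings['timeout']          # falls through to the second mapping
 '30'
 >>> settings['retries'] = '3'    # writes only ever touch the first mapping
 >>> 'retries' in defaults
 False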
libs/bs4/formatter.py  (new file, 99 lines)

@@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution

class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    HTML = 'html'
    XML = 'xml'

    HTML_DEFAULTS = dict(
        cdata_containing_tags=set(["script", "style"]),
    )

    def _default(self, language, value, kwarg):
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """

        :param void_element_close_prefix: By default, represent void
        elements as <tag/> rather than <tag>
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution."""
        if not self.entity_substitution:
            return ns
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute."""
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want."""
        return sorted(tag.attrs.items())


class HTMLFormatter(Formatter):
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)


class XMLFormatter(Formatter):
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)


# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = Formatter(
    Formatter.XML, entity_substitution=None
)
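A minimal sketch of the visible difference between these registry entries when serializing (not part of this commit; assumes a working bs4 install from this tree, under Python 2 as elsewhere in this diff):

 >>> from bs4 import BeautifulSoup
 >>> soup = BeautifulSoup("<p>AT&T<br></p>", "html.parser")
 >>> soup.p.encode(formatter="minimal")    # substitute_xml escapes only &, <, >
 '<p>AT&amp;T<br/></p>'
 >>> soup.p.encode(formatter=None)         # entity_substitution=None: text passes through
 '<p>AT&T<br/></p>'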
libs/bs42.7/AUTHORS.txt  (new file, 43 lines)

@@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================

Leonard Richardson is the primary programmer.

Aaron DeVore is awesome.

Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.

Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.

Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.

Sam Ruby helped with a lot of edge cases.

Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
work in solving the nestable tags conundrum.

An incomplete list of people who have contributed patches to Beautiful
Soup:

 Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
 Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
 Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
 Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
 Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
 Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
 Webster, Paul Wright, Danny Yoo

An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:

 Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
 Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
 Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
 warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
 Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
 Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
 Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
 Sousa Rocha, Yichun Wei, Per Vognsen
libs/bs42.7/COPYING.txt  (new file, 27 lines)

@@ -0,0 +1,27 @@
Beautiful Soup is made available under the MIT license:

 Copyright (c) 2004-2015 Leonard Richardson

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.

Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license. Copyright (c) 2006-2013
James Graham and other contributors
libs/bs42.7/NEWS.txt  (new file, 1190 lines)
File diff suppressed because it is too large.
libs/bs42.7/README.txt  (new file, 63 lines)

@@ -0,0 +1,63 @@
= Introduction =

 >>> from bs4 import BeautifulSoup
 >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
 >>> print soup.prettify()
 <html>
  <body>
   <p>
    Some
    <b>
     bad
     <i>
      HTML
     </i>
    </b>
   </p>
  </body>
 </html>
 >>> soup.find(text="bad")
 u'bad'

 >>> soup.i
 <i>HTML</i>

 >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
 >>> print soup.prettify()
 <?xml version="1.0" encoding="utf-8"?>
 <tag1>
  Some
  <tag2/>
  bad
  <tag3>
   XML
  </tag3>
 </tag1>

= Full documentation =

The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.

= Running the unit tests =

Beautiful Soup supports unit test discovery from the project root directory:

 $ nosetests

 $ python -m unittest discover -s bs4 # Python 2.7 and up

If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.

= Links =

Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
  http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
libs/bs42.7/TODO.txt  (new file, 31 lines)

@@ -0,0 +1,31 @@
Additions
---------

More of the jQuery API: nextUntil?

Optimizations
-------------

The html5lib tree builder doesn't use the standard tree-building API,
which worries me and has resulted in a number of bugs.

markup_attr_map can be optimized since it's always a map now.

Upon encountering UTF-16LE data or some other uncommon serialization
of Unicode, UnicodeDammit will convert the data to Unicode, then
encode it as UTF-8. This is wasteful because it will just get decoded
back to Unicode.

CDATA
-----

The elementtree XMLParser has a strip_cdata argument that, when set to
False, should allow Beautiful Soup to preserve CDATA sections instead
of treating them as text. Except it doesn't. (This argument is also
present for HTMLParser, and also does nothing there.)

Currently, html5lib converts CDATA sections into comments. An
as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.
libs/bs42.7/__init__.py  (new file, 529 lines)

@@ -0,0 +1,529 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/

"""

# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import traceback
import warnings

from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP)\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, unicode):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                caller = traceback.extract_stack()[0]
                filename = caller[0]
                line_number = caller[1]
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and not b'<' in markup)
                or (isinstance(markup, unicode) and not u'<' in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception, e:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                if isinstance(markup, unicode):
                    markup = markup.encode("utf8")
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            d['builder'] = None
        return d

    @staticmethod
    def _check_markup_is_url(markup):
        """
        Check if markup looks like it's actually a url and raise a warning
        if so. Markup can be unicode or str (py2) / bytes (py3).
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, unicode):
            space = u' '
            cant_start_with = (u"http:", u"https:")
        else:
            return

        if any(markup.startswith(prefix) for prefix in cant_start_with):
            if not space in markup:
                if isinstance(markup, bytes):
                    decoded_markup = markup.decode('utf-8', 'replace')
                else:
                    decoded_markup = markup
                warnings.warn(
                    '"%s" looks like a URL. Beautiful Soup is not an'
                    ' HTTP client. You should probably use an HTTP client like'
                    ' requests to get the document behind the URL, and feed'
                    ' that document to Beautiful Soup.' % decoded_markup
                )

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = True
                for i in current_data:
                    if i not in self.ASCII_SPACES:
                        strippable = False
                        break
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = len(parent.contents)-1
            while index >= 0:
                if parent.contents[index] is o:
                    break
                index -= 1
            else:
                raise ValueError(
                    "Error building tree: supposedly %r was inserted "
                    "into %r after the fact, but I don't see it!" % (
                        o, parent
                    )
                )
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass

class FeatureNotFound(ValueError):
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
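The NO_PARSER_SPECIFIED_WARNING above goes away as soon as a parser is named explicitly, which also pins behaviour across systems. A minimal sketch (hypothetical markup; html.parser ships with the standard library):

 >>> from bs4 import BeautifulSoup
 >>> soup = BeautifulSoup("<p>Hello</p>")                 # triggers the warning
 >>> soup = BeautifulSoup("<p>Hello</p>", "html.parser")  # explicit parser, no warning
 >>> soup.p.string
 u'Hello'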
libs/bs42.7/builder/__init__.py  (new file, 333 lines)

@@ -0,0 +1,333 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    HTMLAwareEntitySubstitution,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):

    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        features = list(features)
        features.reverse()
        candidates = None
        candidate_set = None
        while len(features) > 0:
            feature = features.pop()
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(
                        set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        self.soup = None

    def reset(self):
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in attrs.keys():
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

        # These are from HTML4, removed in HTML5.
        'spacer', 'frame'
    ])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)

def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

class ParserRejectedMarkup(Exception):
    pass

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
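Because of the cdata_list_attributes table above, multi-valued HTML attributes are parsed into lists while single-valued ones stay strings. A short sketch (hypothetical markup, using the stdlib html.parser builder registered at the bottom of this file):

 >>> from bs4 import BeautifulSoup
 >>> soup = BeautifulSoup('<td class="a b" headers="h1 h2" id="cell">x</td>', 'html.parser')
 >>> soup.td['class']     # '*' entry: class is always split
 [u'a', u'b']
 >>> soup.td['headers']   # 'td' entry in the table
 [u'h1', u'h2']
 >>> soup.td['id']        # not listed: left as a single value
 u'cell'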
libs/bs42.7/builder/_html5lib.py  (new file, 426 lines; diff truncated below)

@@ -0,0 +1,426 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
import re
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import (
    NamespacedAttribute,
    whitespace_re,
)
import html5lib
from html5lib.constants import (
    namespaces,
    prefixes,
    )
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError, e:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        extra_kwargs = dict()
        if not isinstance(markup, unicode):
            if new_html5lib:
                extra_kwargs['override_encoding'] = self.user_specified_encoding
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(original_encoding, basestring):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. We want to use a string for compatibility
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

    def __init__(self, namespaceHTMLElements, soup=None):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = HTML5TreeBuilder.cdata_list_attributes
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
    def keys(self):
        return list(self.attrs.keys())
    def __len__(self):
        return len(self.attrs)
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in list(self.attrs.keys())


class Element(treebuilder_base.Node):
    def __init__(self, element, soup, namespace):
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, basestring) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):

        if attributes is not None and len(attributes) > 0:

            converted_attributes = []
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
|
||||
"""Move all of this tag's children into another tag."""
|
||||
# print "MOVE", self.element.contents
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent.element
|
||||
|
||||
element = self.element
|
||||
new_parent_element = new_parent.element
|
||||
# Determine what this tag's next_element will be once all the children
|
||||
# are removed.
|
||||
final_next_element = element.next_sibling
|
||||
|
||||
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
|
||||
if len(new_parent_element.contents) > 0:
|
||||
# The new parent already contains children. We will be
|
||||
# appending this tag's children to the end.
|
||||
new_parents_last_child = new_parent_element.contents[-1]
|
||||
new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
|
||||
else:
|
||||
# The new parent contains no children.
|
||||
new_parents_last_child = None
|
||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||
|
||||
to_append = element.contents
|
||||
if len(to_append) > 0:
|
||||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
if new_parents_last_descendant:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Find the very last element being moved. It is now the
|
||||
# parent's last descendant. It has no .next_sibling and
|
||||
# its .next_element is whatever the previous last
|
||||
# descendant had.
|
||||
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||
|
||||
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||
if new_parents_last_descendant_next_element:
|
||||
# TODO: This code has no test coverage and I'm not sure
|
||||
# how to get html5lib to go through this path, but it's
|
||||
# just the other side of the previous line.
|
||||
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||
last_childs_last_descendant.next_sibling = None
|
||||
|
||||
for child in to_append:
|
||||
child.parent = new_parent_element
|
||||
new_parent_element.contents.append(child)
|
||||
|
||||
# Now that this element has no children, change its .next_element.
|
||||
element.contents = []
|
||||
element.next_element = final_next_element
|
||||
|
||||
# print "DONE WITH MOVE"
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent_element
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key, value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace is None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
treebuilder_base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
314
libs/bs42.7/builder/_htmlparser.py
Normal file
|
@@ -0,0 +1,314 @@
|
|||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
try:
|
||||
from HTMLParser import HTMLParseError
|
||||
except ImportError, e:
|
||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||
class HTMLParseError(Exception):
|
||||
pass
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
# Keep a list of empty-element tags that were encountered
|
||||
# without an explicit closing tag. If we encounter a closing tag
|
||||
# of this type, we'll associate it with one of those entries.
|
||||
#
|
||||
# This isn't a stack because we don't care about the
|
||||
# order. It's a list of closing tags we've already handled and
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
# This is only called when the markup looks like
|
||||
# <tag/>.
|
||||
|
||||
# Passing handle_empty_element=False tells handle_starttag not to close the tag
|
||||
# just because its name matches a known empty-element tag. We
|
||||
# know that this is an empty-element tag and we want to call
|
||||
# handle_endtag ourselves.
|
||||
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||
self.handle_endtag(name)
|
||||
|
||||
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||
# XXX namespace
|
||||
attr_dict = {}
|
||||
for key, value in attrs:
|
||||
# Change None attribute values to the empty string
|
||||
# for consistency with the other tree builders.
|
||||
if value is None:
|
||||
value = ''
|
||||
attr_dict[key] = value
|
||||
attrvalue = '""'
|
||||
#print "START", name
|
||||
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
if tag and tag.is_empty_element and handle_empty_element:
|
||||
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||
# events for empty-element tags. (It's handled in
|
||||
# handle_startendtag, but only if the original markup looked like
|
||||
# <tag/>.)
|
||||
#
|
||||
# So we need to call handle_endtag() ourselves. Since we
|
||||
# know the start event is identical to the end event, we
|
||||
# don't want handle_endtag() to cross off any previous end
|
||||
# events for tags of this name.
|
||||
self.handle_endtag(name, check_already_closed=False)
|
||||
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
#print "END", name
|
||||
if check_already_closed and name in self.already_closed_empty_element:
|
||||
# This is a redundant end tag for an empty-element tag.
|
||||
# We've already called handle_endtag() for it, so just
|
||||
# check it off the list.
|
||||
# print "ALREADY CLOSED", name
|
||||
self.already_closed_empty_element.remove(name)
|
||||
else:
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed in all supported versions.
|
||||
# http://bugs.python.org/issue13633
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
real_name = int(name.lstrip('X'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
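# Worked example of the workaround above: for markup containing "&#x41;",
# HTMLParser calls handle_charref('x41'); int('41', 16) == 65 and
# unichr(65) == u'A', so u'A' is fed to the soup as ordinary data.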
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
self.handle_data(data)
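# e.g. handle_entityref('amp') feeds u'&' to the soup, while an
# unrecognized reference such as 'foo' passes through literally as '&foo;'.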
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
elif data == 'DOCTYPE':
|
||||
# i.e. "<!DOCTYPE>"
|
||||
data = ''
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
picklable = True
|
||||
NAME = HTMLPARSER
|
||||
features = [NAME, HTML, STRICT]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
"""
|
||||
:yield: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||
exclude_encodings=exclude_encodings)
|
||||
yield (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
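# Usage sketch: each yielded 4-tuple is one parsing strategy; the caller
# (BeautifulSoup's constructor machinery) tries them in order until one
# parses, roughly:
#
#   for markup, enc, decl_enc, replaced in builder.prepare_markup(data):
#       ...  # attempt to feed `markup` to the parser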
|
||||
|
||||
def feed(self, markup):
|
||||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
parser.already_closed_empty_element = []
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
# string.
|
||||
#
|
||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
||||
import re
|
||||
attrfind_tolerant = re.compile(
|
||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
||||
|
||||
locatestarttagend = re.compile(r"""
|
||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||
(?:\s+ # whitespace before attribute name
|
||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
||||
(?:\s*=\s* # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|\"[^\"]*\" # LIT-enclosed value
|
||||
|[^'\">\s]+ # bare value
|
||||
)
|
||||
)?
|
||||
)
|
||||
)*
|
||||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
||||
|
||||
from html.parser import tagfind, attrfind
|
||||
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
return endpos
|
||||
rawdata = self.rawdata
|
||||
self.__starttag_text = rawdata[i:endpos]
|
||||
|
||||
# Now parse the data between i+1 and j into a tag and attrs
|
||||
attrs = []
|
||||
match = tagfind.match(rawdata, i+1)
|
||||
assert match, 'unexpected call to parse_starttag()'
|
||||
k = match.end()
|
||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||
while k < endpos:
|
||||
if self.strict:
|
||||
m = attrfind.match(rawdata, k)
|
||||
else:
|
||||
m = attrfind_tolerant.match(rawdata, k)
|
||||
if not m:
|
||||
break
|
||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = None
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = self.unescape(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
end = rawdata[k:endpos].strip()
|
||||
if end not in (">", "/>"):
|
||||
lineno, offset = self.getpos()
|
||||
if "\n" in self.__starttag_text:
|
||||
lineno = lineno + self.__starttag_text.count("\n")
|
||||
offset = len(self.__starttag_text) \
|
||||
- self.__starttag_text.rfind("\n")
|
||||
else:
|
||||
offset = offset + len(self.__starttag_text)
|
||||
if self.strict:
|
||||
self.error("junk characters in start tag: %r"
|
||||
% (rawdata[k:endpos][:20],))
|
||||
self.handle_data(rawdata[i:endpos])
|
||||
return endpos
|
||||
if end.endswith('/>'):
|
||||
# XHTML-style empty tag: <span attr="value" />
|
||||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
return endpos
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
||||
|
||||
CONSTRUCTOR_TAKES_STRICT = True
|
258
libs/bs42.7/builder/_lxml.py
Normal file
|
@@ -0,0 +1,258 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NamespacedAttribute,
|
||||
ProcessingInstruction,
|
||||
XMLProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
ParserRejectedMarkup,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import EncodingDetector
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
is_xml = True
|
||||
processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
NAME = "lxml-xml"
|
||||
ALTERNATE_NAMES = ["xml"]
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
|
||||
def default_parser(self, encoding):
|
||||
# This can either return a parser object or a class, which
|
||||
# will be instantiated with default arguments.
|
||||
if self._default_parser is not None:
|
||||
return self._default_parser
|
||||
return etree.XMLParser(
|
||||
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
||||
|
||||
def parser_for(self, encoding):
|
||||
# Use the default parser.
|
||||
parser = self.default_parser(encoding)
|
||||
|
||||
if isinstance(parser, collections.Callable):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||
return parser
|
||||
|
||||
def __init__(self, parser=None, empty_element_tags=None):
|
||||
# TODO: Issue a warning if parser is present but not a
|
||||
# callable, since that means there's no way to create new
|
||||
# parsers for different encodings.
|
||||
self._default_parser = parser
|
||||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == '{':
|
||||
return tuple(tag[1:].split('}', 1))
|
||||
else:
|
||||
return (None, tag)
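# e.g. _getNsTag('{http://www.w3.org/1999/xhtml}body')
#          -> ('http://www.w3.org/1999/xhtml', 'body')
#      _getNsTag('body') -> (None, 'body')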
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
exclude_encodings=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:yield: A series of 4-tuples.
|
||||
(markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
"""
|
||||
# Instead of using UnicodeDammit to convert the bytestring to
|
||||
# Unicode using different encodings, use EncodingDetector to
|
||||
# iterate over the encodings, and tell lxml to try to parse
|
||||
# the document as each one in turn.
|
||||
is_html = not self.is_xml
|
||||
if is_html:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
else:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8",
|
||||
document_declared_encoding, False)
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
detector = EncodingDetector(
|
||||
markup, try_encodings, is_html, exclude_encodings)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
markup = StringIO(markup)
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
try:
|
||||
self.parser = self.parser_for(self.soup.original_encoding)
|
||||
self.parser.feed(data)
|
||||
while len(data) != 0:
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in nsmap.items():
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn them into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
return None
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
completed_tag = self.soup.tagStack[-1]
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def doctype(self, name, pubid, system):
|
||||
self.soup.endData()
|
||||
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def comment(self, content):
|
||||
"Handle comments as Comment objects."
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(content)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
|
||||
NAME = LXML
|
||||
ALTERNATE_NAMES = ["lxml-html"]
|
||||
|
||||
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||
is_xml = False
|
||||
processing_instruction_class = ProcessingInstruction
|
||||
|
||||
def default_parser(self, encoding):
|
||||
return etree.HTMLParser
|
||||
|
||||
def feed(self, markup):
|
||||
encoding = self.soup.original_encoding
|
||||
try:
|
||||
self.parser = self.parser_for(encoding)
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><body>%s</body></html>' % fragment
|
842
libs/bs42.7/dammit.py
Normal file
|
@@ -0,0 +1,842 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This library converts a bytestream to Unicode through any means
|
||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
|
||||
# Import a library to autodetect character encodings.
|
||||
chardet_type = None
|
||||
try:
|
||||
# First try the fast C implementation.
|
||||
# PyPI package: cchardet
|
||||
import cchardet
|
||||
def chardet_dammit(s):
|
||||
return cchardet.detect(s)['encoding']
|
||||
except ImportError:
|
||||
try:
|
||||
# Fall back to the pure Python implementation
|
||||
# Debian package: python-chardet
|
||||
# PyPI package: chardet
|
||||
import chardet
|
||||
def chardet_dammit(s):
|
||||
return chardet.detect(s)['encoding']
|
||||
#import chardet.constants
|
||||
#chardet.constants._debug = 1
|
||||
except ImportError:
|
||||
# No chardet available.
|
||||
def chardet_dammit(s):
|
||||
return None
|
||||
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
xml_encoding_re = re.compile(
|
||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
||||
html_meta_re = re.compile(
|
||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
|
||||
class EntitySubstitution(object):
|
||||
|
||||
"""Substitute XML or HTML entities for the corresponding characters."""
|
||||
|
||||
def _populate_class_variables():
|
||||
lookup = {}
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
for codepoint, name in list(codepoint2name.items()):
|
||||
character = unichr(codepoint)
|
||||
if codepoint != 34:
|
||||
# There's no point in turning the quotation mark into
|
||||
# ", unless it happens within an attribute value, which
|
||||
# is handled elsewhere.
|
||||
characters_for_re.append(character)
|
||||
lookup[character] = name
|
||||
# But we do want to turn " into the quotation mark.
|
||||
reverse_lookup[name] = character
|
||||
re_definition = "[%s]" % "".join(characters_for_re)
|
||||
return lookup, reverse_lookup, re.compile(re_definition)
|
||||
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
|
||||
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
|
||||
|
||||
CHARACTER_TO_XML_ENTITY = {
|
||||
"'": "apos",
|
||||
'"': "quot",
|
||||
"&": "amp",
|
||||
"<": "lt",
|
||||
">": "gt",
|
||||
}
|
||||
|
||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
||||
")")
|
||||
|
||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||
|
||||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def _substitute_xml_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate XML entity for an XML special character."""
|
||||
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def quoted_attribute_value(self, value):
|
||||
"""Make a value into a quoted XML attribute, possibly escaping it.
|
||||
|
||||
Most strings will be quoted using double quotes.
|
||||
|
||||
Bob's Bar -> "Bob's Bar"
|
||||
|
||||
If a string contains double quotes, it will be quoted using
|
||||
single quotes.
|
||||
|
||||
Welcome to "my bar" -> 'Welcome to "my bar"'
|
||||
|
||||
If a string contains both single and double quotes, the
|
||||
double quotes will be escaped, and the string will be quoted
|
||||
using double quotes.
|
||||
|
||||
Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
|
||||
"""
|
||||
quote_with = '"'
|
||||
if '"' in value:
|
||||
if "'" in value:
|
||||
# The string contains both single and double
|
||||
# quotes. Turn the double quotes into
|
||||
# entities. We quote the double quotes rather than
|
||||
# the single quotes because the entity name is
|
||||
# """ whether this is HTML or XML. If we
|
||||
# quoted the single quotes, we'd have to decide
|
||||
# between ' and &squot;.
|
||||
replace_with = """
|
||||
value = value.replace('"', replace_with)
|
||||
else:
|
||||
# There are double quotes but no single quotes.
|
||||
# We can use single quotes to quote the attribute.
|
||||
quote_with = "'"
|
||||
return quote_with + value + quote_with
|
||||
|
||||
@classmethod
|
||||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign
|
||||
will become <, the greater-than sign will become >,
|
||||
and any ampersands will become &. If you want ampersands
|
||||
that appear to be part of an entity definition to be left
|
||||
alone, use substitute_xml_containing_entities() instead.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets and ampersands.
|
||||
value = cls.AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
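# For example:
#
#   >>> EntitySubstitution.substitute_xml('AT&T <rocks>', make_quoted_attribute=True)
#   '"AT&amp;T &lt;rocks&gt;"'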
|
||||
|
||||
@classmethod
|
||||
def substitute_xml_containing_entities(
|
||||
cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign will
|
||||
become <, the greater-than sign will become >, and any
|
||||
ampersands that are not part of an entity definition will
|
||||
become &.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets, and ampersands that aren't part of
|
||||
# entities.
|
||||
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_html(cls, s):
|
||||
"""Replace certain Unicode characters with named HTML entities.
|
||||
|
||||
This differs from data.encode(encoding, 'xmlcharrefreplace')
|
||||
in that the goal is to make the result more readable (to those
|
||||
with ASCII displays) rather than to recover from
|
||||
errors. There's absolutely nothing wrong with a UTF-8 string
|
||||
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
|
||||
character with "é" will make it more readable to some
|
||||
people.
|
||||
"""
|
||||
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
|
||||
cls._substitute_html_entity, s)
|
||||
|
||||
|
||||
class EncodingDetector:
|
||||
"""Suggests a number of possible encodings for a bytestring.
|
||||
|
||||
Order of precedence:
|
||||
|
||||
1. Encodings you specifically tell EncodingDetector to try first
|
||||
(the override_encodings argument to the constructor).
|
||||
|
||||
2. An encoding declared within the bytestring itself, either in an
|
||||
XML declaration (if the bytestring is to be interpreted as an XML
|
||||
document), or in a <meta> tag (if the bytestring is to be
|
||||
interpreted as an HTML document.)
|
||||
|
||||
3. An encoding detected through textual analysis by chardet,
|
||||
cchardet, or a similar external library.
|
||||
|
||||
4. UTF-8.
|
||||
|
||||
5. Windows-1252.
|
||||
"""
|
||||
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||
exclude_encodings=None):
|
||||
self.override_encodings = override_encodings or []
|
||||
exclude_encodings = exclude_encodings or []
|
||||
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||
self.chardet_encoding = None
|
||||
self.is_html = is_html
|
||||
self.declared_encoding = None
|
||||
|
||||
# First order of business: strip a byte-order mark.
|
||||
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
|
||||
|
||||
def _usable(self, encoding, tried):
|
||||
if encoding is not None:
|
||||
encoding = encoding.lower()
|
||||
if encoding in self.exclude_encodings:
|
||||
return False
|
||||
if encoding not in tried:
|
||||
tried.add(encoding)
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def encodings(self):
|
||||
"""Yield a number of encodings that might work for this markup."""
|
||||
tried = set()
|
||||
for e in self.override_encodings:
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
# Did the document originally start with a byte-order mark
|
||||
# that indicated its encoding?
|
||||
if self._usable(self.sniffed_encoding, tried):
|
||||
yield self.sniffed_encoding
|
||||
|
||||
# Look within the document for an XML or HTML encoding
|
||||
# declaration.
|
||||
if self.declared_encoding is None:
|
||||
self.declared_encoding = self.find_declared_encoding(
|
||||
self.markup, self.is_html)
|
||||
if self._usable(self.declared_encoding, tried):
|
||||
yield self.declared_encoding
|
||||
|
||||
# Use third-party character set detection to guess at the
|
||||
# encoding.
|
||||
if self.chardet_encoding is None:
|
||||
self.chardet_encoding = chardet_dammit(self.markup)
|
||||
if self._usable(self.chardet_encoding, tried):
|
||||
yield self.chardet_encoding
|
||||
|
||||
# As a last-ditch effort, try utf-8 and windows-1252.
|
||||
for e in ('utf-8', 'windows-1252'):
|
||||
if self._usable(e, tried):
|
||||
yield e
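# Minimal usage sketch (hypothetical bytestring):
#
#   detector = EncodingDetector(b'<html>...</html>', ['utf-8'], is_html=True)
#   for encoding in detector.encodings:
#       ...  # try decoding the markup with each candidate in turn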
|
||||
|
||||
@classmethod
|
||||
def strip_byte_order_mark(cls, data):
|
||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||
encoding = None
|
||||
if isinstance(data, unicode):
|
||||
# Unicode data cannot have a byte-order mark.
|
||||
return data, encoding
|
||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||
and (data[2:4] != b'\x00\x00'):
|
||||
encoding = 'utf-16be'
|
||||
data = data[2:]
|
||||
elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
|
||||
and (data[2:4] != b'\x00\x00'):
|
||||
encoding = 'utf-16le'
|
||||
data = data[2:]
|
||||
elif data[:3] == b'\xef\xbb\xbf':
|
||||
encoding = 'utf-8'
|
||||
data = data[3:]
|
||||
elif data[:4] == b'\x00\x00\xfe\xff':
|
||||
encoding = 'utf-32be'
|
||||
data = data[4:]
|
||||
elif data[:4] == b'\xff\xfe\x00\x00':
|
||||
encoding = 'utf-32le'
|
||||
data = data[4:]
|
||||
return data, encoding
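# For example:
#
#   >>> EncodingDetector.strip_byte_order_mark(b'\xef\xbb\xbfhello')
#   ('hello', 'utf-8')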
|
||||
|
||||
@classmethod
|
||||
def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
|
||||
"""Given a document, tries to find its declared encoding.
|
||||
|
||||
An XML encoding is declared at the beginning of the document.
|
||||
|
||||
An HTML encoding is declared in a <meta> tag, hopefully near the
|
||||
beginning of the document.
|
||||
"""
|
||||
if search_entire_document:
|
||||
xml_endpos = html_endpos = len(markup)
|
||||
else:
|
||||
xml_endpos = 1024
|
||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||
|
||||
declared_encoding = None
|
||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
||||
if not declared_encoding_match and is_html:
|
||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
||||
if declared_encoding_match is not None:
|
||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
||||
'ascii', 'replace')
|
||||
if declared_encoding:
|
||||
return declared_encoding.lower()
|
||||
return None
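# For instance:
#
#   >>> EncodingDetector.find_declared_encoding(
#   ...     b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>')
#   'iso-8859-1'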
|
||||
|
||||
class UnicodeDammit:
|
||||
"""A class for detecting the encoding of a *ML document and
|
||||
converting it to a Unicode string. If the source encoding is
|
||||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||||
equivalents."""
|
||||
|
||||
# This dictionary maps commonly seen values for "charset" in HTML
|
||||
# meta tags to the corresponding Python codec names. It only covers
|
||||
# values that aren't in Python's aliases and can't be determined
|
||||
# by the heuristics in find_codec.
|
||||
CHARSET_ALIASES = {"macintosh": "mac-roman",
|
||||
"x-sjis": "shift-jis"}
|
||||
|
||||
ENCODINGS_WITH_SMART_QUOTES = [
|
||||
"windows-1252",
|
||||
"iso-8859-1",
|
||||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
self.contains_replacement_characters = False
|
||||
self.is_html = is_html
|
||||
self.log = logging.getLogger(__name__)
|
||||
self.detector = EncodingDetector(
|
||||
markup, override_encodings, is_html, exclude_encodings)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, unicode) or markup == '':
|
||||
self.markup = markup
|
||||
self.unicode_markup = unicode(markup)
|
||||
self.original_encoding = None
|
||||
return
|
||||
|
||||
# The encoding detector may have stripped a byte-order mark.
|
||||
# Use the stripped markup from this point on.
|
||||
self.markup = self.detector.markup
|
||||
|
||||
u = None
|
||||
for encoding in self.detector.encodings:
|
||||
markup = self.detector.markup
|
||||
u = self._convert_from(encoding)
|
||||
if u is not None:
|
||||
break
|
||||
|
||||
if not u:
|
||||
# None of the encodings worked. As an absolute last resort,
|
||||
# try them again with character replacement.
|
||||
|
||||
for encoding in self.detector.encodings:
|
||||
if encoding != "ascii":
|
||||
u = self._convert_from(encoding, "replace")
|
||||
if u is not None:
|
||||
self.log.warning(
|
||||
"Some characters could not be decoded, and were "
|
||||
"replaced with REPLACEMENT CHARACTER."
|
||||
)
|
||||
self.contains_replacement_characters = True
|
||||
break
|
||||
|
||||
# If none of that worked, we could at this point force it to
|
||||
# ASCII, but that would destroy so much data that I think
|
||||
# giving up is better.
|
||||
self.unicode_markup = u
|
||||
if not u:
|
||||
self.original_encoding = None
|
||||
|
||||
def _sub_ms_char(self, match):
|
||||
"""Changes a MS smart quote character to an XML or HTML
|
||||
entity, or an ASCII character."""
|
||||
orig = match.group(1)
|
||||
if self.smart_quotes_to == 'ascii':
|
||||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||||
else:
|
||||
sub = self.MS_CHARS.get(orig)
|
||||
if type(sub) == tuple:
|
||||
if self.smart_quotes_to == 'xml':
|
||||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||||
else:
|
||||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||||
else:
|
||||
sub = sub.encode()
|
||||
return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
|
||||
proposed = self.find_codec(proposed)
|
||||
if not proposed or (proposed, errors) in self.tried_encodings:
|
||||
return None
|
||||
self.tried_encodings.append((proposed, errors))
|
||||
markup = self.markup
|
||||
# Convert smart quotes to HTML if coming from an encoding
|
||||
# that might have them.
|
||||
if (self.smart_quotes_to is not None
|
||||
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
|
||||
smart_quotes_re = b"([\x80-\x9f])"
|
||||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||||
|
||||
try:
|
||||
#print "Trying to convert document to %s (errors=%s)" % (
|
||||
# proposed, errors)
|
||||
u = self._to_unicode(markup, proposed, errors)
|
||||
self.markup = u
|
||||
self.original_encoding = proposed
|
||||
except Exception as e:
|
||||
#print "That didn't work!"
|
||||
#print e
|
||||
return None
|
||||
#print "Correct encoding: %s" % proposed
|
||||
return self.markup
|
||||
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
|
||||
'''Given a string and its encoding, decodes the string into Unicode.
|
||||
%encoding is a string recognized by encodings.aliases'''
|
||||
return unicode(data, encoding, errors)
|
||||
|
||||
@property
|
||||
def declared_html_encoding(self):
|
||||
if not self.is_html:
|
||||
return None
|
||||
return self.detector.declared_encoding
|
||||
|
||||
def find_codec(self, charset):
|
||||
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
|
||||
or (charset and self._codec(charset.replace("-", "")))
|
||||
or (charset and self._codec(charset.replace("-", "_")))
|
||||
or (charset and charset.lower())
|
||||
or charset
|
||||
)
|
||||
if value:
|
||||
return value.lower()
|
||||
return None
|
||||
|
||||
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
|
||||
MS_CHARS = {b'\x80': ('euro', '20AC'),
|
||||
b'\x81': ' ',
|
||||
b'\x82': ('sbquo', '201A'),
|
||||
b'\x83': ('fnof', '192'),
|
||||
b'\x84': ('bdquo', '201E'),
|
||||
b'\x85': ('hellip', '2026'),
|
||||
b'\x86': ('dagger', '2020'),
|
||||
b'\x87': ('Dagger', '2021'),
|
||||
b'\x88': ('circ', '2C6'),
|
||||
b'\x89': ('permil', '2030'),
|
||||
b'\x8A': ('Scaron', '160'),
|
||||
b'\x8B': ('lsaquo', '2039'),
|
||||
b'\x8C': ('OElig', '152'),
|
||||
b'\x8D': '?',
|
||||
b'\x8E': ('#x17D', '17D'),
|
||||
b'\x8F': '?',
|
||||
b'\x90': '?',
|
||||
b'\x91': ('lsquo', '2018'),
|
||||
b'\x92': ('rsquo', '2019'),
|
||||
b'\x93': ('ldquo', '201C'),
|
||||
b'\x94': ('rdquo', '201D'),
|
||||
b'\x95': ('bull', '2022'),
|
||||
b'\x96': ('ndash', '2013'),
|
||||
b'\x97': ('mdash', '2014'),
|
||||
b'\x98': ('tilde', '2DC'),
|
||||
b'\x99': ('trade', '2122'),
|
||||
b'\x9a': ('scaron', '161'),
|
||||
b'\x9b': ('rsaquo', '203A'),
|
||||
b'\x9c': ('oelig', '153'),
|
||||
b'\x9d': '?',
|
||||
b'\x9e': ('#x17E', '17E'),
|
||||
b'\x9f': ('Yuml', ''),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
|
||||
# horrors like stripping diacritical marks to turn á into a, but also
|
||||
# contains non-horrors like turning “ into ".
|
||||
MS_CHARS_TO_ASCII = {
|
||||
b'\x80' : 'EUR',
|
||||
b'\x81' : ' ',
|
||||
b'\x82' : ',',
|
||||
b'\x83' : 'f',
|
||||
b'\x84' : ',,',
|
||||
b'\x85' : '...',
|
||||
b'\x86' : '+',
|
||||
b'\x87' : '++',
|
||||
b'\x88' : '^',
|
||||
b'\x89' : '%',
|
||||
b'\x8a' : 'S',
|
||||
b'\x8b' : '<',
|
||||
b'\x8c' : 'OE',
|
||||
b'\x8d' : '?',
|
||||
b'\x8e' : 'Z',
|
||||
b'\x8f' : '?',
|
||||
b'\x90' : '?',
|
||||
b'\x91' : "'",
|
||||
b'\x92' : "'",
|
||||
b'\x93' : '"',
|
||||
b'\x94' : '"',
|
||||
b'\x95' : '*',
|
||||
b'\x96' : '-',
|
||||
b'\x97' : '--',
|
||||
b'\x98' : '~',
|
||||
b'\x99' : '(TM)',
|
||||
b'\x9a' : 's',
|
||||
b'\x9b' : '>',
|
||||
b'\x9c' : 'oe',
|
||||
b'\x9d' : '?',
|
||||
b'\x9e' : 'z',
|
||||
b'\x9f' : 'Y',
|
||||
b'\xa0' : ' ',
|
||||
b'\xa1' : '!',
|
||||
b'\xa2' : 'c',
|
||||
b'\xa3' : 'GBP',
|
||||
b'\xa4' : '$', #This approximation is especially parochial--this is the
|
||||
#generic currency symbol.
|
||||
b'\xa5' : 'YEN',
|
||||
b'\xa6' : '|',
|
||||
b'\xa7' : 'S',
|
||||
b'\xa8' : '..',
|
||||
b'\xa9' : '',
|
||||
b'\xaa' : '(th)',
|
||||
b'\xab' : '<<',
|
||||
b'\xac' : '!',
|
||||
b'\xad' : ' ',
|
||||
b'\xae' : '(R)',
|
||||
b'\xaf' : '-',
|
||||
b'\xb0' : 'o',
|
||||
b'\xb1' : '+-',
|
||||
b'\xb2' : '2',
|
||||
b'\xb3' : '3',
|
||||
b'\xb4' : ("'", 'acute'),
|
||||
b'\xb5' : 'u',
|
||||
b'\xb6' : 'P',
|
||||
b'\xb7' : '*',
|
||||
b'\xb8' : ',',
|
||||
b'\xb9' : '1',
|
||||
b'\xba' : '(th)',
|
||||
b'\xbb' : '>>',
|
||||
b'\xbc' : '1/4',
|
||||
b'\xbd' : '1/2',
|
||||
b'\xbe' : '3/4',
|
||||
b'\xbf' : '?',
|
||||
b'\xc0' : 'A',
|
||||
b'\xc1' : 'A',
|
||||
b'\xc2' : 'A',
|
||||
b'\xc3' : 'A',
|
||||
b'\xc4' : 'A',
|
||||
b'\xc5' : 'A',
|
||||
b'\xc6' : 'AE',
|
||||
b'\xc7' : 'C',
|
||||
b'\xc8' : 'E',
|
||||
b'\xc9' : 'E',
|
||||
b'\xca' : 'E',
|
||||
b'\xcb' : 'E',
|
||||
b'\xcc' : 'I',
|
||||
b'\xcd' : 'I',
|
||||
b'\xce' : 'I',
|
||||
b'\xcf' : 'I',
|
||||
b'\xd0' : 'D',
|
||||
b'\xd1' : 'N',
|
||||
b'\xd2' : 'O',
|
||||
b'\xd3' : 'O',
|
||||
b'\xd4' : 'O',
|
||||
b'\xd5' : 'O',
|
||||
b'\xd6' : 'O',
|
||||
b'\xd7' : '*',
|
||||
b'\xd8' : 'O',
|
||||
b'\xd9' : 'U',
|
||||
b'\xda' : 'U',
|
||||
b'\xdb' : 'U',
|
||||
b'\xdc' : 'U',
|
||||
b'\xdd' : 'Y',
|
||||
b'\xde' : 'b',
|
||||
b'\xdf' : 'B',
|
||||
b'\xe0' : 'a',
|
||||
b'\xe1' : 'a',
|
||||
b'\xe2' : 'a',
|
||||
b'\xe3' : 'a',
|
||||
b'\xe4' : 'a',
|
||||
b'\xe5' : 'a',
|
||||
b'\xe6' : 'ae',
|
||||
b'\xe7' : 'c',
|
||||
b'\xe8' : 'e',
|
||||
b'\xe9' : 'e',
|
||||
b'\xea' : 'e',
|
||||
b'\xeb' : 'e',
|
||||
b'\xec' : 'i',
|
||||
b'\xed' : 'i',
|
||||
b'\xee' : 'i',
|
||||
b'\xef' : 'i',
|
||||
b'\xf0' : 'o',
|
||||
b'\xf1' : 'n',
|
||||
b'\xf2' : 'o',
|
||||
b'\xf3' : 'o',
|
||||
b'\xf4' : 'o',
|
||||
b'\xf5' : 'o',
|
||||
b'\xf6' : 'o',
|
||||
b'\xf7' : '/',
|
||||
b'\xf8' : 'o',
|
||||
b'\xf9' : 'u',
|
||||
b'\xfa' : 'u',
|
||||
b'\xfb' : 'u',
|
||||
b'\xfc' : 'u',
|
||||
b'\xfd' : 'y',
|
||||
b'\xfe' : 'b',
|
||||
b'\xff' : 'y',
|
||||
}
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
|
||||
# characters in otherwise UTF-8 documents.
|
||||
#
|
||||
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
|
||||
# Windows-1252.
|
||||
WINDOWS_1252_TO_UTF8 = {
|
||||
0x80 : b'\xe2\x82\xac', # €
|
||||
0x82 : b'\xe2\x80\x9a', # ‚
|
||||
0x83 : b'\xc6\x92', # ƒ
|
||||
0x84 : b'\xe2\x80\x9e', # „
|
||||
0x85 : b'\xe2\x80\xa6', # …
|
||||
0x86 : b'\xe2\x80\xa0', # †
|
||||
0x87 : b'\xe2\x80\xa1', # ‡
|
||||
0x88 : b'\xcb\x86', # ˆ
|
||||
0x89 : b'\xe2\x80\xb0', # ‰
|
||||
0x8a : b'\xc5\xa0', # Š
|
||||
0x8b : b'\xe2\x80\xb9', # ‹
|
||||
0x8c : b'\xc5\x92', # Œ
|
||||
0x8e : b'\xc5\xbd', # Ž
|
||||
0x91 : b'\xe2\x80\x98', # ‘
|
||||
0x92 : b'\xe2\x80\x99', # ’
|
||||
0x93 : b'\xe2\x80\x9c', # “
|
||||
0x94 : b'\xe2\x80\x9d', # ”
|
||||
0x95 : b'\xe2\x80\xa2', # •
|
||||
0x96 : b'\xe2\x80\x93', # –
|
||||
0x97 : b'\xe2\x80\x94', # —
|
||||
0x98 : b'\xcb\x9c', # ˜
|
||||
0x99 : b'\xe2\x84\xa2', # ™
|
||||
0x9a : b'\xc5\xa1', # š
|
||||
0x9b : b'\xe2\x80\xba', # ›
|
||||
0x9c : b'\xc5\x93', # œ
|
||||
0x9e : b'\xc5\xbe', # ž
|
||||
0x9f : b'\xc5\xb8', # Ÿ
|
||||
0xa0 : b'\xc2\xa0', #
|
||||
0xa1 : b'\xc2\xa1', # ¡
|
||||
0xa2 : b'\xc2\xa2', # ¢
|
||||
0xa3 : b'\xc2\xa3', # £
|
||||
0xa4 : b'\xc2\xa4', # ¤
|
||||
0xa5 : b'\xc2\xa5', # ¥
|
||||
0xa6 : b'\xc2\xa6', # ¦
|
||||
0xa7 : b'\xc2\xa7', # §
|
||||
0xa8 : b'\xc2\xa8', # ¨
|
||||
0xa9 : b'\xc2\xa9', # ©
|
||||
0xaa : b'\xc2\xaa', # ª
|
||||
0xab : b'\xc2\xab', # «
|
||||
0xac : b'\xc2\xac', # ¬
|
||||
0xad : b'\xc2\xad', #
|
||||
0xae : b'\xc2\xae', # ®
|
||||
0xaf : b'\xc2\xaf', # ¯
|
||||
0xb0 : b'\xc2\xb0', # °
|
||||
0xb1 : b'\xc2\xb1', # ±
|
||||
0xb2 : b'\xc2\xb2', # ²
|
||||
0xb3 : b'\xc2\xb3', # ³
|
||||
0xb4 : b'\xc2\xb4', # ´
|
||||
0xb5 : b'\xc2\xb5', # µ
|
||||
0xb6 : b'\xc2\xb6', # ¶
|
||||
0xb7 : b'\xc2\xb7', # ·
|
||||
0xb8 : b'\xc2\xb8', # ¸
|
||||
0xb9 : b'\xc2\xb9', # ¹
|
||||
0xba : b'\xc2\xba', # º
|
||||
0xbb : b'\xc2\xbb', # »
|
||||
0xbc : b'\xc2\xbc', # ¼
|
||||
0xbd : b'\xc2\xbd', # ½
|
||||
0xbe : b'\xc2\xbe', # ¾
|
||||
0xbf : b'\xc2\xbf', # ¿
|
||||
0xc0 : b'\xc3\x80', # À
|
||||
0xc1 : b'\xc3\x81', # Á
|
||||
0xc2 : b'\xc3\x82', # Â
|
||||
0xc3 : b'\xc3\x83', # Ã
|
||||
0xc4 : b'\xc3\x84', # Ä
|
||||
0xc5 : b'\xc3\x85', # Å
|
||||
0xc6 : b'\xc3\x86', # Æ
|
||||
0xc7 : b'\xc3\x87', # Ç
|
||||
0xc8 : b'\xc3\x88', # È
|
||||
0xc9 : b'\xc3\x89', # É
|
||||
0xca : b'\xc3\x8a', # Ê
|
||||
0xcb : b'\xc3\x8b', # Ë
|
||||
0xcc : b'\xc3\x8c', # Ì
|
||||
0xcd : b'\xc3\x8d', # Í
|
||||
0xce : b'\xc3\x8e', # Î
|
||||
0xcf : b'\xc3\x8f', # Ï
|
||||
0xd0 : b'\xc3\x90', # Ð
|
||||
0xd1 : b'\xc3\x91', # Ñ
|
||||
0xd2 : b'\xc3\x92', # Ò
|
||||
0xd3 : b'\xc3\x93', # Ó
|
||||
0xd4 : b'\xc3\x94', # Ô
|
||||
0xd5 : b'\xc3\x95', # Õ
|
||||
0xd6 : b'\xc3\x96', # Ö
|
||||
0xd7 : b'\xc3\x97', # ×
|
||||
0xd8 : b'\xc3\x98', # Ø
|
||||
0xd9 : b'\xc3\x99', # Ù
|
||||
0xda : b'\xc3\x9a', # Ú
|
||||
0xdb : b'\xc3\x9b', # Û
|
||||
0xdc : b'\xc3\x9c', # Ü
|
||||
0xdd : b'\xc3\x9d', # Ý
|
||||
0xde : b'\xc3\x9e', # Þ
|
||||
0xdf : b'\xc3\x9f', # ß
|
||||
0xe0 : b'\xc3\xa0', # à
|
||||
0xe1 : b'\xc3\xa1', # á
|
||||
0xe2 : b'\xc3\xa2', # â
|
||||
0xe3 : b'\xc3\xa3', # ã
|
||||
0xe4 : b'\xc3\xa4', # ä
|
||||
0xe5 : b'\xc3\xa5', # å
|
||||
0xe6 : b'\xc3\xa6', # æ
|
||||
0xe7 : b'\xc3\xa7', # ç
|
||||
0xe8 : b'\xc3\xa8', # è
|
||||
0xe9 : b'\xc3\xa9', # é
|
||||
0xea : b'\xc3\xaa', # ê
|
||||
0xeb : b'\xc3\xab', # ë
|
||||
0xec : b'\xc3\xac', # ì
|
||||
0xed : b'\xc3\xad', # í
|
||||
0xee : b'\xc3\xae', # î
|
||||
0xef : b'\xc3\xaf', # ï
|
||||
0xf0 : b'\xc3\xb0', # ð
|
||||
0xf1 : b'\xc3\xb1', # ñ
|
||||
0xf2 : b'\xc3\xb2', # ò
|
||||
0xf3 : b'\xc3\xb3', # ó
|
||||
0xf4 : b'\xc3\xb4', # ô
|
||||
0xf5 : b'\xc3\xb5', # õ
|
||||
0xf6 : b'\xc3\xb6', # ö
|
||||
0xf7 : b'\xc3\xb7', # ÷
|
||||
0xf8 : b'\xc3\xb8', # ø
|
||||
0xf9 : b'\xc3\xb9', # ù
|
||||
0xfa : b'\xc3\xba', # ú
|
||||
0xfb : b'\xc3\xbb', # û
|
||||
0xfc : b'\xc3\xbc', # ü
|
||||
0xfd : b'\xc3\xbd', # ý
|
||||
0xfe : b'\xc3\xbe', # þ
|
||||
}
|
||||
|
||||
MULTIBYTE_MARKERS_AND_SIZES = [
|
||||
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
|
||||
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
|
||||
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
|
||||
]
|
||||
|
||||
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
|
||||
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
|
||||
def detwingle(cls, in_bytes, main_encoding="utf8",
|
||||
embedded_encoding="windows-1252"):
|
||||
"""Fix characters from one encoding embedded in some other encoding.
|
||||
|
||||
Currently the only situation supported is Windows-1252 (or its
|
||||
subset ISO-8859-1), embedded in UTF-8.
|
||||
|
||||
The input must be a bytestring. If you've already converted
|
||||
the document to Unicode, you're too late.
|
||||
|
||||
The output is a bytestring in which `embedded_encoding`
|
||||
characters have been converted to their `main_encoding`
|
||||
equivalents.
|
||||
"""
|
||||
if embedded_encoding.replace('_', '-').lower() not in (
|
||||
'windows-1252', 'windows_1252'):
|
||||
raise NotImplementedError(
|
||||
"Windows-1252 and ISO-8859-1 are the only currently supported "
|
||||
"embedded encodings.")
|
||||
|
||||
if main_encoding.lower() not in ('utf8', 'utf-8'):
|
||||
raise NotImplementedError(
|
||||
"UTF-8 is the only currently supported main encoding.")
|
||||
|
||||
byte_chunks = []
|
||||
|
||||
chunk_start = 0
|
||||
pos = 0
|
||||
while pos < len(in_bytes):
|
||||
byte = in_bytes[pos]
|
||||
if not isinstance(byte, int):
|
||||
# Python 2.x
|
||||
byte = ord(byte)
|
||||
if (byte >= cls.FIRST_MULTIBYTE_MARKER
|
||||
and byte <= cls.LAST_MULTIBYTE_MARKER):
|
||||
# This is the start of a UTF-8 multibyte character. Skip
|
||||
# to the end.
|
||||
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
|
||||
if byte >= start and byte <= end:
|
||||
pos += size
|
||||
break
|
||||
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
|
||||
# We found a Windows-1252 character!
|
||||
# Save the string up to this point as a chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:pos])
|
||||
|
||||
# Now translate the Windows-1252 character into UTF-8
|
||||
# and add it as another, one-byte chunk.
|
||||
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
|
||||
pos += 1
|
||||
chunk_start = pos
|
||||
else:
|
||||
# Go on to the next character.
|
||||
pos += 1
|
||||
if chunk_start == 0:
|
||||
# The string is unchanged.
|
||||
return in_bytes
|
||||
else:
|
||||
# Store the final chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:])
|
||||
return b''.join(byte_chunks)
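# Worked example: Windows-1252 smart quotes (0x93/0x94) embedded in
# otherwise UTF-8 bytes are rewritten as their UTF-8 equivalents:
#
#   >>> UnicodeDammit.detwingle(b'I just \x93love\x94 Microsoft Word!')
#   'I just \xe2\x80\x9clove\xe2\x80\x9d Microsoft Word!'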
|
||||
|
219
libs/bs42.7/diagnose.py
Normal file
|
@@ -0,0 +1,219 @@
|
|||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
from StringIO import StringIO
|
||||
from HTMLParser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
|
||||
import os
|
||||
import pstats
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
||||
print "Python version %s" % sys.version
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
try:
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
except ImportError, e:
|
||||
print (
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
try:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
except ImportError, e:
|
||||
print (
|
||||
"html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif os.path.exists(data):
|
||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||
with open(data) as fp:
|
||||
data = fp.read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||
return
|
||||
print
|
||||
|
||||
for parser in basic_parsers:
|
||||
print "Trying to parse your markup with %s" % parser
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, parser)
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "Here's what %s did with the markup:" % parser
|
||||
print soup.prettify()
|
||||
|
||||
print "-" * 80
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
|
||||
"""Print out the lxml events that occur during parsing.
|
||||
|
||||
This lets you see how lxml parses a document when no Beautiful
|
||||
Soup code is running.
|
||||
"""
|
||||
from lxml import etree
|
||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Announces HTMLParser parse events, without doing anything else."""
|
||||
|
||||
def _p(self, s):
|
||||
print(s)
|
||||
|
||||
def handle_starttag(self, name, attrs):
|
||||
self._p("%s START" % name)
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self._p("%s END" % name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self._p("%s DATA" % data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
self._p("%s CHARREF" % name)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
self._p("%s ENTITYREF" % name)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self._p("%s COMMENT" % data)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self._p("%s DECL" % data)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
self._p("%s UNKNOWN-DECL" % data)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self._p("%s PI" % data)
|
||||
|
||||
def htmlparser_trace(data):
|
||||
"""Print out the HTMLParser events that occur during parsing.
|
||||
|
||||
This lets you see how HTMLParser parses a document when no
|
||||
Beautiful Soup code is running.
|
||||
"""
|
||||
parser = AnnouncingParser()
|
||||
parser.feed(data)
|
||||
|
||||
_vowels = "aeiou"
|
||||
_consonants = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
def rword(length=5):
|
||||
"Generate a random word-like string."
|
||||
s = ''
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
def rsentence(length=4):
|
||||
"Generate a random sentence-like string."
|
||||
return " ".join(rword(random.randint(4,9)) for i in range(length))
|
||||
|
||||
def rdoc(num_elements=1000):
|
||||
"""Randomly generate an invalid HTML document."""
|
||||
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
|
||||
elements = []
|
||||
for i in range(num_elements):
|
||||
choice = random.randint(0,3)
|
||||
if choice == 0:
|
||||
# New tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("<%s>" % tag_name)
|
||||
elif choice == 1:
|
||||
elements.append(rsentence(random.randint(1,4)))
|
||||
elif choice == 2:
|
||||
# Close a tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("</%s>" % tag_name)
|
||||
return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
a = time.time()
|
||||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
|
||||
data = rdoc(num_elements)
|
||||
vars = dict(bs4=bs4, data=data, parser=parser)
|
||||
cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
|
||||
|
||||
stats = pstats.Stats(filename)
|
||||
# stats.strip_dirs()
|
||||
stats.sort_stats("cumulative")
|
||||
stats.print_stats('_html5lib|bs4', 50)
|
||||
|
||||
if __name__ == '__main__':
|
||||
diagnose(sys.stdin.read())
|
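For orientation, this module is meant to be driven interactively, e.g. (a sketch; it assumes this vendored tree is importable as bs4.diagnose, as in mainline bs4):

# Invocation sketch (assumption: the module is importable as bs4.diagnose).
from bs4.diagnose import diagnose, benchmark_parsers
diagnose("<html><body><p>Unclosed paragraph<p>Another one</body>")  # per-parser report
benchmark_parsers(10000)  # rough speed comparison on generated markup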
1808
libs/bs42.7/element.py
Normal file
File diff suppressed because it is too large
99
libs/bs42.7/formatter.py
Normal file
@@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution

class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    HTML = 'html'
    XML = 'xml'

    HTML_DEFAULTS = dict(
        cdata_containing_tags=set(["script", "style"]),
    )

    def _default(self, language, value, kwarg):
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """
        :param void_element_close_prefix: By default, represent void
            elements as <tag/> rather than <tag>
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution."""
        if not self.entity_substitution:
            return ns
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute."""
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want."""
        return sorted(tag.attrs.items())


class HTMLFormatter(Formatter):
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)


class XMLFormatter(Formatter):
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)


# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = Formatter(
    Formatter.XML, entity_substitution=None
)
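A short sketch of how these registry keys are consumed in practice (an assumption based on mainline bs4 behavior, where decode() and prettify() accept a formatter argument):

# Sketch: registry keys are looked up via the formatter= argument
# (assumption: mainline bs4 behavior applies to this vendored copy).
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>a &amp; b</p>", "html.parser")
print soup.decode(formatter="minimal")  # only <, >, & are escaped
print soup.decode(formatter=None)       # no entity substitution at all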
770
libs/bs42.7/testing.py
Normal file
@@ -0,0 +1,770 @@
"""Helper classes for tests."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import pickle
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder()
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder.test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
def assertConnectedness(self, element):
|
||||
"""Ensure that next_element and previous_element are properly
|
||||
set for all descendants of the given element.
|
||||
"""
|
||||
earlier = None
|
||||
for e in element.descendants:
|
||||
if earlier:
|
||||
self.assertEqual(e, earlier.next_element)
|
||||
self.assertEqual(earlier, e.previous_element)
|
||||
earlier = e
|
||||
|
||||
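For orientation, a minimal sketch of how a concrete suite combines SoupTest with one of the smoke-test mixins defined below (MyParserTest is a hypothetical name; the real subclasses appear in the tests/ modules later in this diff):

# Hypothetical subclass, shown only to illustrate the pattern.
class MyParserTest(SoupTest, HTMLTreeBuilderSmokeTest):

    @property
    def default_builder(self):
        # Each suite overrides this to pick the builder under test.
        return HTMLParserTreeBuilder()
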
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def test_empty_element_tags(self):
        """Verify that all HTML4 and HTML5 empty element (aka void element) tags
        are handled correctly.
        """
        for name in [
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
            'spacer', 'frame'
        ]:
            soup = self.soup("")
            new_tag = soup.new_tag(name)
            self.assertEqual(True, new_tag.is_empty_element)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        # We test both Unicode and bytestring to verify that
        # process_markup correctly sets processing_instruction_class
        # even when the markup is already Unicode and there is no
        # need to process anything.
        markup = u"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.decode())

        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags,
        even if that would mean not prettifying the markup.
        """
        pre_markup = "<pre> </pre>"
        textarea_markup = "<textarea> woo\nwoo </textarea>"
        self.assertSoupEquals(pre_markup)
        self.assertSoupEquals(textarea_markup)

        soup = self.soup(pre_markup)
        self.assertEqual(soup.pre.prettify(), pre_markup)

        soup = self.soup(textarea_markup)
        self.assertEqual(soup.textarea.prettify(), textarea_markup)

        soup = self.soup("<textarea></textarea>")
        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_empty_element_tags(self):
        """Verify consistent handling of empty-element tags,
        no matter how they come in through the markup.
        """
        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertNotEqual(None, soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up &amp; stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        # Some tree builders call it iso8859-8, others call it iso-8859-8.
        # That's not a difference we really care about.
        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())

class XMLTreeBuilderSmokeTest(object):

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_processing_instruction(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
<script type="text/javascript">
</script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)


    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_find_by_prefixed_name(self):
        doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
    xmlns:ns1="http://example.com/ns1"
    xmlns:ns2="http://example.com/ns2">
<ns1:tag>foo</ns1:tag>
<ns1:tag>bar</ns1:tag>
<ns2:tag key="value">baz</ns2:tag>
</Document>
"""
        soup = self.soup(doc)

        # There are three <tag> tags.
        self.assertEqual(3, len(soup.find_all('tag')))

        # But two of them are ns1:tag and one of them is ns2:tag.
        self.assertEqual(2, len(soup.find_all('ns1:tag')))
        self.assertEqual(1, len(soup.find_all('ns2:tag')))

        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))

    def test_copy_tag_preserves_namespace(self):
        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""

        soup = self.soup(xml)
        tag = soup.document
        duplicate = copy.copy(tag)

        # The two tags have the same namespace prefix.
        self.assertEqual(tag.prefix, duplicate.prefix)


class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)


    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertTrue(isinstance(soup.contents[0], Comment))
        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)

def skipIf(condition, reason):
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        if condition:
            return nothing
        else:
            return test_item

    return decorator
1
libs/bs42.7/tests/__init__.py
Normal file
@@ -0,0 +1 @@
"The beautifulsoup tests."
147
libs/bs42.7/tests/test_builder_registry.py
Normal file
@@ -0,0 +1,147 @@
"""Tests of the builder registry."""
|
||||
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.builder import (
|
||||
builder_registry as registry,
|
||||
HTMLParserTreeBuilder,
|
||||
TreeBuilderRegistry,
|
||||
)
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError:
|
||||
HTML5LIB_PRESENT = False
|
||||
|
||||
try:
|
||||
from bs4.builder import (
|
||||
LXMLTreeBuilderForXML,
|
||||
LXMLTreeBuilder,
|
||||
)
|
||||
LXML_PRESENT = True
|
||||
except ImportError:
|
||||
LXML_PRESENT = False
|
||||
|
||||
|
||||
class BuiltInRegistryTest(unittest.TestCase):
|
||||
"""Test the built-in registry with the default builders registered."""
|
||||
|
||||
def test_combination(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('fast', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('strict', 'html'),
|
||||
HTMLParserTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
def test_lookup_by_markup_type(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('xml'), None)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||
|
||||
def test_named_library(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
self.assertEqual(registry.lookup('html.parser'),
|
||||
HTMLParserTreeBuilder)
|
||||
|
||||
def test_beautifulsoup_constructor_does_lookup(self):
|
||||
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
# This will create a warning about not explicitly
|
||||
# specifying a parser, but we'll ignore it.
|
||||
|
||||
# You can pass in a string.
|
||||
BeautifulSoup("", features="html")
|
||||
# Or a list of strings.
|
||||
BeautifulSoup("", features=["html", "fast"])
|
||||
|
||||
# You'll get an exception if BS can't find an appropriate
|
||||
# builder.
|
||||
self.assertRaises(ValueError, BeautifulSoup,
|
||||
"", features="no-such-feature")
|
||||
|
||||
class RegistryTest(unittest.TestCase):
|
||||
"""Test the TreeBuilderRegistry class in general."""
|
||||
|
||||
def setUp(self):
|
||||
self.registry = TreeBuilderRegistry()
|
||||
|
||||
def builder_for_features(self, *feature_list):
|
||||
cls = type('Builder_' + '_'.join(feature_list),
|
||||
(object,), {'features' : feature_list})
|
||||
|
||||
self.registry.register(cls)
|
||||
return cls
|
||||
|
||||
def test_register_with_no_features(self):
|
||||
builder = self.builder_for_features()
|
||||
|
||||
# Since the builder advertises no features, you can't find it
|
||||
# by looking up features.
|
||||
self.assertEqual(self.registry.lookup('foo'), None)
|
||||
|
||||
# But you can find it by doing a lookup with no features, if
|
||||
# this happens to be the only registered builder.
|
||||
self.assertEqual(self.registry.lookup(), builder)
|
||||
|
||||
def test_register_with_features_makes_lookup_succeed(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||
|
||||
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('baz'), None)
|
||||
|
||||
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||
builder1 = self.builder_for_features('foo')
|
||||
builder2 = self.builder_for_features('bar')
|
||||
self.assertEqual(self.registry.lookup(), builder2)
|
||||
|
||||
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||
self.assertEqual(self.registry.lookup(), None)
|
||||
|
||||
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||
has_one = self.builder_for_features('foo')
|
||||
has_the_other = self.builder_for_features('bar')
|
||||
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||
lacks_one = self.builder_for_features('bar')
|
||||
has_the_other = self.builder_for_features('foo')
|
||||
|
||||
# There are two builders featuring 'foo' and 'bar', but
|
||||
# the one that also features 'quux' was registered later.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||
has_both_late)
|
||||
|
||||
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||
has_both_early)
|
||||
|
||||
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||
builder1 = self.builder_for_features('foo', 'bar')
|
||||
builder2 = self.builder_for_features('foo', 'baz')
|
||||
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
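As a quick illustration of what these registry tests exercise (a sketch; it assumes lxml is installed so the 'fast' feature resolves to LXMLTreeBuilder):

# Sketch: the BeautifulSoup constructor performs exactly this lookup
# when given features=...; shown here done by hand.
from bs4 import BeautifulSoup
from bs4.builder import builder_registry

builder_class = builder_registry.lookup('fast', 'html')  # LXMLTreeBuilder if lxml is present
soup = BeautifulSoup("<b>bold</b>", builder=builder_class())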
36
libs/bs42.7/tests/test_docs.py
Normal file
@@ -0,0 +1,36 @@
"Test harness for doctests."
|
||||
|
||||
# pylint: disable-msg=E0611,W0142
|
||||
|
||||
__metaclass__ = type
|
||||
__all__ = [
|
||||
'additional_tests',
|
||||
]
|
||||
|
||||
import atexit
|
||||
import doctest
|
||||
import os
|
||||
#from pkg_resources import (
|
||||
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||
import unittest
|
||||
|
||||
DOCTEST_FLAGS = (
|
||||
doctest.ELLIPSIS |
|
||||
doctest.NORMALIZE_WHITESPACE |
|
||||
doctest.REPORT_NDIFF)
|
||||
|
||||
|
||||
# def additional_tests():
|
||||
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||
# doctest_files = [
|
||||
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||
# if resource_exists('bs4', 'docs'):
|
||||
# for name in resource_listdir('bs4', 'docs'):
|
||||
# if name.endswith('.txt'):
|
||||
# doctest_files.append(
|
||||
# os.path.abspath(
|
||||
# resource_filename('bs4', 'docs/%s' % name)))
|
||||
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||
# atexit.register(cleanup_resources)
|
||||
# return unittest.TestSuite((
|
||||
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
130
libs/bs42.7/tests/test_html5lib.py
Normal file
@@ -0,0 +1,130 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError, e:
|
||||
HTML5LIB_PRESENT = False
|
||||
from bs4.element import SoupStrainer
|
||||
from bs4.testing import (
|
||||
HTML5TreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not HTML5LIB_PRESENT,
|
||||
"html5lib seems not to be present, not testing its tree builder.")
|
||||
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTML5TreeBuilder()
|
||||
|
||||
def test_soupstrainer(self):
|
||||
# The html5lib tree builder does not support SoupStrainers.
|
||||
strainer = SoupStrainer("b")
|
||||
markup = "<p>A <b>bold</b> statement.</p>"
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(
|
||||
soup.decode(), self.document_for(markup))
|
||||
|
||||
self.assertTrue(
|
||||
"the html5lib tree builder doesn't support parse_only" in
|
||||
str(w[0].message))
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||
'</td></tr></tbody></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_xml_declaration_followed_by_doctype(self):
|
||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
</body>
|
||||
</html>'''
|
||||
soup = self.soup(markup)
|
||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||
|
||||
def test_reparented_markup(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
|
||||
def test_reparented_markup_ends_with_whitespace(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||
"""Verify that we keep the two whitespace nodes in this
|
||||
document distinct when reparenting the adjacent <tbody> tags.
|
||||
"""
|
||||
markup = '<table> <tbody><tbody><ims></tbody> </table>'
|
||||
soup = self.soup(markup)
|
||||
space1, space2 = soup.find_all(string=' ')
|
||||
tbody1, tbody2 = soup.find_all('tbody')
|
||||
assert space1.next_element is tbody1
|
||||
assert tbody2.next_element is space2
|
||||
|
||||
def test_reparented_markup_containing_children(self):
|
||||
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||
soup = self.soup(markup)
|
||||
noscript = soup.noscript
|
||||
self.assertEqual("target", noscript.next_element)
|
||||
target = soup.find(string='target')
|
||||
|
||||
# The 'aftermath' string was duplicated; we want the second one.
|
||||
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||
|
||||
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||
# but the 'target' string within is still connected to the
|
||||
# (second) 'aftermath' string.
|
||||
self.assertEqual(final_aftermath, target.next_element)
|
||||
self.assertEqual(target, final_aftermath.previous_element)
|
||||
|
||||
def test_processing_instruction(self):
|
||||
"""Processing instructions become comments."""
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
assert str(soup).startswith("<!--?PITarget PIContent?-->")
|
||||
|
||||
def test_cloned_multivalue_node(self):
|
||||
markup = b"""<a class="my_class"><p></a>"""
|
||||
soup = self.soup(markup)
|
||||
a1, a2 = soup.find_all('a')
|
||||
self.assertEqual(a1, a2)
|
||||
assert a1 is not a2
|
||||
|
||||
def test_foster_parenting(self):
|
||||
markup = b"""<table><td></tbody>A"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
34
libs/bs42.7/tests/test_htmlparser.py
Normal file
@@ -0,0 +1,34 @@
"""Tests to ensure that the html.parser tree builder generates good
|
||||
trees."""
|
||||
|
||||
from pdb import set_trace
|
||||
import pickle
|
||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
|
||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTMLParserTreeBuilder()
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_builder_is_pickled(self):
|
||||
"""Unlike most tree builders, HTMLParserTreeBuilder and will
|
||||
be restored after pickling.
|
||||
"""
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||
|
||||
def test_redundant_empty_element_closing_tags(self):
|
||||
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||
self.assertSoupEquals('</br></br></br>', "")
|
76
libs/bs42.7/tests/test_lxml.py
Normal file
@@ -0,0 +1,76 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
try:
|
||||
import lxml.etree
|
||||
LXML_PRESENT = True
|
||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||
except ImportError, e:
|
||||
LXML_PRESENT = False
|
||||
LXML_VERSION = (0,)
|
||||
|
||||
if LXML_PRESENT:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import Comment, Doctype, SoupStrainer
|
||||
from bs4.testing import skipIf
|
||||
from bs4.tests import test_htmlparser
|
||||
from bs4.testing import (
|
||||
HTMLTreeBuilderSmokeTest,
|
||||
XMLTreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its tree builder.")
|
||||
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilder()
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
|
||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||
# test if an old version of lxml is installed.
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_beautifulstonesoup_is_xml_parser(self):
|
||||
# Make sure that the deprecated BSS class uses an xml builder
|
||||
# if one is installed.
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = BeautifulStoneSoup("<b />")
|
||||
self.assertEqual(u"<b/>", unicode(soup.b))
|
||||
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its XML tree builder.")
|
||||
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilderForXML()
|
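For context, the two classes above smoke-test the builders behind the "lxml" and "xml" parser names; a sketch, assuming lxml is installed:

    from bs4 import BeautifulSoup

    html_soup = BeautifulSoup("<b />", "lxml")  # LXMLTreeBuilder
    xml_soup = BeautifulSoup("<b />", "xml")    # LXMLTreeBuilderForXML
    print(xml_soup.decode())  # typically an XML declaration followed by <b/>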
501  libs/bs42.7/tests/test_soup.py  Normal file
@@ -0,0 +1,501 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

from pdb import set_trace
import logging
import unittest
import sys
import tempfile

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    )
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
    )
from bs4.testing import (
    SoupTest,
    skipIf,
    )
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError, e:
    LXML_PRESENT = False

PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))

class TestConstructor(SoupTest):

    def test_short_unicode_input(self):
        data = u"<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual(u"éé", soup.h1.string)

    def test_embedded_null(self):
        data = u"<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual(u"foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)


class TestWarnings(SoupTest):
    # Note: the original file declared TestWarnings twice, so the first
    # batch of tests was shadowed; the two classes are merged here.

    def _assert_no_parser_specified(self, s, is_there=True):
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        # Be aware this isn't the only warning that can be raised during
        # execution.
        self.assertTrue(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup(u"http://www.crummyunicode.com/")
        self.assertTrue(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_bytes_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(u"http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    def setUp(self):
        self.sub = EntitySubstitution

    # Note: several expected strings below contained HTML entities that
    # the diff rendering had decoded into literal characters; they are
    # reconstructed here from the upstream tests.

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which would otherwise realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, unicode))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = u"Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through UnicodeDammit.detwingle, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)


class TestNamespacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    def test_charset_meta_attribute_value(self):
        # Renamed from a duplicate test_content_meta_attribute_value,
        # which shadowed the test below.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
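The UnicodeDammit tests above reduce to this usage pattern (editor's sketch, mirroring test_ignore_inappropriate_codecs):

    from bs4.dammit import UnicodeDammit

    utf8_data = u"Räksmörgås".encode("utf-8")
    dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
    print(dammit.original_encoding)  # 'utf-8': the inappropriate suggestion is ignored
    print(dammit.unicode_markup)     # the decoded unicode string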
2050  libs/bs42.7/tests/test_tree.py  Normal file
File diff suppressed because it is too large.
3  libs/concurrent2.7/__init__.py  Normal file
@@ -0,0 +1,3 @@
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)
23  libs/concurrent2.7/futures/__init__.py  Normal file
@@ -0,0 +1,23 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Execute computations asynchronously using threads or processes."""

__author__ = 'Brian Quinlan (brian@sweetapp.com)'

from concurrent.futures._base import (FIRST_COMPLETED,
                                      FIRST_EXCEPTION,
                                      ALL_COMPLETED,
                                      CancelledError,
                                      TimeoutError,
                                      Future,
                                      Executor,
                                      wait,
                                      as_completed)
from concurrent.futures.thread import ThreadPoolExecutor

try:
    from concurrent.futures.process import ProcessPoolExecutor
except ImportError:
    # some platforms don't have multiprocessing
    pass
607  libs/concurrent2.7/futures/_base.py  Normal file
@@ -0,0 +1,607 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

import collections
import logging
import threading
import itertools
import time

__author__ = 'Brian Quinlan (brian@sweetapp.com)'

FIRST_COMPLETED = 'FIRST_COMPLETED'
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
ALL_COMPLETED = 'ALL_COMPLETED'
_AS_COMPLETED = '_AS_COMPLETED'

# Possible future states (for internal use by the futures package).
PENDING = 'PENDING'
RUNNING = 'RUNNING'
# The future was cancelled by the user...
CANCELLED = 'CANCELLED'
# ...and _Waiter.add_cancelled() was called by a worker.
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
FINISHED = 'FINISHED'

_FUTURE_STATES = [
    PENDING,
    RUNNING,
    CANCELLED,
    CANCELLED_AND_NOTIFIED,
    FINISHED
]

_STATE_TO_DESCRIPTION_MAP = {
    PENDING: "pending",
    RUNNING: "running",
    CANCELLED: "cancelled",
    CANCELLED_AND_NOTIFIED: "cancelled",
    FINISHED: "finished"
}

# Logger for internal use by the futures package.
LOGGER = logging.getLogger("concurrent.futures")

class Error(Exception):
    """Base class for all future-related exceptions."""
    pass

class CancelledError(Error):
    """The Future was cancelled."""
    pass

class TimeoutError(Error):
    """The operation exceeded the given deadline."""
    pass

class _Waiter(object):
    """Provides the event that wait() and as_completed() block on."""
    def __init__(self):
        self.event = threading.Event()
        self.finished_futures = []

    def add_result(self, future):
        self.finished_futures.append(future)

    def add_exception(self, future):
        self.finished_futures.append(future)

    def add_cancelled(self, future):
        self.finished_futures.append(future)

class _AsCompletedWaiter(_Waiter):
    """Used by as_completed()."""

    def __init__(self):
        super(_AsCompletedWaiter, self).__init__()
        self.lock = threading.Lock()

    def add_result(self, future):
        with self.lock:
            super(_AsCompletedWaiter, self).add_result(future)
            self.event.set()

    def add_exception(self, future):
        with self.lock:
            super(_AsCompletedWaiter, self).add_exception(future)
            self.event.set()

    def add_cancelled(self, future):
        with self.lock:
            super(_AsCompletedWaiter, self).add_cancelled(future)
            self.event.set()

class _FirstCompletedWaiter(_Waiter):
    """Used by wait(return_when=FIRST_COMPLETED)."""

    def add_result(self, future):
        super(_FirstCompletedWaiter, self).add_result(future)
        self.event.set()

    def add_exception(self, future):
        super(_FirstCompletedWaiter, self).add_exception(future)
        self.event.set()

    def add_cancelled(self, future):
        super(_FirstCompletedWaiter, self).add_cancelled(future)
        self.event.set()

class _AllCompletedWaiter(_Waiter):
    """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""

    def __init__(self, num_pending_calls, stop_on_exception):
        self.num_pending_calls = num_pending_calls
        self.stop_on_exception = stop_on_exception
        self.lock = threading.Lock()
        super(_AllCompletedWaiter, self).__init__()

    def _decrement_pending_calls(self):
        with self.lock:
            self.num_pending_calls -= 1
            if not self.num_pending_calls:
                self.event.set()

    def add_result(self, future):
        super(_AllCompletedWaiter, self).add_result(future)
        self._decrement_pending_calls()

    def add_exception(self, future):
        super(_AllCompletedWaiter, self).add_exception(future)
        if self.stop_on_exception:
            self.event.set()
        else:
            self._decrement_pending_calls()

    def add_cancelled(self, future):
        super(_AllCompletedWaiter, self).add_cancelled(future)
        self._decrement_pending_calls()

class _AcquireFutures(object):
    """A context manager that does an ordered acquire of Future conditions."""

    def __init__(self, futures):
        self.futures = sorted(futures, key=id)

    def __enter__(self):
        for future in self.futures:
            future._condition.acquire()

    def __exit__(self, *args):
        for future in self.futures:
            future._condition.release()

def _create_and_install_waiters(fs, return_when):
    if return_when == _AS_COMPLETED:
        waiter = _AsCompletedWaiter()
    elif return_when == FIRST_COMPLETED:
        waiter = _FirstCompletedWaiter()
    else:
        pending_count = sum(
                f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)

        if return_when == FIRST_EXCEPTION:
            waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
        elif return_when == ALL_COMPLETED:
            waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
        else:
            raise ValueError("Invalid return condition: %r" % return_when)

    for f in fs:
        f._waiters.append(waiter)

    return waiter

def as_completed(fs, timeout=None):
    """An iterator over the given futures that yields each as it completes.

    Args:
        fs: The sequence of Futures (possibly created by different Executors)
            to iterate over.
        timeout: The maximum number of seconds to wait. If None, then there
            is no limit on the wait time.

    Returns:
        An iterator that yields the given Futures as they complete (finished or
        cancelled). If any given Futures are duplicated, they will be returned
        once.

    Raises:
        TimeoutError: If the entire result iterator could not be generated
            before the given timeout.
    """
    if timeout is not None:
        end_time = timeout + time.time()

    fs = set(fs)
    with _AcquireFutures(fs):
        finished = set(
                f for f in fs
                if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
        pending = fs - finished
        waiter = _create_and_install_waiters(fs, _AS_COMPLETED)

    try:
        for future in finished:
            yield future

        while pending:
            if timeout is None:
                wait_timeout = None
            else:
                wait_timeout = end_time - time.time()
                if wait_timeout < 0:
                    raise TimeoutError(
                            '%d (of %d) futures unfinished' % (
                            len(pending), len(fs)))

            waiter.event.wait(wait_timeout)

            with waiter.lock:
                finished = waiter.finished_futures
                waiter.finished_futures = []
                waiter.event.clear()

            for future in finished:
                yield future
                pending.remove(future)

    finally:
        for f in fs:
            with f._condition:
                f._waiters.remove(waiter)
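# Editor's note: a minimal usage sketch of as_completed() (not part of the
# original module); 'pool' and 'fetch' are hypothetical stand-ins.
#
#     futures = [pool.submit(fetch, u) for u in urls]
#     for future in as_completed(futures, timeout=30):
#         print(future.result())  # yielded in completion order, not submit order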
DoneAndNotDoneFutures = collections.namedtuple(
        'DoneAndNotDoneFutures', 'done not_done')
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
    """Wait for the futures in the given sequence to complete.

    Args:
        fs: The sequence of Futures (possibly created by different Executors)
            to wait upon.
        timeout: The maximum number of seconds to wait. If None, then there
            is no limit on the wait time.
        return_when: Indicates when this function should return. The options
            are:

            FIRST_COMPLETED - Return when any future finishes or is
                              cancelled.
            FIRST_EXCEPTION - Return when any future finishes by raising an
                              exception. If no future raises an exception
                              then it is equivalent to ALL_COMPLETED.
            ALL_COMPLETED -   Return when all futures finish or are cancelled.

    Returns:
        A named 2-tuple of sets. The first set, named 'done', contains the
        futures that completed (finished or were cancelled) before the wait
        completed. The second set, named 'not_done', contains uncompleted
        futures.
    """
    with _AcquireFutures(fs):
        done = set(f for f in fs
                   if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
        not_done = set(fs) - done

        if (return_when == FIRST_COMPLETED) and done:
            return DoneAndNotDoneFutures(done, not_done)
        elif (return_when == FIRST_EXCEPTION) and done:
            if any(f for f in done
                   if not f.cancelled() and f.exception() is not None):
                return DoneAndNotDoneFutures(done, not_done)

        if len(done) == len(fs):
            return DoneAndNotDoneFutures(done, not_done)

        waiter = _create_and_install_waiters(fs, return_when)

    waiter.event.wait(timeout)
    for f in fs:
        with f._condition:
            f._waiters.remove(waiter)

    done.update(waiter.finished_futures)
    return DoneAndNotDoneFutures(done, set(fs) - done)
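# Editor's note: a sketch of the common wait() idiom (not part of the
# original module); 'fs' is a list of futures from any Executor.
#
#     done, not_done = wait(fs, timeout=5, return_when=FIRST_COMPLETED)
#     # 'done' holds the futures that finished or were cancelled within the
#     # timeout; 'not_done' holds the rest, which are still pending/running.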
class Future(object):
    """Represents the result of an asynchronous computation."""

    def __init__(self):
        """Initializes the future. Should not be called by clients."""
        self._condition = threading.Condition()
        self._state = PENDING
        self._result = None
        self._exception = None
        self._traceback = None
        self._waiters = []
        self._done_callbacks = []

    def _invoke_callbacks(self):
        for callback in self._done_callbacks:
            try:
                callback(self)
            except Exception:
                LOGGER.exception('exception calling callback for %r', self)

    def __repr__(self):
        with self._condition:
            if self._state == FINISHED:
                if self._exception:
                    return '<Future at %s state=%s raised %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._exception.__class__.__name__)
                else:
                    return '<Future at %s state=%s returned %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._result.__class__.__name__)
            return '<Future at %s state=%s>' % (
                    hex(id(self)),
                    _STATE_TO_DESCRIPTION_MAP[self._state])

    def cancel(self):
        """Cancel the future if possible.

        Returns True if the future was cancelled, False otherwise. A future
        cannot be cancelled if it is running or has already completed.
        """
        with self._condition:
            if self._state in [RUNNING, FINISHED]:
                return False

            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                return True

            self._state = CANCELLED
            self._condition.notify_all()

        self._invoke_callbacks()
        return True

    def cancelled(self):
        """Return True if the future was cancelled."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]

    def running(self):
        """Return True if the future is currently executing."""
        with self._condition:
            return self._state == RUNNING

    def done(self):
        """Return True if the future was cancelled or finished executing."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]

    def __get_result(self):
        if self._exception:
            # Python 2 three-argument raise: re-raise the stored exception
            # with its original traceback.
            raise type(self._exception), self._exception, self._traceback
        else:
            return self._result

    def add_done_callback(self, fn):
        """Attaches a callable that will be called when the future finishes.

        Args:
            fn: A callable that will be called with this future as its only
                argument when the future completes or is cancelled. The callable
                will always be called by a thread in the same process in which
                it was added. If the future has already completed or been
                cancelled then the callable will be called immediately. These
                callables are called in the order that they were added.
        """
        with self._condition:
            if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
                self._done_callbacks.append(fn)
                return
        fn(self)
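# Editor's note: add_done_callback() usage sketch (not part of the original
# module). The callback receives the future itself; 'pool' is any
# already-constructed Executor (assumption).
#
#     def log_outcome(f):
#         if f.cancelled():
#             print('cancelled')
#         else:
#             print('result: %r' % (f.result(),))
#
#     future = pool.submit(pow, 2, 10)
#     future.add_done_callback(log_outcome)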
    def result(self, timeout=None):
        """Return the result of the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the result if the future
                isn't done. If None, then there is no limit on the wait time.

        Returns:
            The result of the call that the future represents.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
            Exception: If the call raised then that exception will be raised.
        """
        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()

            self._condition.wait(timeout)

            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()
            else:
                raise TimeoutError()

    def exception_info(self, timeout=None):
        """Return a tuple of (exception, traceback) raised by the call that the
        future represents.

        Args:
            timeout: The number of seconds to wait for the exception if the
                future isn't done. If None, then there is no limit on the wait
                time.

        Returns:
            The exception raised by the call that the future represents or None
            if the call completed without raising.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
        """
        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception, self._traceback

            self._condition.wait(timeout)

            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception, self._traceback
            else:
                raise TimeoutError()

    def exception(self, timeout=None):
        """Return the exception raised by the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the exception if the
                future isn't done. If None, then there is no limit on the wait
                time.

        Returns:
            The exception raised by the call that the future represents or None
            if the call completed without raising.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
        """
        return self.exception_info(timeout)[0]

    # The following methods should only be used by Executors and in tests.
    def set_running_or_notify_cancel(self):
        """Mark the future as running or process any cancel notifications.

        Should only be used by Executor implementations and unit tests.

        If the future has been cancelled (cancel() was called and returned
        True) then any threads waiting on the future completing (through calls
        to as_completed() or wait()) are notified and False is returned.

        If the future was not cancelled then it is put in the running state
        (future calls to running() will return True) and True is returned.

        This method should be called by Executor implementations before
        executing the work associated with this future. If this method returns
        False then the work should not be executed.

        Returns:
            False if the Future was cancelled, True otherwise.

        Raises:
            RuntimeError: if this method was already called or if set_result()
                or set_exception() was called.
        """
        with self._condition:
            if self._state == CANCELLED:
                self._state = CANCELLED_AND_NOTIFIED
                for waiter in self._waiters:
                    waiter.add_cancelled(self)
                # self._condition.notify_all() is not necessary because
                # self.cancel() triggers a notification.
                return False
            elif self._state == PENDING:
                self._state = RUNNING
                return True
            else:
                LOGGER.critical('Future %s in unexpected state: %s',
                                id(self),
                                self._state)
                raise RuntimeError('Future in unexpected state')
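# Editor's note: the contract above is what executor workers follow; compare
# _WorkItem.run() in thread.py later in this commit. Sketch:
#
#     if future.set_running_or_notify_cancel():
#         try:
#             result = fn(*args, **kwargs)
#         except BaseException:
#             e, tb = sys.exc_info()[1:]
#             future.set_exception_info(e, tb)
#         else:
#             future.set_result(result)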
    def set_result(self, result):
        """Sets the return value of work associated with the future.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._result = result
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_result(self)
            self._condition.notify_all()
        self._invoke_callbacks()

    def set_exception_info(self, exception, traceback):
        """Sets the result of the future as being the given exception
        and traceback.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._exception = exception
            self._traceback = traceback
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_exception(self)
            self._condition.notify_all()
        self._invoke_callbacks()

    def set_exception(self, exception):
        """Sets the result of the future as being the given exception.

        Should only be used by Executor implementations and unit tests.
        """
        self.set_exception_info(exception, None)

class Executor(object):
    """This is an abstract base class for concrete asynchronous executors."""

    def submit(self, fn, *args, **kwargs):
        """Submits a callable to be executed with the given arguments.

        Schedules the callable to be executed as fn(*args, **kwargs) and returns
        a Future instance representing the execution of the callable.

        Returns:
            A Future representing the given call.
        """
        raise NotImplementedError()

    def map(self, fn, *iterables, **kwargs):
        """Returns an iterator equivalent to map(fn, *iterables).

        Args:
            fn: A callable that will take as many arguments as there are
                passed iterables.
            timeout: The maximum number of seconds to wait. If None, then there
                is no limit on the wait time.

        Returns:
            An iterator equivalent to: map(func, *iterables) but the calls may
            be evaluated out-of-order.

        Raises:
            TimeoutError: If the entire result iterator could not be generated
                before the given timeout.
            Exception: If fn(*args) raises for any values.
        """
        timeout = kwargs.get('timeout')
        if timeout is not None:
            end_time = timeout + time.time()

        fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]

        # Yield must be hidden in closure so that the futures are submitted
        # before the first iterator value is required.
        def result_iterator():
            try:
                for future in fs:
                    if timeout is None:
                        yield future.result()
                    else:
                        yield future.result(end_time - time.time())
            finally:
                for future in fs:
                    future.cancel()
        return result_iterator()

    def shutdown(self, wait=True):
        """Clean-up the resources associated with the Executor.

        It is safe to call this method several times. Otherwise, no other
        methods can be called after this one.

        Args:
            wait: If True then shutdown will not return until all running
                futures have finished executing and the resources used by the
                executor have been reclaimed.
        """
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown(wait=True)
        return False
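Both map() and the context-manager protocol above combine into the usual idiom (editor's sketch, not part of the module):

    with ThreadPoolExecutor(max_workers=4) as pool:  # __exit__ calls shutdown(wait=True)
        for n in pool.map(len, ['a', 'bb', 'ccc'], timeout=10):
            print(n)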
359  libs/concurrent2.7/futures/process.py  Normal file
@@ -0,0 +1,359 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Implements ProcessPoolExecutor.

The following diagram and text describe the data-flow through the system:

|======================= In-process =====================|== Out-of-process ==|

+----------+     +----------+       +--------+     +-----------+    +---------+
|          |  => | Work Ids |    => |        |  => | Call Q    | => |         |
|          |     +----------+       |        |     +-----------+    |         |
|          |     | ...      |       |        |     | ...       |    |         |
|          |     | 6        |       |        |     | 5, call() |    |         |
|          |     | 7        |       |        |     | ...       |    |         |
| Process  |     | ...      |       | Local  |     +-----------+    | Process |
|  Pool    |     +----------+       | Worker |                      |  #1..n  |
| Executor |                        | Thread |                      |         |
|          |     +----------- +     |        |     +-----------+    |         |
|          | <=> | Work Items | <=> |        | <=  | Result Q  | <= |         |
|          |     +------------+     |        |     +-----------+    |         |
|          |     | 6: call() |      |        |     | ...       |    |         |
|          |     |    future |      |        |     | 4, result |    |         |
|          |     | ...       |      |        |     | 3, except |    |         |
+----------+     +------------+     +--------+     +-----------+    +---------+

Executor.submit() called:
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
- adds the id of the _WorkItem to the "Work Ids" queue

Local worker thread:
- reads work ids from the "Work Ids" queue and looks up the corresponding
  WorkItem from the "Work Items" dict: if the work item has been cancelled then
  it is simply removed from the dict, otherwise it is repackaged as a
  _CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
  until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
  calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
- reads _ResultItems from "Result Q", updates the future stored in the
  "Work Items" dict and deletes the dict entry

Process #1..n:
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
  _ResultItems in "Result Q"
"""
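# Editor's note: condensing the diagram above, the life of one submitted call
# (a restatement, not part of the original module):
#
#     submit(fn)          -> _WorkItem stored in "Work Items" under a new id;
#                            the id is pushed onto the "Work Ids" queue
#     local worker thread -> pops the id, repackages the item as a _CallItem
#                            and puts it on "Call Q" (unless cancelled)
#     worker process      -> runs the call, puts a _ResultItem on "Result Q"
#     local worker thread -> pops the _ResultItem and resolves the Future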
||||
import atexit
|
||||
from concurrent.futures import _base
|
||||
import Queue as queue
|
||||
import multiprocessing
|
||||
import threading
|
||||
import weakref
|
||||
import sys
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
# Workers are created as daemon threads and processes. This is done to allow the
|
||||
# interpreter to exit when there are still idle processes in a
|
||||
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
|
||||
# allowing workers to die with the interpreter has two undesirable properties:
|
||||
# - The workers would still be running during interpretor shutdown,
|
||||
# meaning that they would fail in unpredictable ways.
|
||||
# - The workers could be killed while evaluating a work item, which could
|
||||
# be bad if the callable being evaluated has external side-effects e.g.
|
||||
# writing to a file.
|
||||
#
|
||||
# To work around this problem, an exit handler is installed which tells the
|
||||
# workers to exit when their work queues are empty and then waits until the
|
||||
# threads/processes finish.
|
||||
|
||||
_threads_queues = weakref.WeakKeyDictionary()
|
||||
_shutdown = False
|
||||
|
||||
def _python_exit():
|
||||
global _shutdown
|
||||
_shutdown = True
|
||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
||||
for t, q in items:
|
||||
q.put(None)
|
||||
for t, q in items:
|
||||
t.join(sys.maxint)
|
||||
|
||||
# Controls how many more calls than processes will be queued in the call queue.
|
||||
# A smaller number will mean that processes spend more time idle waiting for
|
||||
# work while a larger number will make Future.cancel() succeed less frequently
|
||||
# (Futures in the call queue cannot be cancelled).
|
||||
EXTRA_QUEUED_CALLS = 1
|
||||
|
||||
class _WorkItem(object):
|
||||
def __init__(self, future, fn, args, kwargs):
|
||||
self.future = future
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
class _ResultItem(object):
|
||||
def __init__(self, work_id, exception=None, result=None):
|
||||
self.work_id = work_id
|
||||
self.exception = exception
|
||||
self.result = result
|
||||
|
||||
class _CallItem(object):
|
||||
def __init__(self, work_id, fn, args, kwargs):
|
||||
self.work_id = work_id
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def _process_worker(call_queue, result_queue):
|
||||
"""Evaluates calls from call_queue and places the results in result_queue.
|
||||
|
||||
This worker is run in a separate process.
|
||||
|
||||
Args:
|
||||
call_queue: A multiprocessing.Queue of _CallItems that will be read and
|
||||
evaluated by the worker.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems that will written
|
||||
to by the worker.
|
||||
shutdown: A multiprocessing.Event that will be set as a signal to the
|
||||
worker that it should exit when call_queue is empty.
|
||||
"""
|
||||
while True:
|
||||
call_item = call_queue.get(block=True)
|
||||
if call_item is None:
|
||||
# Wake up queue management thread
|
||||
result_queue.put(None)
|
||||
return
|
||||
try:
|
||||
r = call_item.fn(*call_item.args, **call_item.kwargs)
|
||||
except BaseException:
|
||||
e = sys.exc_info()[1]
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
exception=e))
|
||||
else:
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
result=r))
|
||||
|
||||
def _add_call_item_to_queue(pending_work_items,
|
||||
work_ids,
|
||||
call_queue):
|
||||
"""Fills call_queue with _WorkItems from pending_work_items.
|
||||
|
||||
This function never blocks.
|
||||
|
||||
Args:
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
|
||||
are consumed and the corresponding _WorkItems from
|
||||
pending_work_items are transformed into _CallItems and put in
|
||||
call_queue.
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems.
|
||||
"""
|
||||
while True:
|
||||
if call_queue.full():
|
||||
return
|
||||
try:
|
||||
work_id = work_ids.get(block=False)
|
||||
except queue.Empty:
|
||||
return
|
||||
else:
|
||||
work_item = pending_work_items[work_id]
|
||||
|
||||
if work_item.future.set_running_or_notify_cancel():
|
||||
call_queue.put(_CallItem(work_id,
|
||||
work_item.fn,
|
||||
work_item.args,
|
||||
work_item.kwargs),
|
||||
block=True)
|
||||
else:
|
||||
del pending_work_items[work_id]
|
||||
continue
|
||||
|
||||
def _queue_management_worker(executor_reference,
|
||||
processes,
|
||||
pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue,
|
||||
result_queue):
|
||||
"""Manages the communication between this process and the worker processes.
|
||||
|
||||
This function is run in a local thread.
|
||||
|
||||
Args:
|
||||
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
|
||||
this thread. Used to determine if the ProcessPoolExecutor has been
|
||||
garbage collected and that this function can exit.
|
||||
process: A list of the multiprocessing.Process instances used as
|
||||
workers.
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems for processing by the process workers.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems generated by the
|
||||
process workers.
|
||||
"""
|
||||
nb_shutdown_processes = [0]
|
||||
def shutdown_one_process():
|
||||
"""Tell a worker to terminate, which will in turn wake us again"""
|
||||
call_queue.put(None)
|
||||
nb_shutdown_processes[0] += 1
|
||||
while True:
|
||||
_add_call_item_to_queue(pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue)
|
||||
|
||||
result_item = result_queue.get(block=True)
|
||||
if result_item is not None:
|
||||
work_item = pending_work_items[result_item.work_id]
|
||||
del pending_work_items[result_item.work_id]
|
||||
|
||||
if result_item.exception:
|
||||
work_item.future.set_exception(result_item.exception)
|
||||
else:
|
||||
work_item.future.set_result(result_item.result)
|
||||
# Delete references to object. See issue16284
|
||||
del work_item
|
||||
# Check whether we should start shutting down.
|
||||
executor = executor_reference()
|
||||
# No more work items can be added if:
|
||||
# - The interpreter is shutting down OR
|
||||
# - The executor that owns this worker has been collected OR
|
||||
# - The executor that owns this worker has been shutdown.
|
||||
if _shutdown or executor is None or executor._shutdown_thread:
|
||||
# Since no new work items can be added, it is safe to shutdown
|
||||
# this thread if there are no pending work items.
|
||||
if not pending_work_items:
|
||||
while nb_shutdown_processes[0] < len(processes):
|
||||
shutdown_one_process()
|
||||
                # If .join() is not called on the created processes then
                # some multiprocessing.Queue methods may deadlock on Mac OS X.
                for p in processes:
                    p.join()
                call_queue.close()
                return
        del executor

_system_limits_checked = False
_system_limited = None
def _check_system_limits():
    global _system_limits_checked, _system_limited
    if _system_limits_checked:
        if _system_limited:
            raise NotImplementedError(_system_limited)
    _system_limits_checked = True
    try:
        import os
        nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
    except (AttributeError, ValueError):
        # sysconf not available or setting not available
        return
    if nsems_max == -1:
        # indeterminate limit, assume that limit is determined
        # by available memory only
        return
    if nsems_max >= 256:
        # minimum number of semaphores available
        # according to POSIX
        return
    _system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
    raise NotImplementedError(_system_limited)

class ProcessPoolExecutor(_base.Executor):
    def __init__(self, max_workers=None):
        """Initializes a new ProcessPoolExecutor instance.

        Args:
            max_workers: The maximum number of processes that can be used to
                execute the given calls. If None or not given then as many
                worker processes will be created as the machine has processors.
        """
        _check_system_limits()

        if max_workers is None:
            self._max_workers = multiprocessing.cpu_count()
        else:
            self._max_workers = max_workers

        # Make the call queue slightly larger than the number of processes to
        # prevent the worker processes from idling. But don't make it too big
        # because futures in the call queue cannot be cancelled.
        self._call_queue = multiprocessing.Queue(self._max_workers +
                                                 EXTRA_QUEUED_CALLS)
        self._result_queue = multiprocessing.Queue()
        self._work_ids = queue.Queue()
        self._queue_management_thread = None
        self._processes = set()

        # Shutdown is a two-step process.
        self._shutdown_thread = False
        self._shutdown_lock = threading.Lock()
        self._queue_count = 0
        self._pending_work_items = {}

    def _start_queue_management_thread(self):
        # When the executor gets lost, the weakref callback will wake up
        # the queue management thread.
        def weakref_cb(_, q=self._result_queue):
            q.put(None)
        if self._queue_management_thread is None:
            self._queue_management_thread = threading.Thread(
                    target=_queue_management_worker,
                    args=(weakref.ref(self, weakref_cb),
                          self._processes,
                          self._pending_work_items,
                          self._work_ids,
                          self._call_queue,
                          self._result_queue))
            self._queue_management_thread.daemon = True
            self._queue_management_thread.start()
            _threads_queues[self._queue_management_thread] = self._result_queue

    def _adjust_process_count(self):
        for _ in range(len(self._processes), self._max_workers):
            p = multiprocessing.Process(
                    target=_process_worker,
                    args=(self._call_queue,
                          self._result_queue))
            p.start()
            self._processes.add(p)

    def submit(self, fn, *args, **kwargs):
        with self._shutdown_lock:
            if self._shutdown_thread:
                raise RuntimeError('cannot schedule new futures after shutdown')

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._pending_work_items[self._queue_count] = w
            self._work_ids.put(self._queue_count)
            self._queue_count += 1
            # Wake up queue management thread
            self._result_queue.put(None)

            self._start_queue_management_thread()
            self._adjust_process_count()
            return f
    submit.__doc__ = _base.Executor.submit.__doc__

    def shutdown(self, wait=True):
        with self._shutdown_lock:
            self._shutdown_thread = True
        if self._queue_management_thread:
            # Wake up queue management thread
            self._result_queue.put(None)
            if wait:
                self._queue_management_thread.join(sys.maxint)
        # To reduce the risk of opening too many files, remove references to
        # objects that use file descriptors.
        self._queue_management_thread = None
        self._call_queue = None
        self._result_queue = None
        self._processes = None
    shutdown.__doc__ = _base.Executor.shutdown.__doc__

atexit.register(_python_exit)
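For reference, a minimal usage sketch of the executor above (not part of the vendored file; the worker count and the square() workload are illustrative only, assuming the backport is importable as concurrent.futures):

# Illustrative only -- submit() returns a Future whose result() blocks
# until a worker process finishes the call.
from concurrent.futures import ProcessPoolExecutor

def square(x):
    return x * x

if __name__ == '__main__':
    executor = ProcessPoolExecutor(max_workers=2)
    futures = [executor.submit(square, n) for n in range(5)]
    print [f.result() for f in futures]  # [0, 1, 4, 9, 16]
    executor.shutdown(wait=True)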
134
libs/concurrent2.7/futures/thread.py
Normal file
@@ -0,0 +1,134 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Implements ThreadPoolExecutor."""

import atexit
from concurrent.futures import _base
import Queue as queue
import threading
import weakref
import sys

__author__ = 'Brian Quinlan (brian@sweetapp.com)'

# Workers are created as daemon threads. This is done to allow the interpreter
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
# pool (i.e. shutdown() was not called). However, allowing workers to die with
# the interpreter has two undesirable properties:
#   - The workers would still be running during interpreter shutdown,
#     meaning that they would fail in unpredictable ways.
#   - The workers could be killed while evaluating a work item, which could
#     be bad if the callable being evaluated has external side-effects e.g.
#     writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads finish.

_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False

def _python_exit():
    global _shutdown
    _shutdown = True
    items = list(_threads_queues.items()) if _threads_queues else ()
    for t, q in items:
        q.put(None)
    for t, q in items:
        t.join(sys.maxint)

atexit.register(_python_exit)

class _WorkItem(object):
    def __init__(self, future, fn, args, kwargs):
        self.future = future
        self.fn = fn
        self.args = args
        self.kwargs = kwargs

    def run(self):
        if not self.future.set_running_or_notify_cancel():
            return

        try:
            result = self.fn(*self.args, **self.kwargs)
        except BaseException:
            e, tb = sys.exc_info()[1:]
            self.future.set_exception_info(e, tb)
        else:
            self.future.set_result(result)

def _worker(executor_reference, work_queue):
    try:
        while True:
            work_item = work_queue.get(block=True)
            if work_item is not None:
                work_item.run()
                # Delete references to object. See issue16284
                del work_item
                continue
            executor = executor_reference()
            # Exit if:
            #   - The interpreter is shutting down OR
            #   - The executor that owns the worker has been collected OR
            #   - The executor that owns the worker has been shutdown.
            if _shutdown or executor is None or executor._shutdown:
                # Notice other workers
                work_queue.put(None)
                return
            del executor
    except BaseException:
        _base.LOGGER.critical('Exception in worker', exc_info=True)

class ThreadPoolExecutor(_base.Executor):
    def __init__(self, max_workers):
        """Initializes a new ThreadPoolExecutor instance.

        Args:
            max_workers: The maximum number of threads that can be used to
                execute the given calls.
        """
        self._max_workers = max_workers
        self._work_queue = queue.Queue()
        self._threads = set()
        self._shutdown = False
        self._shutdown_lock = threading.Lock()

    def submit(self, fn, *args, **kwargs):
        with self._shutdown_lock:
            if self._shutdown:
                raise RuntimeError('cannot schedule new futures after shutdown')

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._work_queue.put(w)
            self._adjust_thread_count()
            return f
    submit.__doc__ = _base.Executor.submit.__doc__

    def _adjust_thread_count(self):
        # When the executor gets lost, the weakref callback will wake up
        # the worker threads.
        def weakref_cb(_, q=self._work_queue):
            q.put(None)
        # TODO(bquinlan): Should avoid creating new threads if there are more
        # idle threads than items in the work queue.
        if len(self._threads) < self._max_workers:
            t = threading.Thread(target=_worker,
                                 args=(weakref.ref(self, weakref_cb),
                                       self._work_queue))
            t.daemon = True
            t.start()
            self._threads.add(t)
            _threads_queues[t] = self._work_queue

    def shutdown(self, wait=True):
        with self._shutdown_lock:
            self._shutdown = True
            self._work_queue.put(None)
        if wait:
            for t in self._threads:
                t.join(sys.maxint)
    shutdown.__doc__ = _base.Executor.shutdown.__doc__
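A short hedged sketch of the thread-pool counterpart (illustrative only; the URL is a placeholder, and the module is assumed importable as concurrent.futures):

# Illustrative only -- result() re-raises any exception captured by
# _WorkItem.run() above, so errors in fetch() surface in the caller.
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    import urllib2
    return len(urllib2.urlopen(url).read())

pool = ThreadPoolExecutor(max_workers=4)
future = pool.submit(fetch, 'http://example.com')
print future.result()
pool.shutdown(wait=True)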
516
libs/deathbycaptcha2.7.py
Normal file
@@ -0,0 +1,516 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""Death by Captcha HTTP and socket API clients.

There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
socket ones. Both offer the same functionality, with the socket API
sporting faster responses and using way less connections.

To access the socket API, use SocketClient class; for the HTTP API, use
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
connection opened and serializes all API requests sent through it, thus
it is advised to keep a pool of them if your script is heavily
multithreaded.

Both SocketClient and HttpClient give you the following methods:

get_user()
    Returns your DBC account details as a dict with the following keys:

    "user": your account numeric ID; if login fails, it will be the only
        item with the value of 0;
    "rate": your CAPTCHA rate, i.e. how much you will be charged for one
        solved CAPTCHA in US cents;
    "balance": your DBC account balance in US cents;
    "is_banned": flag indicating whether your account is suspended or not.

get_balance()
    Returns your DBC account balance in US cents.

get_captcha(cid)
    Returns an uploaded CAPTCHA details as a dict with the following keys:

    "captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
        be the only item with the value of 0;
    "text": the CAPTCHA text, if solved, otherwise None;
    "is_correct": flag indicating whether the CAPTCHA was solved correctly
        (DBC can detect that in rare cases).

    The only argument `cid` is the CAPTCHA numeric ID.

get_text(cid)
    Returns an uploaded CAPTCHA text (None if not solved). The only argument
    `cid` is the CAPTCHA numeric ID.

report(cid)
    Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
    CAPTCHA numeric ID. Returns True on success, False otherwise.

upload(captcha)
    Uploads a CAPTCHA. The only argument `captcha` can be either file-like
    object (any object with `read` method defined, actually, so StringIO
    will do), or CAPTCHA image file name. On successful upload you'll get
    the CAPTCHA details dict (see get_captcha() method).

    NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
    to poll for its status periodically using get_captcha() or get_text()
    method until the CAPTCHA is solved and you get the text.

decode(captcha, timeout=DEFAULT_TIMEOUT)
    A convenient method that uploads a CAPTCHA and polls for its status
    periodically, but no longer than `timeout` (defaults to 60 seconds).
    If solved, you'll get the CAPTCHA details dict (see get_captcha()
    method for details). See upload() method for details on `captcha`
    argument.

Visit http://www.deathbycaptcha.com/user/api for updates.

"""

import base64
import binascii
import errno
import imghdr
import random
import os
import select
import socket
import sys
import threading
import time
import urllib
import urllib2
try:
    from json import read as json_decode, write as json_encode
except ImportError:
    try:
        from json import loads as json_decode, dumps as json_encode
    except ImportError:
        from simplejson import loads as json_decode, dumps as json_encode


# API version and unique software ID
API_VERSION = 'DBC/Python v4.6'

# Default CAPTCHA timeout and decode() polling interval
DEFAULT_TIMEOUT = 60
DEFAULT_TOKEN_TIMEOUT = 120
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3

# Base HTTP API url
HTTP_BASE_URL = 'http://api.dbcapi.me/api'

# Preferred HTTP API server's response content type, do not change
HTTP_RESPONSE_TYPE = 'application/json'

# Socket API server's host & ports range
SOCKET_HOST = 'api.dbcapi.me'
SOCKET_PORTS = range(8123, 8131)


def _load_image(captcha):
    if hasattr(captcha, 'read'):
        img = captcha.read()
    elif type(captcha) == bytearray:
        img = captcha
    else:
        img = ''
        try:
            captcha_file = open(captcha, 'rb')
        except Exception:
            raise
        else:
            img = captcha_file.read()
            captcha_file.close()
    if not len(img):
        raise ValueError('CAPTCHA image is empty')
    elif imghdr.what(None, img) is None:
        raise TypeError('Unknown CAPTCHA image type')
    else:
        return img


class AccessDeniedException(Exception):
    pass


class Client(object):

    """Death by Captcha API Client."""

    def __init__(self, username, password):
        self.is_verbose = False
        self.userpwd = {'username': username, 'password': password}

    def _log(self, cmd, msg=''):
        if self.is_verbose:
            print '%d %s %s' % (time.time(), cmd, msg.rstrip())
        return self

    def close(self):
        pass

    def connect(self):
        pass

    def get_user(self):
        """Fetch user details -- ID, balance, rate and banned status."""
        raise NotImplementedError()

    def get_balance(self):
        """Fetch user balance (in US cents)."""
        return self.get_user().get('balance')

    def get_captcha(self, cid):
        """Fetch a CAPTCHA details -- ID, text and correctness flag."""
        raise NotImplementedError()

    def get_text(self, cid):
        """Fetch a CAPTCHA text."""
        return self.get_captcha(cid).get('text') or None

    def report(self, cid):
        """Report a CAPTCHA as incorrectly solved."""
        raise NotImplementedError()

    def upload(self, captcha):
        """Upload a CAPTCHA.

        Accepts file names and file-like objects. Returns CAPTCHA details
        dict on success.

        """
        raise NotImplementedError()

    def decode(self, captcha=None, timeout=None, **kwargs):
        """
        Try to solve a CAPTCHA.

        See Client.upload() for arguments details.

        Uploads a CAPTCHA, polls for its status periodically with arbitrary
        timeout (in seconds), returns CAPTCHA details if (correctly) solved.
        """
        if not timeout:
            if not captcha:
                timeout = DEFAULT_TOKEN_TIMEOUT
            else:
                timeout = DEFAULT_TIMEOUT

        deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
        uploaded_captcha = self.upload(captcha, **kwargs)
        if uploaded_captcha:
            intvl_idx = 0  # POLL_INTERVAL index
            while deadline > time.time() and not uploaded_captcha.get('text'):
                intvl, intvl_idx = self._get_poll_interval(intvl_idx)
                time.sleep(intvl)
                pulled = self.get_captcha(uploaded_captcha['captcha'])
                if pulled['captcha'] == uploaded_captcha['captcha']:
                    uploaded_captcha = pulled
            if uploaded_captcha.get('text') and \
                    uploaded_captcha.get('is_correct'):
                return uploaded_captcha

    def _get_poll_interval(self, idx):
        """Returns poll interval and next index depending on index provided"""

        if len(POLLS_INTERVAL) > idx:
            intvl = POLLS_INTERVAL[idx]
        else:
            intvl = DFLT_POLL_INTERVAL
        idx += 1

        return intvl, idx


class HttpClient(Client):

    """Death by Captcha HTTP API client."""

    def __init__(self, *args):
        Client.__init__(self, *args)
        self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())

    def _call(self, cmd, payload=None, headers=None):
        if headers is None:
            headers = {}
        headers['Accept'] = HTTP_RESPONSE_TYPE
        headers['User-Agent'] = API_VERSION
        if hasattr(payload, 'items'):
            payload = urllib.urlencode(payload)
            self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
        else:
            self._log('SEND', '%s' % cmd)
        if payload is not None:
            headers['Content-Length'] = len(payload)
        try:
            response = self.opener.open(urllib2.Request(
                HTTP_BASE_URL + '/' + cmd.strip('/'),
                data=payload,
                headers=headers
            )).read()
        except urllib2.HTTPError, err:
            if 403 == err.code:
                raise AccessDeniedException('Access denied, please check'
                                            ' your credentials and/or balance')
            elif 400 == err.code or 413 == err.code:
                raise ValueError("CAPTCHA was rejected by the service, check"
                                 " if it's a valid image")
            elif 503 == err.code:
                raise OverflowError("CAPTCHA was rejected due to service"
                                    " overload, try again later")
            else:
                raise err
        else:
            self._log('RECV', '%d %s' % (len(response), response))
            try:
                return json_decode(response)
            except Exception:
                raise RuntimeError('Invalid API response')
        return {}

    def get_user(self):
        return self._call('user', self.userpwd.copy()) or {'user': 0}

    def get_captcha(self, cid):
        return self._call('captcha/%d' % cid) or {'captcha': 0}

    def report(self, cid):
        return not self._call('captcha/%d/report' % cid,
                              self.userpwd.copy()).get('is_correct')

    def upload(self, captcha=None, **kwargs):
        boundary = binascii.hexlify(os.urandom(16))
        banner = kwargs.get('banner', '')
        if banner:
            kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
        body = '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in self.userpwd.items())

        body += '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in kwargs.items())

        if captcha:
            img = _load_image(captcha)
            body += '\r\n'.join((
                '',
                '--%s' % boundary,
                'Content-Disposition: form-data; name="captchafile"; '
                'filename="captcha"',
                'Content-Type: application/octet-stream',
                'Content-Length: %d' % len(img),
                '',
                img,
                '--%s--' % boundary,
                ''
            ))

        response = self._call('captcha', body, {
            'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
        }) or {}
        if response.get('captcha'):
            return response


class SocketClient(Client):

    """Death by Captcha socket API client."""

    TERMINATOR = '\r\n'

    def __init__(self, *args):
        Client.__init__(self, *args)
        self.socket_lock = threading.Lock()
        self.socket = None

    def close(self):
        if self.socket:
            self._log('CLOSE')
            try:
                self.socket.shutdown(socket.SHUT_RDWR)
            except socket.error:
                pass
            finally:
                self.socket.close()
                self.socket = None

    def connect(self):
        if not self.socket:
            self._log('CONN')
            host = (socket.gethostbyname(SOCKET_HOST),
                    random.choice(SOCKET_PORTS))
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.socket.settimeout(0)
            try:
                self.socket.connect(host)
            except socket.error, err:
                if (err.args[0] not in
                        (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
                    self.close()
                    raise err
        return self.socket

    def __del__(self):
        self.close()

    def _sendrecv(self, sock, buf):
        self._log('SEND', buf)
        fds = [sock]
        buf += self.TERMINATOR
        response = ''
        intvl_idx = 0
        while True:
            intvl, intvl_idx = self._get_poll_interval(intvl_idx)
            rds, wrs, exs = select.select((not buf and fds) or [],
                                          (buf and fds) or [],
                                          fds,
                                          intvl)
            if exs:
                raise IOError('select() failed')
            try:
                if wrs:
                    while buf:
                        buf = buf[wrs[0].send(buf):]
                elif rds:
                    while True:
                        s = rds[0].recv(256)
                        if not s:
                            raise IOError('recv(): connection lost')
                        else:
                            response += s
            except socket.error, err:
                if (err.args[0] not in
                        (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
                    raise err
            if response.endswith(self.TERMINATOR):
                self._log('RECV', response)
                return response.rstrip(self.TERMINATOR)
        raise IOError('send/recv timed out')

    def _call(self, cmd, data=None):
        if data is None:
            data = {}
        data['cmd'] = cmd
        data['version'] = API_VERSION
        request = json_encode(data)

        response = None
        for _ in range(2):
            if not self.socket and cmd != 'login':
                self._call('login', self.userpwd.copy())
            self.socket_lock.acquire()
            try:
                sock = self.connect()
                response = self._sendrecv(sock, request)
            except IOError, err:
                sys.stderr.write(str(err) + "\n")
                self.close()
            except socket.error, err:
                sys.stderr.write(str(err) + "\n")
                self.close()
                raise IOError('Connection refused')
            else:
                break
            finally:
                self.socket_lock.release()

        if response is None:
            raise IOError('Connection lost or timed out during API request')

        try:
            response = json_decode(response)
        except Exception:
            raise RuntimeError('Invalid API response')

        if not response.get('error'):
            return response

        error = response['error']
        if error in ('not-logged-in', 'invalid-credentials'):
            raise AccessDeniedException('Access denied, check your credentials')
        elif 'banned' == error:
            raise AccessDeniedException('Access denied, account is suspended')
        elif 'insufficient-funds' == error:
            raise AccessDeniedException(
                'CAPTCHA was rejected due to low balance')
        elif 'invalid-captcha' == error:
            raise ValueError('CAPTCHA is not a valid image')
        elif 'service-overload' == error:
            raise OverflowError(
                'CAPTCHA was rejected due to service overload, try again later')
        else:
            self.socket_lock.acquire()
            self.close()
            self.socket_lock.release()
            raise RuntimeError('API server error occurred: %s' % error)

    def get_user(self):
        return self._call('user') or {'user': 0}

    def get_captcha(self, cid):
        return self._call('captcha', {'captcha': cid}) or {'captcha': 0}

    def upload(self, captcha=None, **kwargs):
        data = {}
        if captcha:
            data['captcha'] = base64.b64encode(_load_image(captcha))
        if kwargs:
            banner = kwargs.get('banner', '')
            if banner:
                kwargs['banner'] = base64.b64encode(_load_image(banner))
            data.update(kwargs)
        response = self._call('upload', data)
        if response.get('captcha'):
            uploaded_captcha = dict(
                (k, response.get(k))
                for k in ('captcha', 'text', 'is_correct')
            )
            if not uploaded_captcha['text']:
                uploaded_captcha['text'] = None
            return uploaded_captcha

    def report(self, cid):
        return not self._call('report', {'captcha': cid}).get('is_correct')


if '__main__' == __name__:
    # Put your DBC username & password here:
    # client = HttpClient(sys.argv[1], sys.argv[2])
    client = SocketClient(sys.argv[1], sys.argv[2])
    client.is_verbose = True

    print 'Your balance is %s US cents' % client.get_balance()

    for fn in sys.argv[3:]:
        try:
            # Put your CAPTCHA image file name or file-like object, and optional
            # solving timeout (in seconds) here:
            captcha = client.decode(fn, DEFAULT_TIMEOUT)
        except Exception, e:
            sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
            captcha = None

        if captcha:
            print 'CAPTCHA %d solved: %s' % \
                (captcha['captcha'], captcha['text'])

            # Report as incorrectly solved if needed. Make sure the CAPTCHA was
            # in fact incorrectly solved!
            # try:
            #     client.report(captcha['captcha'])
            # except Exception, e:
            #     sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
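A hedged sketch of driving this client from calling code (illustrative only; the credentials and image path are placeholders, and the module is assumed importable as `deathbycaptcha`):

# Illustrative only -- decode() wraps upload() plus the get_captcha()
# polling loop described in the module docstring above.
import deathbycaptcha

client = deathbycaptcha.HttpClient('my_username', 'my_password')
try:
    captcha = client.decode('captcha.jpeg', 60)
    if captcha:
        print 'solved as: %s' % captcha['text']
except deathbycaptcha.AccessDeniedException:
    print 'check your credentials and/or balance'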
4
libs/dogpile2.7/__init__.py
Normal file
@@ -0,0 +1,4 @@
__version__ = '0.6.5'

from .lock import Lock  # noqa
from .lock import NeedRegenerationException  # noqa
4
libs/dogpile2.7/cache/__init__.py
vendored
Normal file
@@ -0,0 +1,4 @@
from .region import CacheRegion, register_backend, make_region  # noqa

# backwards compat
from .. import __version__  # noqa
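A brief sketch of the entry point re-exported here (the backend choice and expiration time are illustrative):

# Illustrative only -- make_region() builds a CacheRegion; configure()
# looks up the backend by its registered name.
from dogpile.cache import make_region

region = make_region().configure(
    'dogpile.cache.memory',
    expiration_time=300,
)

region.set('answer', 42)
print region.get('answer')  # 42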
215
libs/dogpile2.7/cache/api.py
vendored
Normal file
@@ -0,0 +1,215 @@
import operator
from ..util.compat import py3k


class NoValue(object):
    """Describe a missing cache value.

    The :attr:`.NO_VALUE` module global
    should be used.

    """
    @property
    def payload(self):
        return self

    def __repr__(self):
        """Ensure __repr__ is a consistent value in case NoValue is used to
        fill another cache key.

        """
        return '<dogpile.cache.api.NoValue object>'

    if py3k:
        def __bool__(self):  # pragma NO COVERAGE
            return False
    else:
        def __nonzero__(self):  # pragma NO COVERAGE
            return False


NO_VALUE = NoValue()
"""Value returned from ``get()`` that describes
a key not present."""


class CachedValue(tuple):
    """Represent a value stored in the cache.

    :class:`.CachedValue` is a two-tuple of
    ``(payload, metadata)``, where ``metadata``
    is dogpile.cache's tracking information (
    currently the creation time).  The metadata
    and tuple structure is pickleable, if
    the backend requires serialization.

    """
    payload = property(operator.itemgetter(0))
    """Named accessor for the payload."""

    metadata = property(operator.itemgetter(1))
    """Named accessor for the dogpile.cache metadata dictionary."""

    def __new__(cls, payload, metadata):
        return tuple.__new__(cls, (payload, metadata))

    def __reduce__(self):
        return CachedValue, (self.payload, self.metadata)


class CacheBackend(object):
    """Base class for backend implementations."""

    key_mangler = None
    """Key mangling function.

    May be None, or otherwise declared
    as an ordinary instance method.

    """

    def __init__(self, arguments):  # pragma NO COVERAGE
        """Construct a new :class:`.CacheBackend`.

        Subclasses should override this to
        handle the given arguments.

        :param arguments: The ``arguments`` parameter
         passed to :func:`.make_registry`.

        """
        raise NotImplementedError()

    @classmethod
    def from_config_dict(cls, config_dict, prefix):
        prefix_len = len(prefix)
        return cls(
            dict(
                (key[prefix_len:], config_dict[key])
                for key in config_dict
                if key.startswith(prefix)
            )
        )

    def has_lock_timeout(self):
        return False

    def get_mutex(self, key):
        """Return an optional mutexing object for the given key.

        This object need only provide an ``acquire()``
        and ``release()`` method.

        May return ``None``, in which case the dogpile
        lock will use a regular ``threading.Lock``
        object to mutex concurrent threads for
        value creation.  The default implementation
        returns ``None``.

        Different backends may want to provide various
        kinds of "mutex" objects, such as those which
        link to lock files, distributed mutexes,
        memcached semaphores, etc.  Whatever
        kind of system is best suited for the scope
        and behavior of the caching backend.

        A mutex that takes the key into account will
        allow multiple regenerate operations across
        keys to proceed simultaneously, while a mutex
        that does not will serialize regenerate operations
        to just one at a time across all keys in the region.
        The latter approach, or a variant that involves
        a modulus of the given key's hash value,
        can be used as a means of throttling the total
        number of value recreation operations that may
        proceed at one time.

        """
        return None

    def get(self, key):  # pragma NO COVERAGE
        """Retrieve a value from the cache.

        The returned value should be an instance of
        :class:`.CachedValue`, or ``NO_VALUE`` if
        not present.

        """
        raise NotImplementedError()

    def get_multi(self, keys):  # pragma NO COVERAGE
        """Retrieve multiple values from the cache.

        The returned value should be a list, corresponding
        to the list of keys given.

        .. versionadded:: 0.5.0

        """
        raise NotImplementedError()

    def set(self, key, value):  # pragma NO COVERAGE
        """Set a value in the cache.

        The key will be whatever was passed
        to the registry, processed by the
        "key mangling" function, if any.
        The value will always be an instance
        of :class:`.CachedValue`.

        """
        raise NotImplementedError()

    def set_multi(self, mapping):  # pragma NO COVERAGE
        """Set multiple values in the cache.

        ``mapping`` is a dict in which
        the key will be whatever was passed
        to the registry, processed by the
        "key mangling" function, if any.
        The value will always be an instance
        of :class:`.CachedValue`.

        When implementing a new :class:`.CacheBackend` or customizing via
        :class:`.ProxyBackend`, be aware that when this method is invoked by
        :meth:`.Region.get_or_create_multi`, the ``mapping`` values are the
        same ones returned to the upstream caller. If the subclass alters the
        values in any way, it must not do so 'in-place' on the ``mapping`` dict
        -- that will have the undesirable effect of modifying the returned
        values as well.

        .. versionadded:: 0.5.0

        """
        raise NotImplementedError()

    def delete(self, key):  # pragma NO COVERAGE
        """Delete a value from the cache.

        The key will be whatever was passed
        to the registry, processed by the
        "key mangling" function, if any.

        The behavior here should be idempotent,
        that is, can be called any number of times
        regardless of whether or not the
        key exists.
        """
        raise NotImplementedError()

    def delete_multi(self, keys):  # pragma NO COVERAGE
        """Delete multiple values from the cache.

        The key will be whatever was passed
        to the registry, processed by the
        "key mangling" function, if any.

        The behavior here should be idempotent,
        that is, can be called any number of times
        regardless of whether or not the
        key exists.

        .. versionadded:: 0.5.0

        """
        raise NotImplementedError()
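To make the abstract contract above concrete, a minimal sketch of a subclass backed by a plain dict (illustrative only; the real in-tree equivalent is the MemoryBackend further below):

# Minimal sketch of the CacheBackend interface; a dict stands in for a
# real store.  Values arrive and leave as CachedValue / NO_VALUE.
class DictBackend(CacheBackend):
    def __init__(self, arguments):
        self._store = {}

    def get(self, key):
        return self._store.get(key, NO_VALUE)

    def get_multi(self, keys):
        return [self.get(key) for key in keys]

    def set(self, key, value):
        self._store[key] = value

    def set_multi(self, mapping):
        self._store.update(mapping)

    def delete(self, key):
        self._store.pop(key, None)

    def delete_multi(self, keys):
        for key in keys:
            self._store.pop(key, None)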
22
libs/dogpile2.7/cache/backends/__init__.py
vendored
Normal file
@@ -0,0 +1,22 @@
from dogpile.cache.region import register_backend

register_backend(
    "dogpile.cache.null", "dogpile.cache.backends.null", "NullBackend")
register_backend(
    "dogpile.cache.dbm", "dogpile.cache.backends.file", "DBMBackend")
register_backend(
    "dogpile.cache.pylibmc", "dogpile.cache.backends.memcached",
    "PylibmcBackend")
register_backend(
    "dogpile.cache.bmemcached", "dogpile.cache.backends.memcached",
    "BMemcachedBackend")
register_backend(
    "dogpile.cache.memcached", "dogpile.cache.backends.memcached",
    "MemcachedBackend")
register_backend(
    "dogpile.cache.memory", "dogpile.cache.backends.memory", "MemoryBackend")
register_backend(
    "dogpile.cache.memory_pickle", "dogpile.cache.backends.memory",
    "MemoryPickleBackend")
register_backend(
    "dogpile.cache.redis", "dogpile.cache.backends.redis", "RedisBackend")
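The same registration hook works for third-party backends; a hedged sketch (the dotted path, module name, and class name here are hypothetical, not part of this repository):

# Hypothetical example: after this call,
# make_region().configure('myapp.dict_backend', ...) can locate the class.
register_backend(
    "myapp.dict_backend", "myapp.cache_backends", "DictBackend")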
447
libs/dogpile2.7/cache/backends/file.py
vendored
Normal file
@@ -0,0 +1,447 @@
"""
|
||||
File Backends
|
||||
------------------
|
||||
|
||||
Provides backends that deal with local filesystem access.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import with_statement
|
||||
from ..api import CacheBackend, NO_VALUE
|
||||
from contextlib import contextmanager
|
||||
from ...util import compat
|
||||
from ... import util
|
||||
import os
|
||||
|
||||
__all__ = 'DBMBackend', 'FileLock', 'AbstractFileLock'
|
||||
|
||||
|
||||
class DBMBackend(CacheBackend):
|
||||
"""A file-backend using a dbm file to store keys.
|
||||
|
||||
Basic usage::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.dbm',
|
||||
expiration_time = 3600,
|
||||
arguments = {
|
||||
"filename":"/path/to/cachefile.dbm"
|
||||
}
|
||||
)
|
||||
|
||||
DBM access is provided using the Python ``anydbm`` module,
|
||||
which selects a platform-specific dbm module to use.
|
||||
This may be made to be more configurable in a future
|
||||
release.
|
||||
|
||||
Note that different dbm modules have different behaviors.
|
||||
Some dbm implementations handle their own locking, while
|
||||
others don't. The :class:`.DBMBackend` uses a read/write
|
||||
lockfile by default, which is compatible even with those
|
||||
DBM implementations for which this is unnecessary,
|
||||
though the behavior can be disabled.
|
||||
|
||||
The DBM backend by default makes use of two lockfiles.
|
||||
One is in order to protect the DBM file itself from
|
||||
concurrent writes, the other is to coordinate
|
||||
value creation (i.e. the dogpile lock). By default,
|
||||
these lockfiles use the ``flock()`` system call
|
||||
for locking; this is **only available on Unix
|
||||
platforms**. An alternative lock implementation, such as one
|
||||
which is based on threads or uses a third-party system
|
||||
such as `portalocker <https://pypi.python.org/pypi/portalocker>`_,
|
||||
can be dropped in using the ``lock_factory`` argument
|
||||
in conjunction with the :class:`.AbstractFileLock` base class.
|
||||
|
||||
Currently, the dogpile lock is against the entire
|
||||
DBM file, not per key. This means there can
|
||||
only be one "creator" job running at a time
|
||||
per dbm file.
|
||||
|
||||
A future improvement might be to have the dogpile lock
|
||||
using a filename that's based on a modulus of the key.
|
||||
Locking on a filename that uniquely corresponds to the
|
||||
key is problematic, since it's not generally safe to
|
||||
delete lockfiles as the application runs, implying an
|
||||
unlimited number of key-based files would need to be
|
||||
created and never deleted.
|
||||
|
||||
Parameters to the ``arguments`` dictionary are
|
||||
below.
|
||||
|
||||
:param filename: path of the filename in which to
|
||||
create the DBM file. Note that some dbm backends
|
||||
will change this name to have additional suffixes.
|
||||
:param rw_lockfile: the name of the file to use for
|
||||
read/write locking. If omitted, a default name
|
||||
is used by appending the suffix ".rw.lock" to the
|
||||
DBM filename. If False, then no lock is used.
|
||||
:param dogpile_lockfile: the name of the file to use
|
||||
for value creation, i.e. the dogpile lock. If
|
||||
omitted, a default name is used by appending the
|
||||
suffix ".dogpile.lock" to the DBM filename. If
|
||||
False, then dogpile.cache uses the default dogpile
|
||||
lock, a plain thread-based mutex.
|
||||
:param lock_factory: a function or class which provides
|
||||
for a read/write lock. Defaults to :class:`.FileLock`.
|
||||
Custom implementations need to implement context-manager
|
||||
based ``read()`` and ``write()`` functions - the
|
||||
:class:`.AbstractFileLock` class is provided as a base class
|
||||
which provides these methods based on individual read/write lock
|
||||
functions. E.g. to replace the lock with the dogpile.core
|
||||
:class:`.ReadWriteMutex`::
|
||||
|
||||
from dogpile.core.readwrite_lock import ReadWriteMutex
|
||||
from dogpile.cache.backends.file import AbstractFileLock
|
||||
|
||||
class MutexLock(AbstractFileLock):
|
||||
def __init__(self, filename):
|
||||
self.mutex = ReadWriteMutex()
|
||||
|
||||
def acquire_read_lock(self, wait):
|
||||
ret = self.mutex.acquire_read_lock(wait)
|
||||
return wait or ret
|
||||
|
||||
def acquire_write_lock(self, wait):
|
||||
ret = self.mutex.acquire_write_lock(wait)
|
||||
return wait or ret
|
||||
|
||||
def release_read_lock(self):
|
||||
return self.mutex.release_read_lock()
|
||||
|
||||
def release_write_lock(self):
|
||||
return self.mutex.release_write_lock()
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
"dogpile.cache.dbm",
|
||||
expiration_time=300,
|
||||
arguments={
|
||||
"filename": "file.dbm",
|
||||
"lock_factory": MutexLock
|
||||
}
|
||||
)
|
||||
|
||||
While the included :class:`.FileLock` uses ``os.flock()``, a
|
||||
windows-compatible implementation can be built using a library
|
||||
such as `portalocker <https://pypi.python.org/pypi/portalocker>`_.
|
||||
|
||||
.. versionadded:: 0.5.2
|
||||
|
||||
|
||||
|
||||
"""
|
||||
def __init__(self, arguments):
|
||||
self.filename = os.path.abspath(
|
||||
os.path.normpath(arguments['filename'])
|
||||
)
|
||||
dir_, filename = os.path.split(self.filename)
|
||||
|
||||
self.lock_factory = arguments.get("lock_factory", FileLock)
|
||||
self._rw_lock = self._init_lock(
|
||||
arguments.get('rw_lockfile'),
|
||||
".rw.lock", dir_, filename)
|
||||
self._dogpile_lock = self._init_lock(
|
||||
arguments.get('dogpile_lockfile'),
|
||||
".dogpile.lock",
|
||||
dir_, filename,
|
||||
util.KeyReentrantMutex.factory)
|
||||
|
||||
# TODO: make this configurable
|
||||
if compat.py3k:
|
||||
import dbm
|
||||
else:
|
||||
import anydbm as dbm
|
||||
self.dbmmodule = dbm
|
||||
self._init_dbm_file()
|
||||
|
||||
def _init_lock(self, argument, suffix, basedir, basefile, wrapper=None):
|
||||
if argument is None:
|
||||
lock = self.lock_factory(os.path.join(basedir, basefile + suffix))
|
||||
elif argument is not False:
|
||||
lock = self.lock_factory(
|
||||
os.path.abspath(
|
||||
os.path.normpath(argument)
|
||||
))
|
||||
else:
|
||||
return None
|
||||
if wrapper:
|
||||
lock = wrapper(lock)
|
||||
return lock
|
||||
|
||||
def _init_dbm_file(self):
|
||||
exists = os.access(self.filename, os.F_OK)
|
||||
if not exists:
|
||||
for ext in ('db', 'dat', 'pag', 'dir'):
|
||||
if os.access(self.filename + os.extsep + ext, os.F_OK):
|
||||
exists = True
|
||||
break
|
||||
if not exists:
|
||||
fh = self.dbmmodule.open(self.filename, 'c')
|
||||
fh.close()
|
||||
|
||||
def get_mutex(self, key):
|
||||
# using one dogpile for the whole file. Other ways
|
||||
# to do this might be using a set of files keyed to a
|
||||
# hash/modulus of the key. the issue is it's never
|
||||
# really safe to delete a lockfile as this can
|
||||
# break other processes trying to get at the file
|
||||
# at the same time - so handling unlimited keys
|
||||
# can't imply unlimited filenames
|
||||
if self._dogpile_lock:
|
||||
return self._dogpile_lock(key)
|
||||
else:
|
||||
return None
|
||||
|
||||
@contextmanager
|
||||
def _use_rw_lock(self, write):
|
||||
if self._rw_lock is None:
|
||||
yield
|
||||
elif write:
|
||||
with self._rw_lock.write():
|
||||
yield
|
||||
else:
|
||||
with self._rw_lock.read():
|
||||
yield
|
||||
|
||||
@contextmanager
|
||||
def _dbm_file(self, write):
|
||||
with self._use_rw_lock(write):
|
||||
dbm = self.dbmmodule.open(
|
||||
self.filename,
|
||||
"w" if write else "r")
|
||||
yield dbm
|
||||
dbm.close()
|
||||
|
||||
def get(self, key):
|
||||
with self._dbm_file(False) as dbm:
|
||||
if hasattr(dbm, 'get'):
|
||||
value = dbm.get(key, NO_VALUE)
|
||||
else:
|
||||
# gdbm objects lack a .get method
|
||||
try:
|
||||
value = dbm[key]
|
||||
except KeyError:
|
||||
value = NO_VALUE
|
||||
if value is not NO_VALUE:
|
||||
value = compat.pickle.loads(value)
|
||||
return value
|
||||
|
||||
def get_multi(self, keys):
|
||||
return [self.get(key) for key in keys]
|
||||
|
||||
def set(self, key, value):
|
||||
with self._dbm_file(True) as dbm:
|
||||
dbm[key] = compat.pickle.dumps(value,
|
||||
compat.pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
def set_multi(self, mapping):
|
||||
with self._dbm_file(True) as dbm:
|
||||
for key, value in mapping.items():
|
||||
dbm[key] = compat.pickle.dumps(value,
|
||||
compat.pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
def delete(self, key):
|
||||
with self._dbm_file(True) as dbm:
|
||||
try:
|
||||
del dbm[key]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
def delete_multi(self, keys):
|
||||
with self._dbm_file(True) as dbm:
|
||||
for key in keys:
|
||||
try:
|
||||
del dbm[key]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
|
||||
class AbstractFileLock(object):
|
||||
"""Coordinate read/write access to a file.
|
||||
|
||||
typically is a file-based lock but doesn't necessarily have to be.
|
||||
|
||||
The default implementation here is :class:`.FileLock`.
|
||||
|
||||
Implementations should provide the following methods::
|
||||
|
||||
* __init__()
|
||||
* acquire_read_lock()
|
||||
* acquire_write_lock()
|
||||
* release_read_lock()
|
||||
* release_write_lock()
|
||||
|
||||
The ``__init__()`` method accepts a single argument "filename", which
|
||||
may be used as the "lock file", for those implementations that use a lock
|
||||
file.
|
||||
|
||||
Note that multithreaded environments must provide a thread-safe
|
||||
version of this lock. The recommended approach for file-
|
||||
descriptor-based locks is to use a Python ``threading.local()`` so
|
||||
that a unique file descriptor is held per thread. See the source
|
||||
code of :class:`.FileLock` for an implementation example.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, filename):
|
||||
"""Constructor, is given the filename of a potential lockfile.
|
||||
|
||||
The usage of this filename is optional and no file is
|
||||
created by default.
|
||||
|
||||
Raises ``NotImplementedError`` by default, must be
|
||||
implemented by subclasses.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def acquire(self, wait=True):
|
||||
"""Acquire the "write" lock.
|
||||
|
||||
This is a direct call to :meth:`.AbstractFileLock.acquire_write_lock`.
|
||||
|
||||
"""
|
||||
return self.acquire_write_lock(wait)
|
||||
|
||||
def release(self):
|
||||
"""Release the "write" lock.
|
||||
|
||||
This is a direct call to :meth:`.AbstractFileLock.release_write_lock`.
|
||||
|
||||
"""
|
||||
self.release_write_lock()
|
||||
|
||||
@contextmanager
|
||||
def read(self):
|
||||
"""Provide a context manager for the "read" lock.
|
||||
|
||||
This method makes use of :meth:`.AbstractFileLock.acquire_read_lock`
|
||||
and :meth:`.AbstractFileLock.release_read_lock`
|
||||
|
||||
"""
|
||||
|
||||
self.acquire_read_lock(True)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self.release_read_lock()
|
||||
|
||||
@contextmanager
|
||||
def write(self):
|
||||
"""Provide a context manager for the "write" lock.
|
||||
|
||||
This method makes use of :meth:`.AbstractFileLock.acquire_write_lock`
|
||||
and :meth:`.AbstractFileLock.release_write_lock`
|
||||
|
||||
"""
|
||||
|
||||
self.acquire_write_lock(True)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self.release_write_lock()
|
||||
|
||||
@property
|
||||
def is_open(self):
|
||||
"""optional method."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def acquire_read_lock(self, wait):
|
||||
"""Acquire a 'reader' lock.
|
||||
|
||||
Raises ``NotImplementedError`` by default, must be
|
||||
implemented by subclasses.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def acquire_write_lock(self, wait):
|
||||
"""Acquire a 'write' lock.
|
||||
|
||||
Raises ``NotImplementedError`` by default, must be
|
||||
implemented by subclasses.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def release_read_lock(self):
|
||||
"""Release a 'reader' lock.
|
||||
|
||||
Raises ``NotImplementedError`` by default, must be
|
||||
implemented by subclasses.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def release_write_lock(self):
|
||||
"""Release a 'writer' lock.
|
||||
|
||||
Raises ``NotImplementedError`` by default, must be
|
||||
implemented by subclasses.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class FileLock(AbstractFileLock):
|
||||
"""Use lockfiles to coordinate read/write access to a file.
|
||||
|
||||
Only works on Unix systems, using
|
||||
`fcntl.flock() <http://docs.python.org/library/fcntl.html>`_.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, filename):
|
||||
self._filedescriptor = compat.threading.local()
|
||||
self.filename = filename
|
||||
|
||||
@util.memoized_property
|
||||
def _module(self):
|
||||
import fcntl
|
||||
return fcntl
|
||||
|
||||
@property
|
||||
def is_open(self):
|
||||
return hasattr(self._filedescriptor, 'fileno')
|
||||
|
||||
def acquire_read_lock(self, wait):
|
||||
return self._acquire(wait, os.O_RDONLY, self._module.LOCK_SH)
|
||||
|
||||
def acquire_write_lock(self, wait):
|
||||
return self._acquire(wait, os.O_WRONLY, self._module.LOCK_EX)
|
||||
|
||||
def release_read_lock(self):
|
||||
self._release()
|
||||
|
||||
def release_write_lock(self):
|
||||
self._release()
|
||||
|
||||
def _acquire(self, wait, wrflag, lockflag):
|
||||
wrflag |= os.O_CREAT
|
||||
fileno = os.open(self.filename, wrflag)
|
||||
try:
|
||||
if not wait:
|
||||
lockflag |= self._module.LOCK_NB
|
||||
self._module.flock(fileno, lockflag)
|
||||
except IOError:
|
||||
os.close(fileno)
|
||||
if not wait:
|
||||
# this is typically
|
||||
# "[Errno 35] Resource temporarily unavailable",
|
||||
# because of LOCK_NB
|
||||
return False
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
self._filedescriptor.fileno = fileno
|
||||
return True
|
||||
|
||||
def _release(self):
|
||||
try:
|
||||
fileno = self._filedescriptor.fileno
|
||||
except AttributeError:
|
||||
return
|
||||
else:
|
||||
self._module.flock(fileno, self._module.LOCK_UN)
|
||||
os.close(fileno)
|
||||
del self._filedescriptor.fileno
|
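A short sketch of how the read()/write() context managers above are meant to be used (illustrative only; the lockfile path is a placeholder):

# Illustrative only -- FileLock serializing access to a shared file
# (Unix-only, since it relies on fcntl.flock()).
lock = FileLock('/tmp/example.lock')

with lock.write():
    pass  # exclusive access; other readers and writers block here

with lock.read():
    pass  # shared access; concurrent readers are allowed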
364
libs/dogpile2.7/cache/backends/memcached.py
vendored
Normal file
@@ -0,0 +1,364 @@
"""
|
||||
Memcached Backends
|
||||
------------------
|
||||
|
||||
Provides backends for talking to `memcached <http://memcached.org>`_.
|
||||
|
||||
"""
|
||||
|
||||
from ..api import CacheBackend, NO_VALUE
|
||||
from ...util import compat
|
||||
from ... import util
|
||||
import random
|
||||
import time
|
||||
|
||||
__all__ = 'GenericMemcachedBackend', 'MemcachedBackend',\
|
||||
'PylibmcBackend', 'BMemcachedBackend', 'MemcachedLock'
|
||||
|
||||
|
||||
class MemcachedLock(object):
|
||||
"""Simple distributed lock using memcached.
|
||||
|
||||
This is an adaptation of the lock featured at
|
||||
http://amix.dk/blog/post/19386
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, client_fn, key, timeout=0):
|
||||
self.client_fn = client_fn
|
||||
self.key = "_lock" + key
|
||||
self.timeout = timeout
|
||||
|
||||
def acquire(self, wait=True):
|
||||
client = self.client_fn()
|
||||
i = 0
|
||||
while True:
|
||||
if client.add(self.key, 1, self.timeout):
|
||||
return True
|
||||
elif not wait:
|
||||
return False
|
||||
else:
|
||||
sleep_time = (((i + 1) * random.random()) + 2 ** i) / 2.5
|
||||
time.sleep(sleep_time)
|
||||
if i < 15:
|
||||
i += 1
|
||||
|
||||
def release(self):
|
||||
client = self.client_fn()
|
||||
client.delete(self.key)
|
||||
|
||||
|
||||
class GenericMemcachedBackend(CacheBackend):
|
||||
"""Base class for memcached backends.
|
||||
|
||||
This base class accepts a number of paramters
|
||||
common to all backends.
|
||||
|
||||
:param url: the string URL to connect to. Can be a single
|
||||
string or a list of strings. This is the only argument
|
||||
that's required.
|
||||
:param distributed_lock: boolean, when True, will use a
|
||||
memcached-lock as the dogpile lock (see :class:`.MemcachedLock`).
|
||||
Use this when multiple
|
||||
processes will be talking to the same memcached instance.
|
||||
When left at False, dogpile will coordinate on a regular
|
||||
threading mutex.
|
||||
:param lock_timeout: integer, number of seconds after acquiring a lock that
|
||||
memcached should expire it. This argument is only valid when
|
||||
``distributed_lock`` is ``True``.
|
||||
|
||||
.. versionadded:: 0.5.7
|
||||
|
||||
:param memcached_expire_time: integer, when present will
|
||||
be passed as the ``time`` parameter to ``pylibmc.Client.set``.
|
||||
This is used to set the memcached expiry time for a value.
|
||||
|
||||
.. note::
|
||||
|
||||
This parameter is **different** from Dogpile's own
|
||||
``expiration_time``, which is the number of seconds after
|
||||
which Dogpile will consider the value to be expired.
|
||||
When Dogpile considers a value to be expired,
|
||||
it **continues to use the value** until generation
|
||||
of a new value is complete, when using
|
||||
:meth:`.CacheRegion.get_or_create`.
|
||||
Therefore, if you are setting ``memcached_expire_time``, you'll
|
||||
want to make sure it is greater than ``expiration_time``
|
||||
by at least enough seconds for new values to be generated,
|
||||
else the value won't be available during a regeneration,
|
||||
forcing all threads to wait for a regeneration each time
|
||||
a value expires.
|
||||
|
||||
The :class:`.GenericMemachedBackend` uses a ``threading.local()``
|
||||
object to store individual client objects per thread,
|
||||
as most modern memcached clients do not appear to be inherently
|
||||
threadsafe.
|
||||
|
||||
In particular, ``threading.local()`` has the advantage over pylibmc's
|
||||
built-in thread pool in that it automatically discards objects
|
||||
associated with a particular thread when that thread ends.
|
||||
|
||||
"""
|
||||
|
||||
set_arguments = {}
|
||||
"""Additional arguments which will be passed
|
||||
to the :meth:`set` method."""
|
||||
|
||||
def __init__(self, arguments):
|
||||
self._imports()
|
||||
# using a plain threading.local here. threading.local
|
||||
# automatically deletes the __dict__ when a thread ends,
|
||||
# so the idea is that this is superior to pylibmc's
|
||||
# own ThreadMappedPool which doesn't handle this
|
||||
# automatically.
|
||||
self.url = util.to_list(arguments['url'])
|
||||
self.distributed_lock = arguments.get('distributed_lock', False)
|
||||
self.lock_timeout = arguments.get('lock_timeout', 0)
|
||||
self.memcached_expire_time = arguments.get(
|
||||
'memcached_expire_time', 0)
|
||||
|
||||
def has_lock_timeout(self):
|
||||
return self.lock_timeout != 0
|
||||
|
||||
def _imports(self):
|
||||
"""client library imports go here."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def _create_client(self):
|
||||
"""Creation of a Client instance goes here."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@util.memoized_property
|
||||
def _clients(self):
|
||||
backend = self
|
||||
|
||||
class ClientPool(compat.threading.local):
|
||||
def __init__(self):
|
||||
self.memcached = backend._create_client()
|
||||
|
||||
return ClientPool()
|
||||
|
||||
@property
|
||||
def client(self):
|
||||
"""Return the memcached client.
|
||||
|
||||
This uses a threading.local by
|
||||
default as it appears most modern
|
||||
memcached libs aren't inherently
|
||||
threadsafe.
|
||||
|
||||
"""
|
||||
return self._clients.memcached
|
||||
|
||||
def get_mutex(self, key):
|
||||
if self.distributed_lock:
|
||||
return MemcachedLock(lambda: self.client, key,
|
||||
timeout=self.lock_timeout)
|
||||
else:
|
||||
return None
|
||||
|
||||
def get(self, key):
|
||||
value = self.client.get(key)
|
||||
if value is None:
|
||||
return NO_VALUE
|
||||
else:
|
||||
return value
|
||||
|
||||
def get_multi(self, keys):
|
||||
values = self.client.get_multi(keys)
|
||||
return [
|
||||
NO_VALUE if key not in values
|
||||
else values[key] for key in keys
|
||||
]
|
||||
|
||||
def set(self, key, value):
|
||||
self.client.set(
|
||||
key,
|
||||
value,
|
||||
**self.set_arguments
|
||||
)
|
||||
|
||||
def set_multi(self, mapping):
|
||||
self.client.set_multi(
|
||||
mapping,
|
||||
**self.set_arguments
|
||||
)
|
||||
|
||||
def delete(self, key):
|
||||
self.client.delete(key)
|
||||
|
||||
def delete_multi(self, keys):
|
||||
self.client.delete_multi(keys)
|
||||
|
||||
|
||||
class MemcacheArgs(object):
|
||||
"""Mixin which provides support for the 'time' argument to set(),
|
||||
'min_compress_len' to other methods.
|
||||
|
||||
"""
|
||||
def __init__(self, arguments):
|
||||
self.min_compress_len = arguments.get('min_compress_len', 0)
|
||||
|
||||
self.set_arguments = {}
|
||||
if "memcached_expire_time" in arguments:
|
||||
self.set_arguments["time"] = arguments["memcached_expire_time"]
|
||||
if "min_compress_len" in arguments:
|
||||
self.set_arguments["min_compress_len"] = \
|
||||
arguments["min_compress_len"]
|
||||
super(MemcacheArgs, self).__init__(arguments)
|
||||
|
||||
pylibmc = None
|
||||
|
||||
|
||||
class PylibmcBackend(MemcacheArgs, GenericMemcachedBackend):
|
||||
"""A backend for the
|
||||
`pylibmc <http://sendapatch.se/projects/pylibmc/index.html>`_
|
||||
memcached client.
|
||||
|
||||
A configuration illustrating several of the optional
|
||||
arguments described in the pylibmc documentation::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.pylibmc',
|
||||
expiration_time = 3600,
|
||||
arguments = {
|
||||
                'url':["127.0.0.1"],
                'binary':True,
                'behaviors':{"tcp_nodelay": True,"ketama":True}
            }
        )

    Arguments accepted here include those of
    :class:`.GenericMemcachedBackend`, as well as
    those below.

    :param binary: sets the ``binary`` flag understood by
     ``pylibmc.Client``.
    :param behaviors: a dictionary which will be passed to
     ``pylibmc.Client`` as the ``behaviors`` parameter.
    :param min_compress_len: Integer, will be passed as the
     ``min_compress_len`` parameter to the ``pylibmc.Client.set``
     method.

    """

    def __init__(self, arguments):
        self.binary = arguments.get('binary', False)
        self.behaviors = arguments.get('behaviors', {})
        super(PylibmcBackend, self).__init__(arguments)

    def _imports(self):
        global pylibmc
        import pylibmc  # noqa

    def _create_client(self):
        return pylibmc.Client(
            self.url,
            binary=self.binary,
            behaviors=self.behaviors
        )


memcache = None


class MemcachedBackend(MemcacheArgs, GenericMemcachedBackend):
    """A backend using the standard
    `Python-memcached <http://www.tummy.com/Community/software/\
    python-memcached/>`_
    library.

    Example::

        from dogpile.cache import make_region

        region = make_region().configure(
            'dogpile.cache.memcached',
            expiration_time = 3600,
            arguments = {
                'url':"127.0.0.1:11211"
            }
        )

    """
    def _imports(self):
        global memcache
        import memcache  # noqa

    def _create_client(self):
        return memcache.Client(self.url)


bmemcached = None


class BMemcachedBackend(GenericMemcachedBackend):
    """A backend for the
    `python-binary-memcached <https://github.com/jaysonsantos/\
    python-binary-memcached>`_
    memcached client.

    This is a pure Python memcached client which
    includes the ability to authenticate with a memcached
    server using SASL.

    A typical configuration using username/password::

        from dogpile.cache import make_region

        region = make_region().configure(
            'dogpile.cache.bmemcached',
            expiration_time = 3600,
            arguments = {
                'url':["127.0.0.1"],
                'username':'scott',
                'password':'tiger'
            }
        )

    Arguments which can be passed to the ``arguments``
    dictionary include:

    :param username: optional username, will be used for
     SASL authentication.
    :param password: optional password, will be used for
     SASL authentication.

    """
    def __init__(self, arguments):
        self.username = arguments.get('username', None)
        self.password = arguments.get('password', None)
        super(BMemcachedBackend, self).__init__(arguments)

    def _imports(self):
        global bmemcached
        import bmemcached

        class RepairBMemcachedAPI(bmemcached.Client):
            """Repairs BMemcached's non-standard method
            signatures, which were fixed in BMemcached
            ef206ed4473fec3b639e.

            """

            def add(self, key, value, timeout=0):
                try:
                    return super(RepairBMemcachedAPI, self).add(
                        key, value, timeout)
                except ValueError:
                    return False

        self.Client = RepairBMemcachedAPI

    def _create_client(self):
        return self.Client(
            self.url,
            username=self.username,
            password=self.password
        )

    def delete_multi(self, keys):
        """python-binary-memcached api does not implement delete_multi"""
        for key in keys:
            self.delete(key)
124
libs/dogpile2.7/cache/backends/memory.py
vendored
Normal file
@@ -0,0 +1,124 @@
"""
|
||||
Memory Backends
|
||||
---------------
|
||||
|
||||
Provides simple dictionary-based backends.
|
||||
|
||||
The two backends are :class:`.MemoryBackend` and :class:`.MemoryPickleBackend`;
|
||||
the latter applies a serialization step to cached values while the former
|
||||
places the value as given into the dictionary.
|
||||
|
||||
"""
|
||||
|
||||
from ..api import CacheBackend, NO_VALUE
|
||||
from ...util.compat import pickle
|
||||
|
||||
|
||||
class MemoryBackend(CacheBackend):
|
||||
"""A backend that uses a plain dictionary.
|
||||
|
||||
There is no size management, and values which
|
||||
are placed into the dictionary will remain
|
||||
until explicitly removed. Note that
|
||||
Dogpile's expiration of items is based on
|
||||
timestamps and does not remove them from
|
||||
the cache.
|
||||
|
||||
E.g.::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.memory'
|
||||
)
|
||||
|
||||
|
||||
To use a Python dictionary of your choosing,
|
||||
it can be passed in with the ``cache_dict``
|
||||
argument::
|
||||
|
||||
my_dictionary = {}
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.memory',
|
||||
arguments={
|
||||
"cache_dict":my_dictionary
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
pickle_values = False
|
||||
|
||||
def __init__(self, arguments):
|
||||
self._cache = arguments.pop("cache_dict", {})
|
||||
|
||||
def get(self, key):
|
||||
value = self._cache.get(key, NO_VALUE)
|
||||
if value is not NO_VALUE and self.pickle_values:
|
||||
value = pickle.loads(value)
|
||||
return value
|
||||
|
||||
def get_multi(self, keys):
|
||||
ret = [
|
||||
self._cache.get(key, NO_VALUE)
|
||||
for key in keys]
|
||||
if self.pickle_values:
|
||||
ret = [
|
||||
pickle.loads(value)
|
||||
if value is not NO_VALUE else value
|
||||
for value in ret
|
||||
]
|
||||
return ret
|
||||
|
||||
def set(self, key, value):
|
||||
if self.pickle_values:
|
||||
value = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
|
||||
self._cache[key] = value
|
||||
|
||||
def set_multi(self, mapping):
|
||||
pickle_values = self.pickle_values
|
||||
for key, value in mapping.items():
|
||||
if pickle_values:
|
||||
value = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
|
||||
self._cache[key] = value
|
||||
|
||||
def delete(self, key):
|
||||
self._cache.pop(key, None)
|
||||
|
||||
def delete_multi(self, keys):
|
||||
for key in keys:
|
||||
self._cache.pop(key, None)
|
||||
|
||||
|
||||
class MemoryPickleBackend(MemoryBackend):
|
||||
"""A backend that uses a plain dictionary, but serializes objects on
|
||||
:meth:`.MemoryBackend.set` and deserializes :meth:`.MemoryBackend.get`.
|
||||
|
||||
E.g.::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.memory_pickle'
|
||||
)
|
||||
|
||||
The usage of pickle to serialize cached values allows an object
|
||||
as placed in the cache to be a copy of the original given object, so
|
||||
that any subsequent changes to the given object aren't reflected
|
||||
in the cached value, thus making the backend behave the same way
|
||||
as other backends which make use of serialization.
|
||||
|
||||
The serialization is performed via pickle, and incurs the same
|
||||
performance hit in doing so as that of other backends; in this way
|
||||
the :class:`.MemoryPickleBackend` performance is somewhere in between
|
||||
that of the pure :class:`.MemoryBackend` and the remote server oriented
|
||||
backends such as that of Memcached or Redis.
|
||||
|
||||
Pickle behavior here is the same as that of the Redis backend, using
|
||||
either ``cPickle`` or ``pickle`` and specifying ``HIGHEST_PROTOCOL``
|
||||
upon serialize.
|
||||
|
||||
.. versionadded:: 0.5.3
|
||||
|
||||
"""
|
||||
pickle_values = True
|
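The practical difference between the two backends shows up when a cached value is mutated after it is stored. A short sketch, assuming the public ``make_region`` API and both backend names registered as usual (this snippet is illustrative, not part of the vendored file)::

    from dogpile.cache import make_region

    plain = make_region().configure('dogpile.cache.memory')
    pickled = make_region().configure('dogpile.cache.memory_pickle')

    value = {'count': 1}
    plain.set('k', value)
    pickled.set('k', value)

    value['count'] = 2  # mutate the original object after caching

    print(plain.get('k')['count'])    # 2 -- the plain backend stored the object itself
    print(pickled.get('k')['count'])  # 1 -- the pickle backend stored a serialized copy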
62
libs/dogpile2.7/cache/backends/null.py
vendored
Normal file
@@ -0,0 +1,62 @@
"""
|
||||
Null Backend
|
||||
-------------
|
||||
|
||||
The Null backend does not do any caching at all. It can be
|
||||
used to test behavior without caching, or as a means of disabling
|
||||
caching for a region that is otherwise used normally.
|
||||
|
||||
.. versionadded:: 0.5.4
|
||||
|
||||
"""
|
||||
|
||||
from ..api import CacheBackend, NO_VALUE
|
||||
|
||||
|
||||
__all__ = ['NullBackend']
|
||||
|
||||
|
||||
class NullLock(object):
|
||||
def acquire(self, wait=True):
|
||||
return True
|
||||
|
||||
def release(self):
|
||||
pass
|
||||
|
||||
|
||||
class NullBackend(CacheBackend):
|
||||
"""A "null" backend that effectively disables all cache operations.
|
||||
|
||||
Basic usage::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.null'
|
||||
)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, arguments):
|
||||
pass
|
||||
|
||||
def get_mutex(self, key):
|
||||
return NullLock()
|
||||
|
||||
def get(self, key):
|
||||
return NO_VALUE
|
||||
|
||||
def get_multi(self, keys):
|
||||
return [NO_VALUE for k in keys]
|
||||
|
||||
def set(self, key, value):
|
||||
pass
|
||||
|
||||
def set_multi(self, mapping):
|
||||
pass
|
||||
|
||||
def delete(self, key):
|
||||
pass
|
||||
|
||||
def delete_multi(self, keys):
|
||||
pass
|
183
libs/dogpile2.7/cache/backends/redis.py
vendored
Normal file
@@ -0,0 +1,183 @@
"""
|
||||
Redis Backends
|
||||
------------------
|
||||
|
||||
Provides backends for talking to `Redis <http://redis.io>`_.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from ..api import CacheBackend, NO_VALUE
|
||||
from ...util.compat import pickle, u
|
||||
|
||||
redis = None
|
||||
|
||||
__all__ = 'RedisBackend',
|
||||
|
||||
|
||||
class RedisBackend(CacheBackend):
|
||||
"""A `Redis <http://redis.io/>`_ backend, using the
|
||||
`redis-py <http://pypi.python.org/pypi/redis/>`_ backend.
|
||||
|
||||
Example configuration::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.redis',
|
||||
arguments = {
|
||||
'host': 'localhost',
|
||||
'port': 6379,
|
||||
'db': 0,
|
||||
'redis_expiration_time': 60*60*2, # 2 hours
|
||||
'distributed_lock': True
|
||||
}
|
||||
)
|
||||
|
||||
Arguments accepted in the arguments dictionary:
|
||||
|
||||
:param url: string. If provided, will override separate host/port/db
|
||||
params. The format is that accepted by ``StrictRedis.from_url()``.
|
||||
|
||||
.. versionadded:: 0.4.1
|
||||
|
||||
:param host: string, default is ``localhost``.
|
||||
|
||||
:param password: string, default is no password.
|
||||
|
||||
.. versionadded:: 0.4.1
|
||||
|
||||
:param port: integer, default is ``6379``.
|
||||
|
||||
:param db: integer, default is ``0``.
|
||||
|
||||
:param redis_expiration_time: integer, number of seconds after setting
|
||||
a value that Redis should expire it. This should be larger than dogpile's
|
||||
cache expiration. By default no expiration is set.
|
||||
|
||||
:param distributed_lock: boolean, when True, will use a
|
||||
redis-lock as the dogpile lock.
|
||||
Use this when multiple
|
||||
processes will be talking to the same redis instance.
|
||||
When left at False, dogpile will coordinate on a regular
|
||||
threading mutex.
|
||||
|
||||
:param lock_timeout: integer, number of seconds after acquiring a lock that
|
||||
Redis should expire it. This argument is only valid when
|
||||
``distributed_lock`` is ``True``.
|
||||
|
||||
.. versionadded:: 0.5.0
|
||||
|
||||
:param socket_timeout: float, seconds for socket timeout.
|
||||
Default is None (no timeout).
|
||||
|
||||
.. versionadded:: 0.5.4
|
||||
|
||||
:param lock_sleep: integer, number of seconds to sleep when failed to
|
||||
acquire a lock. This argument is only valid when
|
||||
``distributed_lock`` is ``True``.
|
||||
|
||||
.. versionadded:: 0.5.0
|
||||
|
||||
:param connection_pool: ``redis.ConnectionPool`` object. If provided,
|
||||
this object supersedes other connection arguments passed to the
|
||||
``redis.StrictRedis`` instance, including url and/or host as well as
|
||||
socket_timeout, and will be passed to ``redis.StrictRedis`` as the
|
||||
source of connectivity.
|
||||
|
||||
.. versionadded:: 0.5.4
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, arguments):
|
||||
arguments = arguments.copy()
|
||||
self._imports()
|
||||
self.url = arguments.pop('url', None)
|
||||
self.host = arguments.pop('host', 'localhost')
|
||||
self.password = arguments.pop('password', None)
|
||||
self.port = arguments.pop('port', 6379)
|
||||
self.db = arguments.pop('db', 0)
|
||||
self.distributed_lock = arguments.get('distributed_lock', False)
|
||||
self.socket_timeout = arguments.pop('socket_timeout', None)
|
||||
|
||||
self.lock_timeout = arguments.get('lock_timeout', None)
|
||||
self.lock_sleep = arguments.get('lock_sleep', 0.1)
|
||||
|
||||
self.redis_expiration_time = arguments.pop('redis_expiration_time', 0)
|
||||
self.connection_pool = arguments.get('connection_pool', None)
|
||||
self.client = self._create_client()
|
||||
|
||||
def _imports(self):
|
||||
# defer imports until backend is used
|
||||
global redis
|
||||
import redis # noqa
|
||||
|
||||
def _create_client(self):
|
||||
if self.connection_pool is not None:
|
||||
# the connection pool already has all other connection
|
||||
# options present within, so here we disregard socket_timeout
|
||||
# and others.
|
||||
return redis.StrictRedis(connection_pool=self.connection_pool)
|
||||
|
||||
args = {}
|
||||
if self.socket_timeout:
|
||||
args['socket_timeout'] = self.socket_timeout
|
||||
|
||||
if self.url is not None:
|
||||
args.update(url=self.url)
|
||||
return redis.StrictRedis.from_url(**args)
|
||||
else:
|
||||
args.update(
|
||||
host=self.host, password=self.password,
|
||||
port=self.port, db=self.db
|
||||
)
|
||||
return redis.StrictRedis(**args)
|
||||
|
||||
def get_mutex(self, key):
|
||||
if self.distributed_lock:
|
||||
return self.client.lock(u('_lock{0}').format(key),
|
||||
self.lock_timeout, self.lock_sleep)
|
||||
else:
|
||||
return None
|
||||
|
||||
def get(self, key):
|
||||
value = self.client.get(key)
|
||||
if value is None:
|
||||
return NO_VALUE
|
||||
return pickle.loads(value)
|
||||
|
||||
def get_multi(self, keys):
|
||||
if not keys:
|
||||
return []
|
||||
values = self.client.mget(keys)
|
||||
return [
|
||||
pickle.loads(v) if v is not None else NO_VALUE
|
||||
for v in values]
|
||||
|
||||
def set(self, key, value):
|
||||
if self.redis_expiration_time:
|
||||
self.client.setex(key, self.redis_expiration_time,
|
||||
pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
|
||||
else:
|
||||
self.client.set(key, pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
|
||||
|
||||
def set_multi(self, mapping):
|
||||
mapping = dict(
|
||||
(k, pickle.dumps(v, pickle.HIGHEST_PROTOCOL))
|
||||
for k, v in mapping.items()
|
||||
)
|
||||
|
||||
if not self.redis_expiration_time:
|
||||
self.client.mset(mapping)
|
||||
else:
|
||||
pipe = self.client.pipeline()
|
||||
for key, value in mapping.items():
|
||||
pipe.setex(key, self.redis_expiration_time, value)
|
||||
pipe.execute()
|
||||
|
||||
def delete(self, key):
|
||||
self.client.delete(key)
|
||||
|
||||
def delete_multi(self, keys):
|
||||
self.client.delete(*keys)
|
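A configuration sketch tying this backend to ``CacheRegion.cache_on_arguments`` (public dogpile.cache API; the ``load_user`` function is a hypothetical stand-in for an expensive lookup)::

    from dogpile.cache import make_region

    region = make_region().configure(
        'dogpile.cache.redis',
        expiration_time=300,
        arguments={
            'url': 'redis://localhost:6379/0',
            'distributed_lock': True
        }
    )

    @region.cache_on_arguments()
    def load_user(user_id):
        return {'id': user_id}  # expensive work goes here

    load_user(1)  # computed, pickled, and stored in Redis under a generated key
    load_user(1)  # served from Redis until the expiration elapses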
25
libs/dogpile2.7/cache/exception.py
vendored
Normal file
@@ -0,0 +1,25 @@
"""Exception classes for dogpile.cache."""
|
||||
|
||||
|
||||
class DogpileCacheException(Exception):
|
||||
"""Base Exception for dogpile.cache exceptions to inherit from."""
|
||||
|
||||
|
||||
class RegionAlreadyConfigured(DogpileCacheException):
|
||||
"""CacheRegion instance is already configured."""
|
||||
|
||||
|
||||
class RegionNotConfigured(DogpileCacheException):
|
||||
"""CacheRegion instance has not been configured."""
|
||||
|
||||
|
||||
class ValidationError(DogpileCacheException):
|
||||
"""Error validating a value or option."""
|
||||
|
||||
|
||||
class PluginNotFound(DogpileCacheException):
|
||||
"""The specified plugin could not be found.
|
||||
|
||||
.. versionadded:: 0.6.4
|
||||
|
||||
"""
|
0
libs/dogpile2.7/cache/plugins/__init__.py
vendored
Normal file
90
libs/dogpile2.7/cache/plugins/mako_cache.py
vendored
Normal file
@@ -0,0 +1,90 @@
"""
|
||||
Mako Integration
|
||||
----------------
|
||||
|
||||
dogpile.cache includes a `Mako <http://www.makotemplates.org>`_ plugin
|
||||
that replaces `Beaker <http://beaker.groovie.org>`_
|
||||
as the cache backend.
|
||||
Setup a Mako template lookup using the "dogpile.cache" cache implementation
|
||||
and a region dictionary::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
from mako.lookup import TemplateLookup
|
||||
|
||||
my_regions = {
|
||||
"local":make_region().configure(
|
||||
"dogpile.cache.dbm",
|
||||
expiration_time=360,
|
||||
arguments={"filename":"file.dbm"}
|
||||
),
|
||||
"memcached":make_region().configure(
|
||||
"dogpile.cache.pylibmc",
|
||||
expiration_time=3600,
|
||||
arguments={"url":["127.0.0.1"]}
|
||||
)
|
||||
}
|
||||
|
||||
mako_lookup = TemplateLookup(
|
||||
directories=["/myapp/templates"],
|
||||
cache_impl="dogpile.cache",
|
||||
cache_args={
|
||||
'regions':my_regions
|
||||
}
|
||||
)
|
||||
|
||||
To use the above configuration in a template, use the ``cached=True``
|
||||
argument on any Mako tag which accepts it, in conjunction with the
|
||||
name of the desired region as the ``cache_region`` argument::
|
||||
|
||||
<%def name="mysection()" cached="True" cache_region="memcached">
|
||||
some content that's cached
|
||||
</%def>
|
||||
|
||||
|
||||
"""
|
||||
from mako.cache import CacheImpl
|
||||
|
||||
|
||||
class MakoPlugin(CacheImpl):
|
||||
"""A Mako ``CacheImpl`` which talks to dogpile.cache."""
|
||||
|
||||
def __init__(self, cache):
|
||||
super(MakoPlugin, self).__init__(cache)
|
||||
try:
|
||||
self.regions = self.cache.template.cache_args['regions']
|
||||
except KeyError:
|
||||
raise KeyError(
|
||||
"'cache_regions' argument is required on the "
|
||||
"Mako Lookup or Template object for usage "
|
||||
"with the dogpile.cache plugin.")
|
||||
|
||||
def _get_region(self, **kw):
|
||||
try:
|
||||
region = kw['region']
|
||||
except KeyError:
|
||||
raise KeyError(
|
||||
"'cache_region' argument must be specified with 'cache=True'"
|
||||
"within templates for usage with the dogpile.cache plugin.")
|
||||
try:
|
||||
return self.regions[region]
|
||||
except KeyError:
|
||||
raise KeyError("No such region '%s'" % region)
|
||||
|
||||
def get_and_replace(self, key, creation_function, **kw):
|
||||
expiration_time = kw.pop("timeout", None)
|
||||
return self._get_region(**kw).get_or_create(
|
||||
key, creation_function,
|
||||
expiration_time=expiration_time)
|
||||
|
||||
def get_or_create(self, key, creation_function, **kw):
|
||||
return self.get_and_replace(key, creation_function, **kw)
|
||||
|
||||
def put(self, key, value, **kw):
|
||||
self._get_region(**kw).put(key, value)
|
||||
|
||||
def get(self, key, **kw):
|
||||
expiration_time = kw.pop("timeout", None)
|
||||
return self._get_region(**kw).get(key, expiration_time=expiration_time)
|
||||
|
||||
def invalidate(self, key, **kw):
|
||||
self._get_region(**kw).delete(key)
|
95
libs/dogpile2.7/cache/proxy.py
vendored
Normal file
@@ -0,0 +1,95 @@
"""
|
||||
Proxy Backends
|
||||
------------------
|
||||
|
||||
Provides a utility and a decorator class that allow for modifying the behavior
|
||||
of different backends without altering the class itself or having to extend the
|
||||
base backend.
|
||||
|
||||
.. versionadded:: 0.5.0 Added support for the :class:`.ProxyBackend` class.
|
||||
|
||||
"""
|
||||
|
||||
from .api import CacheBackend
|
||||
|
||||
|
||||
class ProxyBackend(CacheBackend):
|
||||
"""A decorator class for altering the functionality of backends.
|
||||
|
||||
Basic usage::
|
||||
|
||||
from dogpile.cache import make_region
|
||||
from dogpile.cache.proxy import ProxyBackend
|
||||
|
||||
class MyFirstProxy(ProxyBackend):
|
||||
def get(self, key):
|
||||
# ... custom code goes here ...
|
||||
return self.proxied.get(key)
|
||||
|
||||
def set(self, key, value):
|
||||
# ... custom code goes here ...
|
||||
self.proxied.set(key)
|
||||
|
||||
class MySecondProxy(ProxyBackend):
|
||||
def get(self, key):
|
||||
# ... custom code goes here ...
|
||||
return self.proxied.get(key)
|
||||
|
||||
|
||||
region = make_region().configure(
|
||||
'dogpile.cache.dbm',
|
||||
expiration_time = 3600,
|
||||
arguments = {
|
||||
"filename":"/path/to/cachefile.dbm"
|
||||
},
|
||||
wrap = [ MyFirstProxy, MySecondProxy ]
|
||||
)
|
||||
|
||||
Classes that extend :class:`.ProxyBackend` can be stacked
|
||||
together. The ``.proxied`` property will always
|
||||
point to either the concrete backend instance or
|
||||
the next proxy in the chain that a method can be
|
||||
delegated towards.
|
||||
|
||||
.. versionadded:: 0.5.0
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.proxied = None
|
||||
|
||||
def wrap(self, backend):
|
||||
''' Take a backend as an argument and setup the self.proxied property.
|
||||
Return an object that be used as a backend by a :class:`.CacheRegion`
|
||||
object.
|
||||
'''
|
||||
assert(
|
||||
isinstance(backend, CacheBackend) or
|
||||
isinstance(backend, ProxyBackend))
|
||||
self.proxied = backend
|
||||
return self
|
||||
|
||||
#
|
||||
# Delegate any functions that are not already overridden to
|
||||
# the proxies backend
|
||||
#
|
||||
def get(self, key):
|
||||
return self.proxied.get(key)
|
||||
|
||||
def set(self, key, value):
|
||||
self.proxied.set(key, value)
|
||||
|
||||
def delete(self, key):
|
||||
self.proxied.delete(key)
|
||||
|
||||
def get_multi(self, keys):
|
||||
return self.proxied.get_multi(keys)
|
||||
|
||||
def set_multi(self, mapping):
|
||||
self.proxied.set_multi(mapping)
|
||||
|
||||
def delete_multi(self, keys):
|
||||
self.proxied.delete_multi(keys)
|
||||
|
||||
def get_mutex(self, key):
|
||||
return self.proxied.get_mutex(key)
|
1498
libs/dogpile2.7/cache/region.py
vendored
Normal file
File diff suppressed because it is too large
Load diff
146
libs/dogpile2.7/cache/util.py
vendored
Normal file
@@ -0,0 +1,146 @@
from hashlib import sha1
import inspect
from ..util import compat
from ..util import langhelpers


def function_key_generator(namespace, fn, to_str=compat.string_type):
    """Return a function that generates a string
    key, based on a given function as well as
    arguments to the returned function itself.

    This is used by :meth:`.CacheRegion.cache_on_arguments`
    to generate a cache key from a decorated function.

    An alternate function may be used by specifying
    the :paramref:`.CacheRegion.function_key_generator` argument
    for :class:`.CacheRegion`.

    .. seealso::

        :func:`.kwarg_function_key_generator` - similar function that also
        takes keyword arguments into account

    """

    if namespace is None:
        namespace = '%s:%s' % (fn.__module__, fn.__name__)
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    args = inspect.getargspec(fn)
    has_self = args[0] and args[0][0] in ('self', 'cls')

    def generate_key(*args, **kw):
        if kw:
            raise ValueError(
                "dogpile.cache's default key creation "
                "function does not accept keyword arguments.")
        if has_self:
            args = args[1:]

        return namespace + "|" + " ".join(map(to_str, args))
    return generate_key


def function_multi_key_generator(namespace, fn, to_str=compat.string_type):

    if namespace is None:
        namespace = '%s:%s' % (fn.__module__, fn.__name__)
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    args = inspect.getargspec(fn)
    has_self = args[0] and args[0][0] in ('self', 'cls')

    def generate_keys(*args, **kw):
        if kw:
            raise ValueError(
                "dogpile.cache's default key creation "
                "function does not accept keyword arguments.")
        if has_self:
            args = args[1:]
        return [namespace + "|" + key for key in map(to_str, args)]
    return generate_keys


def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
    """Return a function that generates a string
    key, based on a given function as well as
    arguments to the returned function itself.

    For kwargs passed in, we will build a dict of
    all argname (key) argvalue (values) including
    default args from the argspec and then
    alphabetize the list before generating the
    key.

    .. versionadded:: 0.6.2

    .. seealso::

        :func:`.function_key_generator` - default key generation function

    """

    if namespace is None:
        namespace = '%s:%s' % (fn.__module__, fn.__name__)
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    argspec = inspect.getargspec(fn)
    default_list = list(argspec.defaults or [])
    # Reverse the list, as we want to compare the argspec by negative index,
    # meaning default_list[0] should be args[-1], which works well with
    # enumerate()
    default_list.reverse()
    # use idx*-1 to create the correct right-lookup index.
    args_with_defaults = dict((argspec.args[(idx*-1)], default)
                              for idx, default in enumerate(default_list, 1))
    if argspec.args and argspec.args[0] in ('self', 'cls'):
        arg_index_start = 1
    else:
        arg_index_start = 0

    def generate_key(*args, **kwargs):
        as_kwargs = dict(
            [(argspec.args[idx], arg)
             for idx, arg in enumerate(args[arg_index_start:],
                                       arg_index_start)])
        as_kwargs.update(kwargs)
        for arg, val in args_with_defaults.items():
            if arg not in as_kwargs:
                as_kwargs[arg] = val

        argument_values = [as_kwargs[key]
                           for key in sorted(as_kwargs.keys())]
        return namespace + '|' + " ".join(map(to_str, argument_values))
    return generate_key


def sha1_mangle_key(key):
    """a SHA1 key mangler."""

    return sha1(key).hexdigest()


def length_conditional_mangler(length, mangler):
    """a key mangler that mangles if the length of the key is
    past a certain threshold.

    """
    def mangle(key):
        if len(key) >= length:
            return mangler(key)
        else:
            return key
    return mangle

# in the 0.6 release these functions were moved to the dogpile.util namespace.
# They are linked here to maintain compatibility with older versions.

coerce_string_conf = langhelpers.coerce_string_conf
KeyReentrantMutex = langhelpers.KeyReentrantMutex
memoized_property = langhelpers.memoized_property
PluginLoader = langhelpers.PluginLoader
to_list = langhelpers.to_list
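To see the key format the default generator produces, a short sketch assuming ``function_key_generator`` from this module is in scope (``get_user`` is a hypothetical function)::

    def get_user(user_id, name):
        return {'id': user_id, 'name': name}

    gen = function_key_generator(None, get_user)
    print(gen(5, 'ed'))
    # namespace=None derives the prefix from the function itself,
    # e.g. "__main__:get_user|5 ed"

    gen_ns = function_key_generator('v2', get_user)
    print(gen_ns(5, 'ed'))
    # an explicit namespace is appended to the prefix:
    # "__main__:get_user|v2|5 ed"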
17
libs/dogpile2.7/core.py
Normal file
@@ -0,0 +1,17 @@
"""Compatibility namespace for those using dogpile.core.
|
||||
|
||||
As of dogpile.cache 0.6.0, dogpile.core as a separate package
|
||||
is no longer used by dogpile.cache.
|
||||
|
||||
Note that this namespace will not take effect if an actual
|
||||
dogpile.core installation is present.
|
||||
|
||||
"""
|
||||
|
||||
from .util import nameregistry # noqa
|
||||
from .util import readwrite_lock # noqa
|
||||
from .util.readwrite_lock import ReadWriteMutex # noqa
|
||||
from .util.nameregistry import NameRegistry # noqa
|
||||
from .lock import Lock # noqa
|
||||
from .lock import NeedRegenerationException # noqa
|
||||
from . import __version__ # noqa
|
158
libs/dogpile2.7/lock.py
Normal file
@@ -0,0 +1,158 @@
import time
import logging

log = logging.getLogger(__name__)


class NeedRegenerationException(Exception):
    """An exception that when raised in the 'with' block,
    forces the 'has_value' flag to False and incurs a
    regeneration of the value.

    """

NOT_REGENERATED = object()


class Lock(object):
    """Dogpile lock class.

    Provides an interface around an arbitrary mutex
    that allows one thread/process to be elected as
    the creator of a new value, while other threads/processes
    continue to return the previous version
    of that value.

    :param mutex: A mutex object that provides ``acquire()``
     and ``release()`` methods.
    :param creator: Callable which returns a tuple of the form
     (new_value, creation_time).  "new_value" should be a newly
     generated value representing completed state.  "creation_time"
     should be a floating point time value which is relative
     to Python's ``time.time()`` call, representing the time
     at which the value was created.  This time value should
     be associated with the created value.
    :param value_and_created_fn: Callable which returns
     a tuple of the form (existing_value, creation_time).  This
     basically should return what the last local call to the ``creator()``
     callable has returned, i.e. the value and the creation time,
     which would be assumed here to be from a cache.  If the
     value is not available, the :class:`.NeedRegenerationException`
     exception should be thrown.
    :param expiretime: Expiration time in seconds.  Set to
     ``None`` for never expires.  This timestamp is compared
     to the creation_time result and ``time.time()`` to determine if
     the value returned by value_and_created_fn is "expired".
    :param async_creator: A callable.  If specified, this callable will be
     passed the mutex as an argument and is responsible for releasing the mutex
     after it finishes some asynchronous value creation.  The intent is for
     this to be used to defer invocation of the creator callable until some
     later time.

    """

    def __init__(
            self,
            mutex,
            creator,
            value_and_created_fn,
            expiretime,
            async_creator=None,
    ):
        self.mutex = mutex
        self.creator = creator
        self.value_and_created_fn = value_and_created_fn
        self.expiretime = expiretime
        self.async_creator = async_creator

    def _is_expired(self, createdtime):
        """Return true if the expiration time is reached, or no
        value is available."""

        return not self._has_value(createdtime) or \
            (
                self.expiretime is not None and
                time.time() - createdtime > self.expiretime
            )

    def _has_value(self, createdtime):
        """Return true if the creation function has proceeded
        at least once."""
        return createdtime > 0

    def _enter(self):
        value_fn = self.value_and_created_fn

        try:
            value = value_fn()
            value, createdtime = value
        except NeedRegenerationException:
            log.debug("NeedRegenerationException")
            value = NOT_REGENERATED
            createdtime = -1

        generated = self._enter_create(createdtime)

        if generated is not NOT_REGENERATED:
            generated, createdtime = generated
            return generated
        elif value is NOT_REGENERATED:
            try:
                value, createdtime = value_fn()
                return value
            except NeedRegenerationException:
                raise Exception("Generation function should "
                                "have just been called by a concurrent "
                                "thread.")
        else:
            return value

    def _enter_create(self, createdtime):

        if not self._is_expired(createdtime):
            return NOT_REGENERATED

        async = False

        if self._has_value(createdtime):
            if not self.mutex.acquire(False):
                log.debug("creation function in progress "
                          "elsewhere, returning")
                return NOT_REGENERATED
        else:
            log.debug("no value, waiting for create lock")
            self.mutex.acquire()

        try:
            log.debug("value creation lock %r acquired" % self.mutex)

            # see if someone created the value already
            try:
                value, createdtime = self.value_and_created_fn()
            except NeedRegenerationException:
                pass
            else:
                if not self._is_expired(createdtime):
                    log.debug("value already present")
                    return value, createdtime
                elif self.async_creator:
                    log.debug("Passing creation lock to async runner")
                    self.async_creator(self.mutex)
                    async = True
                    return value, createdtime

            log.debug("Calling creation function")
            created = self.creator()
            return created
        finally:
            if not async:
                self.mutex.release()
                log.debug("Released creation lock")

    def __enter__(self):
        return self._enter()

    def __exit__(self, type, value, traceback):
        pass
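A minimal sketch of the locking protocol, assuming ``Lock`` and ``NeedRegenerationException`` from this module are in scope; the ``cache`` dictionary and both callables are hypothetical stand-ins for a real backend::

    import time
    import threading

    cache = {}

    def value_and_created_fn():
        # return (value, creation_time) from the "cache",
        # or signal that regeneration is needed
        try:
            return cache['quote']
        except KeyError:
            raise NeedRegenerationException()

    def creator():
        # produce (new_value, creation_time) and store it
        cache['quote'] = ('a freshly computed value', time.time())
        return cache['quote']

    mutex = threading.Lock()
    with Lock(mutex, creator, value_and_created_fn, expiretime=10) as value:
        print(value)  # first entry generates; later entries reuse until expired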
4
libs/dogpile2.7/util/__init__.py
Normal file
@@ -0,0 +1,4 @@
from .nameregistry import NameRegistry  # noqa
from .readwrite_lock import ReadWriteMutex  # noqa
from .langhelpers import PluginLoader, memoized_property, \
    coerce_string_conf, to_list, KeyReentrantMutex  # noqa
65
libs/dogpile2.7/util/compat.py
Normal file
@@ -0,0 +1,65 @@
import sys

py2k = sys.version_info < (3, 0)
py3k = sys.version_info >= (3, 0)
py32 = sys.version_info >= (3, 2)
py27 = sys.version_info >= (2, 7)
jython = sys.platform.startswith('java')
win32 = sys.platform.startswith('win')

try:
    import threading
except ImportError:
    import dummy_threading as threading  # noqa


if py3k:  # pragma: no cover
    string_types = str,
    text_type = str
    string_type = str

    if py32:
        callable = callable
    else:
        def callable(fn):
            return hasattr(fn, '__call__')

    def u(s):
        return s

    def ue(s):
        return s

    import configparser
    import io
    import _thread as thread
else:
    string_types = basestring,
    text_type = unicode
    string_type = str

    def u(s):
        return unicode(s, "utf-8")

    def ue(s):
        return unicode(s, "unicode_escape")

    import ConfigParser as configparser  # noqa
    import StringIO as io  # noqa

    callable = callable  # noqa
    import thread  # noqa


if py3k or jython:
    import pickle
else:
    import cPickle as pickle  # noqa


def timedelta_total_seconds(td):
    if py27:
        return td.total_seconds()
    else:
        return (td.microseconds + (
            td.seconds + td.days * 24 * 3600) * 1e6) / 1e6
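``timedelta_total_seconds`` exists only to paper over the missing ``timedelta.total_seconds()`` on Python 2.6; a quick sketch of the equivalence, assuming the function is in scope::

    from datetime import timedelta

    delta = timedelta(days=1, seconds=30)
    # both branches produce the same float: 86400 + 30
    print(timedelta_total_seconds(delta))  # 86430.0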
123
libs/dogpile2.7/util/langhelpers.py
Normal file
@@ -0,0 +1,123 @@
import re
import collections
from . import compat


def coerce_string_conf(d):
    result = {}
    for k, v in d.items():
        if not isinstance(v, compat.string_types):
            result[k] = v
            continue

        v = v.strip()
        if re.match(r'^[-+]?\d+$', v):
            result[k] = int(v)
        elif re.match(r'^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?$', v):
            result[k] = float(v)
        elif v.lower() in ('false', 'true'):
            result[k] = v.lower() == 'true'
        elif v == 'None':
            result[k] = None
        else:
            result[k] = v
    return result


class PluginLoader(object):
    def __init__(self, group):
        self.group = group
        self.impls = {}

    def load(self, name):
        if name in self.impls:
            return self.impls[name]()
        else:  # pragma NO COVERAGE
            import pkg_resources
            for impl in pkg_resources.iter_entry_points(
                    self.group, name):
                self.impls[name] = impl.load
                return impl.load()
            else:
                raise self.NotFound(
                    "Can't load plugin %s %s" % (self.group, name)
                )

    def register(self, name, modulepath, objname):
        def load():
            mod = __import__(modulepath, fromlist=[objname])
            return getattr(mod, objname)
        self.impls[name] = load

    class NotFound(Exception):
        """The specified plugin could not be found."""


class memoized_property(object):
    """A read-only @property that is only evaluated once."""
    def __init__(self, fget, doc=None):
        self.fget = fget
        self.__doc__ = doc or fget.__doc__
        self.__name__ = fget.__name__

    def __get__(self, obj, cls):
        if obj is None:
            return self
        obj.__dict__[self.__name__] = result = self.fget(obj)
        return result


def to_list(x, default=None):
    """Coerce to a list."""
    if x is None:
        return default
    if not isinstance(x, (list, tuple)):
        return [x]
    else:
        return x


class KeyReentrantMutex(object):

    def __init__(self, key, mutex, keys):
        self.key = key
        self.mutex = mutex
        self.keys = keys

    @classmethod
    def factory(cls, mutex):
        # this collection holds zero or one
        # thread idents as the key; a set of
        # keynames held as the value.
        keystore = collections.defaultdict(set)

        def fac(key):
            return KeyReentrantMutex(key, mutex, keystore)
        return fac

    def acquire(self, wait=True):
        current_thread = compat.threading.current_thread().ident
        keys = self.keys.get(current_thread)
        if keys is not None and \
                self.key not in keys:
            # current lockholder, new key. add it in
            keys.add(self.key)
            return True
        elif self.mutex.acquire(wait=wait):
            # after acquire, create new set and add our key
            self.keys[current_thread].add(self.key)
            return True
        else:
            return False

    def release(self):
        current_thread = compat.threading.current_thread().ident
        keys = self.keys.get(current_thread)
        assert keys is not None, "this thread didn't do the acquire"
        assert self.key in keys, "No acquire held for key '%s'" % self.key
        keys.remove(self.key)
        if not keys:
            # when list of keys empty, remove
            # the thread ident and unlock.
            del self.keys[current_thread]
            self.mutex.release()
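``coerce_string_conf`` is what lets ini-file settings reach the backends with the right types; a sketch of its conversions (the keys below are hypothetical config names)::

    print(coerce_string_conf({
        'expiration_time': '3600',    # -> 3600 (int)
        'lock_sleep': '0.25',         # -> 0.25 (float)
        'distributed_lock': 'true',   # -> True (bool)
        'pool': 'None',               # -> None
        'host': 'localhost',          # left as a string
    }))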
84
libs/dogpile2.7/util/nameregistry.py
Normal file
@@ -0,0 +1,84 @@
from .compat import threading
import weakref


class NameRegistry(object):
    """Generates and returns an object, keeping it as a
    singleton for a certain identifier for as long as it's
    strongly referenced.

    e.g.::

        class MyFoo(object):
            "some important object."
            def __init__(self, identifier):
                self.identifier = identifier

        registry = NameRegistry(MyFoo)

        # thread 1:
        my_foo = registry.get("foo1")

        # thread 2
        my_foo = registry.get("foo1")

    Above, ``my_foo`` in both thread #1 and #2 will
    be *the same object*.  The constructor for
    ``MyFoo`` will be called once, passing the
    identifier ``foo1`` as the argument.

    When thread 1 and thread 2 both complete or
    otherwise delete references to ``my_foo``, the
    object is *removed* from the :class:`.NameRegistry` as
    a result of Python garbage collection.

    :param creator: A function that will create a new
     value, given the identifier passed to the :meth:`.NameRegistry.get`
     method.

    """
    _locks = weakref.WeakValueDictionary()
    _mutex = threading.RLock()

    def __init__(self, creator):
        """Create a new :class:`.NameRegistry`.


        """
        self._values = weakref.WeakValueDictionary()
        self._mutex = threading.RLock()
        self.creator = creator

    def get(self, identifier, *args, **kw):
        """Get and possibly create the value.

        :param identifier: Hash key for the value.
         If the creation function is called, this identifier
         will also be passed to the creation function.
        :param \*args, \**kw: Additional arguments which will
         also be passed to the creation function if it is
         called.

        """
        try:
            if identifier in self._values:
                return self._values[identifier]
            else:
                return self._sync_get(identifier, *args, **kw)
        except KeyError:
            return self._sync_get(identifier, *args, **kw)

    def _sync_get(self, identifier, *args, **kw):
        self._mutex.acquire()
        try:
            try:
                if identifier in self._values:
                    return self._values[identifier]
                else:
                    self._values[identifier] = value = self.creator(identifier, *args, **kw)
                    return value
            except KeyError:
                self._values[identifier] = value = self.creator(identifier, *args, **kw)
                return value
        finally:
            self._mutex.release()
132
libs/dogpile2.7/util/readwrite_lock.py
Normal file
@@ -0,0 +1,132 @@
from .compat import threading

import logging
log = logging.getLogger(__name__)


class LockError(Exception):
    pass


class ReadWriteMutex(object):
    """A mutex which allows multiple readers, single writer.

    :class:`.ReadWriteMutex` uses a Python ``threading.Condition``
    to provide this functionality across threads within a process.

    The Beaker package also contained a file-lock based version
    of this concept, so that readers/writers could be synchronized
    across processes with a common filesystem.  A future Dogpile
    release may include this additional class at some point.

    """

    def __init__(self):
        # counts how many asynchronous methods are executing
        self.async = 0

        # pointer to thread that is the current sync operation
        self.current_sync_operation = None

        # condition object to lock on
        self.condition = threading.Condition(threading.Lock())

    def acquire_read_lock(self, wait=True):
        """Acquire the 'read' lock."""
        self.condition.acquire()
        try:
            # see if a synchronous operation is waiting to start
            # or is already running, in which case we wait (or just
            # give up and return)
            if wait:
                while self.current_sync_operation is not None:
                    self.condition.wait()
            else:
                if self.current_sync_operation is not None:
                    return False

            self.async += 1
            log.debug("%s acquired read lock", self)
        finally:
            self.condition.release()

        if not wait:
            return True

    def release_read_lock(self):
        """Release the 'read' lock."""
        self.condition.acquire()
        try:
            self.async -= 1

            # check if we are the last asynchronous reader thread
            # out the door.
            if self.async == 0:
                # yes. so if a sync operation is waiting, notifyAll to wake
                # it up
                if self.current_sync_operation is not None:
                    self.condition.notifyAll()
            elif self.async < 0:
                raise LockError("Synchronizer error - too many "
                                "release_read_locks called")
            log.debug("%s released read lock", self)
        finally:
            self.condition.release()

    def acquire_write_lock(self, wait=True):
        """Acquire the 'write' lock."""
        self.condition.acquire()
        try:
            # here, we are not a synchronous reader, and after returning,
            # assuming waiting or immediate availability, we will be.

            if wait:
                # if another sync is working, wait
                while self.current_sync_operation is not None:
                    self.condition.wait()
            else:
                # if another sync is working,
                # we don't want to wait, so forget it
                if self.current_sync_operation is not None:
                    return False

            # establish ourselves as the current sync
            # this indicates to other read/write operations
            # that they should wait until this is None again
            self.current_sync_operation = threading.currentThread()

            # now wait again for asyncs to finish
            if self.async > 0:
                if wait:
                    # wait
                    self.condition.wait()
                else:
                    # we don't want to wait, so forget it
                    self.current_sync_operation = None
                    return False
            log.debug("%s acquired write lock", self)
        finally:
            self.condition.release()

        if not wait:
            return True

    def release_write_lock(self):
        """Release the 'write' lock."""
        self.condition.acquire()
        try:
            if self.current_sync_operation is not threading.currentThread():
                raise LockError("Synchronizer error - current thread doesn't "
                                "have the write lock")

            # reset the current sync operation so
            # another can get it
            self.current_sync_operation = None

            # tell everyone to get ready
            self.condition.notifyAll()

            log.debug("%s released write lock", self)
        finally:
            # everyone go !!
            self.condition.release()
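A sketch of the intended acquire/release pairing, assuming ``ReadWriteMutex`` from this module is in scope: any number of readers may hold the lock concurrently, while a writer drains them and runs alone::

    mutex = ReadWriteMutex()

    def read_shared_state():
        mutex.acquire_read_lock()
        try:
            pass  # many readers may be in here at once
        finally:
            mutex.release_read_lock()

    def write_shared_state():
        mutex.acquire_write_lock()
        try:
            pass  # exclusive: no readers or other writers
        finally:
            mutex.release_write_lock()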
127
libs/soupsieve/__init__.py
Normal file
@@ -0,0 +1,127 @@
"""
|
||||
Soup Sieve.
|
||||
|
||||
A CSS selector filter for BeautifulSoup4.
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2018 Isaac Muse
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
from .__meta__ import __version__, __version_info__ # noqa: F401
|
||||
from . import css_parser as cp
|
||||
from . import css_match as cm
|
||||
from . import css_types as ct
|
||||
from .util import DEBUG, deprecated, SelectorSyntaxError # noqa: F401
|
||||
|
||||
__all__ = (
|
||||
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||
'closest', 'comments', 'compile', 'filter', 'icomments',
|
||||
'iselect', 'match', 'select', 'select_one'
|
||||
)
|
||||
|
||||
SoupSieve = cm.SoupSieve
|
||||
|
||||
|
||||
def compile(pattern, namespaces=None, flags=0, **kwargs): # noqa: A001
|
||||
"""Compile CSS pattern."""
|
||||
|
||||
if namespaces is not None:
|
||||
namespaces = ct.Namespaces(**namespaces)
|
||||
|
||||
custom = kwargs.get('custom')
|
||||
if custom is not None:
|
||||
custom = ct.CustomSelectors(**custom)
|
||||
|
||||
if isinstance(pattern, SoupSieve):
|
||||
if flags:
|
||||
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
|
||||
elif namespaces is not None:
|
||||
raise ValueError("Cannot process 'namespaces' argument on a compiled selector list")
|
||||
elif custom is not None:
|
||||
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
|
||||
return pattern
|
||||
|
||||
return cp._cached_css_compile(pattern, namespaces, custom, flags)
|
||||
|
||||
|
||||
def purge():
|
||||
"""Purge cached patterns."""
|
||||
|
||||
cp._purge_cache()
|
||||
|
||||
|
||||
def closest(select, tag, namespaces=None, flags=0, **kwargs):
|
||||
"""Match closest ancestor."""
|
||||
|
||||
return compile(select, namespaces, flags, **kwargs).closest(tag)
|
||||
|
||||
|
||||
def match(select, tag, namespaces=None, flags=0, **kwargs):
|
||||
"""Match node."""
|
||||
|
||||
return compile(select, namespaces, flags, **kwargs).match(tag)
|
||||
|
||||
|
||||
def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
|
||||
"""Filter list of nodes."""
|
||||
|
||||
return compile(select, namespaces, flags, **kwargs).filter(iterable)
|
||||
|
||||
|
||||
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
|
||||
def comments(tag, limit=0, flags=0, **kwargs):
|
||||
"""Get comments only."""
|
||||
|
||||
return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)]
|
||||
|
||||
|
||||
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
|
||||
def icomments(tag, limit=0, flags=0, **kwargs):
|
||||
"""Iterate comments only."""
|
||||
|
||||
for comment in cm.CommentsMatch(tag).get_comments(limit):
|
||||
yield comment
|
||||
|
||||
|
||||
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
|
||||
"""Select a single tag."""
|
||||
|
||||
return compile(select, namespaces, flags, **kwargs).select_one(tag)
|
||||
|
||||
|
||||
def select(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
|
||||
"""Select the specified tags."""
|
||||
|
||||
return compile(select, namespaces, flags, **kwargs).select(tag, limit)
|
||||
|
||||
|
||||
def iselect(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
|
||||
"""Iterate the specified tags."""
|
||||
|
||||
for el in compile(select, namespaces, flags, **kwargs).iselect(tag, limit):
|
||||
yield el
|
||||
|
||||
|
||||
def escape(ident):
|
||||
"""Escape identifier."""
|
||||
|
||||
return cp.escape(ident)
|
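A minimal usage sketch of this module-level API against a BeautifulSoup tree (assumes ``bs4`` is installed)::

    import soupsieve as sv
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><p class="a">1</p><p>2</p></div>', 'html.parser')

    print(sv.select('p.a', soup))           # [<p class="a">1</p>]
    print(sv.select_one('div > p', soup))   # <p class="a">1</p>
    print(sv.match('div > p', soup.p))      # True
    print(sv.escape('a.b'))                 # a\.b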
190
libs/soupsieve/__meta__.py
Normal file
@@ -0,0 +1,190 @@
"""Meta related things."""
|
||||
from __future__ import unicode_literals
|
||||
from collections import namedtuple
|
||||
import re
|
||||
|
||||
RE_VER = re.compile(
|
||||
r'''(?x)
|
||||
(?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))?
|
||||
(?:(?P<type>a|b|rc)(?P<pre>\d+))?
|
||||
(?:\.post(?P<post>\d+))?
|
||||
(?:\.dev(?P<dev>\d+))?
|
||||
'''
|
||||
)
|
||||
|
||||
REL_MAP = {
|
||||
".dev": "",
|
||||
".dev-alpha": "a",
|
||||
".dev-beta": "b",
|
||||
".dev-candidate": "rc",
|
||||
"alpha": "a",
|
||||
"beta": "b",
|
||||
"candidate": "rc",
|
||||
"final": ""
|
||||
}
|
||||
|
||||
DEV_STATUS = {
|
||||
".dev": "2 - Pre-Alpha",
|
||||
".dev-alpha": "2 - Pre-Alpha",
|
||||
".dev-beta": "2 - Pre-Alpha",
|
||||
".dev-candidate": "2 - Pre-Alpha",
|
||||
"alpha": "3 - Alpha",
|
||||
"beta": "4 - Beta",
|
||||
"candidate": "4 - Beta",
|
||||
"final": "5 - Production/Stable"
|
||||
}
|
||||
|
||||
PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'}
|
||||
|
||||
|
||||
class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])):
|
||||
"""
|
||||
Get the version (PEP 440).
|
||||
|
||||
A biased approach to the PEP 440 semantic version.
|
||||
|
||||
Provides a tuple structure which is sorted for comparisons `v1 > v2` etc.
|
||||
(major, minor, micro, release type, pre-release build, post-release build, development release build)
|
||||
Release types are named in is such a way they are comparable with ease.
|
||||
Accessors to check if a development, pre-release, or post-release build. Also provides accessor to get
|
||||
development status for setup files.
|
||||
|
||||
How it works (currently):
|
||||
|
||||
- You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`.
|
||||
- To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`.
|
||||
The dot is used to ensure all development specifiers are sorted before `alpha`.
|
||||
You can specify a `dev` number for development builds, but do not have to as implicit development releases
|
||||
are allowed.
|
||||
- You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not
|
||||
allow implicit prereleases.
|
||||
- You can optionally set `post` to a value greater than zero to make the build a post release. While post releases
|
||||
are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be
|
||||
noted that we do not allow `post0` even though PEP 440 does not restrict this. This project specifically
|
||||
does not allow implicit post releases.
|
||||
- It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`.
|
||||
|
||||
Acceptable version releases:
|
||||
|
||||
```
|
||||
Version(1, 0, 0, "final") 1.0
|
||||
Version(1, 2, 0, "final") 1.2
|
||||
Version(1, 2, 3, "final") 1.2.3
|
||||
Version(1, 2, 0, ".dev-alpha", pre=4) 1.2a4
|
||||
Version(1, 2, 0, ".dev-beta", pre=4) 1.2b4
|
||||
Version(1, 2, 0, ".dev-candidate", pre=4) 1.2rc4
|
||||
Version(1, 2, 0, "final", post=1) 1.2.post1
|
||||
Version(1, 2, 3, ".dev") 1.2.3.dev0
|
||||
Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1
|
||||
```
|
||||
|
||||
"""
|
||||
|
||||
def __new__(cls, major, minor, micro, release="final", pre=0, post=0, dev=0):
|
||||
"""Validate version info."""
|
||||
|
||||
# Ensure all parts are positive integers.
|
||||
for value in (major, minor, micro, pre, post):
|
||||
if not (isinstance(value, int) and value >= 0):
|
||||
raise ValueError("All version parts except 'release' should be integers.")
|
||||
|
||||
if release not in REL_MAP:
|
||||
raise ValueError("'{}' is not a valid release type.".format(release))
|
||||
|
||||
# Ensure valid pre-release (we do not allow implicit pre-releases).
|
||||
if ".dev-candidate" < release < "final":
|
||||
if pre == 0:
|
||||
raise ValueError("Implicit pre-releases not allowed.")
|
||||
elif dev:
|
||||
raise ValueError("Version is not a development release.")
|
||||
elif post:
|
||||
raise ValueError("Post-releases are not allowed with pre-releases.")
|
||||
|
||||
# Ensure valid development or development/pre release
|
||||
elif release < "alpha":
|
||||
if release > ".dev" and pre == 0:
|
||||
raise ValueError("Implicit pre-release not allowed.")
|
||||
elif post:
|
||||
raise ValueError("Post-releases are not allowed with pre-releases.")
|
||||
|
||||
# Ensure a valid normal release
|
||||
else:
|
||||
if pre:
|
||||
raise ValueError("Version is not a pre-release.")
|
||||
elif dev:
|
||||
raise ValueError("Version is not a development release.")
|
||||
|
||||
return super(Version, cls).__new__(cls, major, minor, micro, release, pre, post, dev)
|
||||
|
||||
def _is_pre(self):
|
||||
"""Is prerelease."""
|
||||
|
||||
return self.pre > 0
|
||||
|
||||
def _is_dev(self):
|
||||
"""Is development."""
|
||||
|
||||
return bool(self.release < "alpha")
|
||||
|
||||
def _is_post(self):
|
||||
"""Is post."""
|
||||
|
||||
return self.post > 0
|
||||
|
||||
def _get_dev_status(self): # pragma: no cover
|
||||
"""Get development status string."""
|
||||
|
||||
return DEV_STATUS[self.release]
|
||||
|
||||
def _get_canonical(self):
|
||||
"""Get the canonical output string."""
|
||||
|
||||
# Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed..
|
||||
if self.micro == 0:
|
||||
ver = "{}.{}".format(self.major, self.minor)
|
||||
else:
|
||||
ver = "{}.{}.{}".format(self.major, self.minor, self.micro)
|
||||
if self._is_pre():
|
||||
ver += '{}{}'.format(REL_MAP[self.release], self.pre)
|
||||
if self._is_post():
|
||||
ver += ".post{}".format(self.post)
|
||||
if self._is_dev():
|
||||
ver += ".dev{}".format(self.dev)
|
||||
|
||||
return ver
|
||||
|
||||
|
||||
def parse_version(ver, pre=False):
|
||||
"""Parse version into a comparable Version tuple."""
|
||||
|
||||
m = RE_VER.match(ver)
|
||||
|
||||
# Handle major, minor, micro
|
||||
major = int(m.group('major'))
|
||||
minor = int(m.group('minor')) if m.group('minor') else 0
|
||||
micro = int(m.group('micro')) if m.group('micro') else 0
|
||||
|
||||
# Handle pre releases
|
||||
if m.group('type'):
|
||||
release = PRE_REL_MAP[m.group('type')]
|
||||
pre = int(m.group('pre'))
|
||||
else:
|
||||
release = "final"
|
||||
pre = 0
|
||||
|
||||
# Handle development releases
|
||||
dev = m.group('dev') if m.group('dev') else 0
|
||||
if m.group('dev'):
|
||||
dev = int(m.group('dev'))
|
||||
release = '.dev-' + release if pre else '.dev'
|
||||
else:
|
||||
dev = 0
|
||||
|
||||
# Handle post
|
||||
post = int(m.group('post')) if m.group('post') else 0
|
||||
|
||||
return Version(major, minor, micro, release, pre, post, dev)
|
||||
|
||||
|
||||
__version_info__ = Version(1, 9, 3, "final")
|
||||
__version__ = __version_info__._get_canonical()
|
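A sketch of how version strings round-trip through ``parse_version`` and ``_get_canonical``, assuming both names from this module are in scope::

    v = parse_version('1.9.3')
    print(v)                    # Version(major=1, minor=9, micro=3, release='final', pre=0, post=0, dev=0)
    print(v._get_canonical())   # '1.9.3'

    print(parse_version('1.2b4'))       # release='beta', pre=4
    print(parse_version('1.2.3.dev1'))  # release='.dev', dev=1
    print(parse_version('1.2rc1')._get_canonical())  # '1.2rc1'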
1472
libs/soupsieve/css_match.py
Normal file
File diff suppressed because it is too large
Load diff
1218
libs/soupsieve/css_parser.py
Normal file
File diff suppressed because it is too large
Load diff
344
libs/soupsieve/css_types.py
Normal file
@@ -0,0 +1,344 @@
"""CSS selector structure items."""
|
||||
from __future__ import unicode_literals
|
||||
from . import util
|
||||
|
||||
__all__ = (
|
||||
'Selector',
|
||||
'SelectorNull',
|
||||
'SelectorTag',
|
||||
'SelectorAttribute',
|
||||
'SelectorContains',
|
||||
'SelectorNth',
|
||||
'SelectorLang',
|
||||
'SelectorList',
|
||||
'Namespaces',
|
||||
'CustomSelectors'
|
||||
)
|
||||
|
||||
|
||||
SEL_EMPTY = 0x1
|
||||
SEL_ROOT = 0x2
|
||||
SEL_DEFAULT = 0x4
|
||||
SEL_INDETERMINATE = 0x8
|
||||
SEL_SCOPE = 0x10
|
||||
SEL_DIR_LTR = 0x20
|
||||
SEL_DIR_RTL = 0x40
|
||||
SEL_IN_RANGE = 0x80
|
||||
SEL_OUT_OF_RANGE = 0x100
|
||||
SEL_DEFINED = 0x200
|
||||
|
||||
|
||||
class Immutable(object):
|
||||
"""Immutable."""
|
||||
|
||||
__slots__ = ('_hash',)
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize."""
|
||||
|
||||
temp = []
|
||||
for k, v in kwargs.items():
|
||||
temp.append(type(v))
|
||||
temp.append(v)
|
||||
super(Immutable, self).__setattr__(k, v)
|
||||
super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
|
||||
|
||||
@classmethod
|
||||
def __base__(cls):
|
||||
"""Get base class."""
|
||||
|
||||
return cls
|
||||
|
||||
def __eq__(self, other):
|
||||
"""Equal."""
|
||||
|
||||
return (
|
||||
isinstance(other, self.__base__()) and
|
||||
all([getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash'])
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
"""Equal."""
|
||||
|
||||
return (
|
||||
not isinstance(other, self.__base__()) or
|
||||
any([getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash'])
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
"""Hash."""
|
||||
|
||||
return self._hash
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
"""Prevent mutability."""
|
||||
|
||||
raise AttributeError("'{}' is immutable".format(self.__class__.__name__))
|
||||
|
||||
def __repr__(self): # pragma: no cover
|
||||
"""Representation."""
|
||||
|
||||
return "{}({})".format(
|
||||
self.__base__(), ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]])
|
||||
)
|
||||
|
||||
__str__ = __repr__
|
||||
|
||||
|
||||
class ImmutableDict(util.Mapping):
|
||||
"""Hashable, immutable dictionary."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""Initialize."""
|
||||
|
||||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if (
|
||||
is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
|
||||
not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
|
||||
):
|
||||
raise TypeError('All values must be hashable')
|
||||
|
||||
self._d = dict(*args, **kwargs)
|
||||
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterator."""
|
||||
|
||||
return iter(self._d)
|
||||
|
||||
def __len__(self):
|
||||
"""Length."""
|
||||
|
||||
return len(self._d)
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""Get item: `namespace['key']`."""
|
||||
return self._d[key]
|
||||
|
||||
def __hash__(self):
|
||||
"""Hash."""
|
||||
|
||||
return self._hash
|
||||
|
||||
def __repr__(self): # pragma: no cover
|
||||
"""Representation."""
|
||||
|
||||
return "{!r}".format(self._d)
|
||||
|
||||
__str__ = __repr__
|
||||
|
||||
|
||||
class Namespaces(ImmutableDict):
|
||||
"""Namespaces."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""Initialize."""
|
||||
|
||||
# If there are arguments, check the first index.
|
||||
# `super` should fail if the user gave multiple arguments,
|
||||
# so don't bother checking that.
|
||||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||
|
||||
super(Namespaces, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class CustomSelectors(ImmutableDict):
|
||||
"""Custom selectors."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""Initialize."""
|
||||
|
||||
# If there are arguments, check the first index.
|
||||
# `super` should fail if the user gave multiple arguments,
|
||||
# so don't bother checking that.
|
||||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||
|
||||
super(CustomSelectors, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class Selector(Immutable):
|
||||
"""Selector."""
|
||||
|
||||
__slots__ = (
|
||||
'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors',
|
||||
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self, tag, ids, classes, attributes, nth, selectors,
|
||||
relation, rel_type, contains, lang, flags
|
||||
):
|
||||
"""Initialize."""
|
||||
|
||||
super(Selector, self).__init__(
|
||||
tag=tag,
|
||||
ids=ids,
|
||||
classes=classes,
|
||||
attributes=attributes,
|
||||
nth=nth,
|
||||
selectors=selectors,
|
||||
relation=relation,
|
||||
rel_type=rel_type,
|
||||
contains=contains,
|
||||
lang=lang,
|
||||
flags=flags
|
||||
)
|
||||
|
||||
|
||||
class SelectorNull(Immutable):
|
||||
"""Null Selector."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorNull, self).__init__()
|
||||
|
||||
|
||||
class SelectorTag(Immutable):
|
||||
"""Selector tag."""
|
||||
|
||||
__slots__ = ("name", "prefix", "_hash")
|
||||
|
||||
def __init__(self, name, prefix):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorTag, self).__init__(
|
||||
name=name,
|
||||
prefix=prefix
|
||||
)
|
||||
|
||||
|
||||
class SelectorAttribute(Immutable):
|
||||
"""Selector attribute rule."""
|
||||
|
||||
__slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash")
|
||||
|
||||
def __init__(self, attribute, prefix, pattern, xml_type_pattern):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorAttribute, self).__init__(
|
||||
attribute=attribute,
|
||||
prefix=prefix,
|
||||
pattern=pattern,
|
||||
xml_type_pattern=xml_type_pattern
|
||||
)
|
||||
|
||||
|
||||
class SelectorContains(Immutable):
|
||||
"""Selector contains rule."""
|
||||
|
||||
__slots__ = ("text", "_hash")
|
||||
|
||||
def __init__(self, text):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorContains, self).__init__(
|
||||
text=text
|
||||
)
|
||||
|
||||
|
||||
class SelectorNth(Immutable):
|
||||
"""Selector nth type."""
|
||||
|
||||
__slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash")
|
||||
|
||||
def __init__(self, a, n, b, of_type, last, selectors):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorNth, self).__init__(
|
||||
a=a,
|
||||
n=n,
|
||||
b=b,
|
||||
of_type=of_type,
|
||||
last=last,
|
||||
selectors=selectors
|
||||
)
|
||||
|
||||
|
||||
class SelectorLang(Immutable):
|
||||
"""Selector language rules."""
|
||||
|
||||
__slots__ = ("languages", "_hash",)
|
||||
|
||||
def __init__(self, languages):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorLang, self).__init__(
|
||||
languages=tuple(languages)
|
||||
)
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterator."""
|
||||
|
||||
return iter(self.languages)
|
||||
|
||||
def __len__(self): # pragma: no cover
|
||||
"""Length."""
|
||||
|
||||
return len(self.languages)
|
||||
|
||||
def __getitem__(self, index): # pragma: no cover
|
||||
"""Get item."""
|
||||
|
||||
return self.languages[index]
|
||||
|
||||
|
||||
class SelectorList(Immutable):
|
||||
"""Selector list."""
|
||||
|
||||
__slots__ = ("selectors", "is_not", "is_html", "_hash")
|
||||
|
||||
def __init__(self, selectors=tuple(), is_not=False, is_html=False):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorList, self).__init__(
|
||||
selectors=tuple(selectors),
|
||||
is_not=is_not,
|
||||
is_html=is_html
|
||||
)
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterator."""
|
||||
|
||||
return iter(self.selectors)
|
||||
|
||||
def __len__(self):
|
||||
"""Length."""
|
||||
|
||||
return len(self.selectors)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Get item."""
|
||||
|
||||
return self.selectors[index]
|
||||
|
||||
|
||||
def _pickle(p):
|
||||
return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]])
|
||||
|
||||
|
||||
def pickle_register(obj):
|
||||
"""Allow object to be pickled."""
|
||||
|
||||
util.copyreg.pickle(obj, _pickle)
|
||||
|
||||
|
||||
pickle_register(Selector)
|
||||
pickle_register(SelectorNull)
|
||||
pickle_register(SelectorTag)
|
||||
pickle_register(SelectorAttribute)
|
||||
pickle_register(SelectorContains)
|
||||
pickle_register(SelectorNth)
|
||||
pickle_register(SelectorLang)
|
||||
pickle_register(SelectorList)
|
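Since every node type above is an `Immutable` with a precomputed `_hash` and a registered `_pickle` reducer, parsed selector trees can be used as dictionary keys and round-tripped through `pickle`. A small illustrative sketch (not part of the diff):

# Illustrative only -- not part of the diff:
import pickle

tag = SelectorTag('div', None)
assert tag == SelectorTag('div', None)                 # structural equality via __slots__
assert hash(tag) == hash(SelectorTag('div', None))     # stable, precomputed hash
cache = {tag: 'compiled pattern'}                      # usable as a dict key
assert pickle.loads(pickle.dumps(tag)) == tag          # reconstructed through _pickle()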
170
libs/soupsieve/util.py
Normal file
@@ -0,0 +1,170 @@
"""Utility."""
|
||||
from __future__ import unicode_literals
|
||||
from functools import wraps
|
||||
import warnings
|
||||
import sys
|
||||
import struct
|
||||
import os
|
||||
import re
|
||||
MODULE = os.path.dirname(__file__)
|
||||
|
||||
PY3 = sys.version_info >= (3, 0)
|
||||
PY35 = sys.version_info >= (3, 5)
|
||||
PY37 = sys.version_info >= (3, 7)
|
||||
|
||||
if PY3:
|
||||
from functools import lru_cache # noqa F401
|
||||
import copyreg # noqa F401
|
||||
from collections.abc import Hashable, Mapping # noqa F401
|
||||
|
||||
ustr = str
|
||||
bstr = bytes
|
||||
unichar = chr
|
||||
string = str
|
||||
else:
|
||||
from backports.functools_lru_cache import lru_cache # noqa F401
|
||||
import copy_reg as copyreg # noqa F401
|
||||
from collections import Hashable, Mapping # noqa F401
|
||||
|
||||
ustr = unicode # noqa: F821
|
||||
bstr = str
|
||||
unichar = unichr # noqa: F821
|
||||
string = basestring # noqa: F821
|
||||
|
||||
DEBUG = 0x00001
|
||||
|
||||
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
|
||||
|
||||
LC_A = ord('a')
|
||||
LC_Z = ord('z')
|
||||
UC_A = ord('A')
|
||||
UC_Z = ord('Z')
|
||||
|
||||
|
||||
def lower(string):
|
||||
"""Lower."""
|
||||
|
||||
new_string = []
|
||||
for c in string:
|
||||
o = ord(c)
|
||||
new_string.append(chr(o + 32) if UC_A <= o <= UC_Z else c)
|
||||
return ''.join(new_string)
|
||||
|
||||
|
||||
def upper(string): # pragma: no cover
|
||||
"""Lower."""
|
||||
|
||||
new_string = []
|
||||
for c in string:
|
||||
o = ord(c)
|
||||
new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
|
||||
return ''.join(new_string)
|
||||
|
||||
|
||||
def uchr(i):
|
||||
"""Allow getting Unicode character on narrow python builds."""
|
||||
|
||||
try:
|
||||
return unichar(i)
|
||||
except ValueError: # pragma: no cover
|
||||
return struct.pack('i', i).decode('utf-32')
|
||||
|
||||
|
||||
def uord(c):
|
||||
"""Get Unicode ordinal."""
|
||||
|
||||
if len(c) == 2: # pragma: no cover
|
||||
high, low = [ord(p) for p in c]
|
||||
ordinal = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
|
||||
else:
|
||||
ordinal = ord(c)
|
||||
|
||||
return ordinal
|
||||
|
||||
|
||||
class SelectorSyntaxError(SyntaxError):
|
||||
"""Syntax error in a CSS selector."""
|
||||
|
||||
def __init__(self, msg, pattern=None, index=None):
|
||||
"""Initialize."""
|
||||
|
||||
self.line = None
|
||||
self.col = None
|
||||
self.context = None
|
||||
|
||||
if pattern is not None and index is not None:
|
||||
# Format pattern to show line and column position
|
||||
self.context, self.line, self.col = get_pattern_context(pattern, index)
|
||||
msg = '{}\n line {}:\n{}'.format(msg, self.line, self.context)
|
||||
|
||||
super(SelectorSyntaxError, self).__init__(msg)
|
||||
|
||||
|
||||
def deprecated(message, stacklevel=2): # pragma: no cover
|
||||
"""
|
||||
Raise a `DeprecationWarning` when wrapped function/method is called.
|
||||
|
||||
Borrowed from https://stackoverflow.com/a/48632082/866026
|
||||
"""
|
||||
|
||||
def _decorator(func):
|
||||
@wraps(func)
|
||||
def _func(*args, **kwargs):
|
||||
warnings.warn(
|
||||
"'{}' is deprecated. {}".format(func.__name__, message),
|
||||
category=DeprecationWarning,
|
||||
stacklevel=stacklevel
|
||||
)
|
||||
return func(*args, **kwargs)
|
||||
return _func
|
||||
return _decorator
|
||||
|
||||
|
||||
def warn_deprecated(message, stacklevel=2): # pragma: no cover
|
||||
"""Warn deprecated."""
|
||||
|
||||
warnings.warn(
|
||||
message,
|
||||
category=DeprecationWarning,
|
||||
stacklevel=stacklevel
|
||||
)
|
||||
|
||||
|
||||
def get_pattern_context(pattern, index):
|
||||
"""Get the pattern context."""
|
||||
|
||||
last = 0
|
||||
current_line = 1
|
||||
col = 1
|
||||
text = []
|
||||
line = 1
|
||||
|
||||
# Split pattern by newline and handle the text before the newline
|
||||
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
||||
linetext = pattern[last:m.start(0)]
|
||||
if not len(m.group(0)) and not len(text):
|
||||
indent = ''
|
||||
offset = -1
|
||||
col = index - last + 1
|
||||
elif last <= index < m.end(0):
|
||||
indent = '--> '
|
||||
offset = (-1 if index > m.start(0) else 0) + 3
|
||||
col = index - last + 1
|
||||
else:
|
||||
indent = ' '
|
||||
offset = None
|
||||
if len(text):
|
||||
# Regardless of whether we are presented with `\r\n`, `\r`, or `\n`,
|
||||
# we will render the output with just `\n`. We will still log the column
|
||||
# correctly though.
|
||||
text.append('\n')
|
||||
text.append('{}{}'.format(indent, linetext))
|
||||
if offset is not None:
|
||||
text.append('\n')
|
||||
text.append(' ' * (col + offset) + '^')
|
||||
line = current_line
|
||||
|
||||
current_line += 1
|
||||
last = m.end(0)
|
||||
|
||||
return ''.join(text), line, col
|
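`get_pattern_context` drives the error formatting in `SelectorSyntaxError`: it splits the pattern on any newline convention and points a caret at the failing column. An illustrative call (not part of the diff):

# Index 6 is the '<' starting the second line of this two-line pattern:
context, line, col = get_pattern_context('div >\n< span', 6)
print(context)
#     div >
# --> < span
#     ^
print(line, col)  # 2 1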
88
libs/subliminal/subtitles/__init__.py
Normal file
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
from datetime import time


class Component(object):
    """Base class for cue text.

    :param list components: sub-components of this one.

    """
    tag_name = 'Component'

    def __init__(self, components=None):
        if components is None:
            self.components = []
        elif isinstance(components, list):
            self.components = components
        else:
            self.components = [components]

    def __iter__(self):
        return iter(self.components)

    def __len__(self):
        return len(self.components)

    def __str__(self):
        return ''.join(str(c) for c in self.components)

    def __repr__(self):
        return '<{name}>{components}</{name}>'.format(name=self.tag_name,
                                                      components=''.join(repr(c) for c in self.components))


class Bold(Component):
    """Bold :class:`Component`."""
    tag_name = 'b'


class Italic(Component):
    """Italic :class:`Component`."""
    tag_name = 'i'


class Underline(Component):
    """Underline :class:`Component`."""
    tag_name = 'u'


class Strikethrough(Component):
    """Strikethrough :class:`Component`."""
    tag_name = 's'


class Font(Component):
    """Font :class:`Component`."""
    tag_name = 'font'

    def __init__(self, color, *args, **kwargs):
        super(Font, self).__init__(*args, **kwargs)
        self.color = color

    def __repr__(self):
        return '<{name} "{color}">{components}</{name}>'.format(name=self.tag_name, color=self.color,
                                                                components=''.join(repr(c) for c in self.components))


class Cue(object):
    """A single subtitle cue with timings and components.

    :param datetime.time start_time: start time.
    :param datetime.time end_time: end time.
    :param list components: cue components.

    """
    def __init__(self, start_time, end_time, components):
        self.start_time = start_time
        self.end_time = end_time
        self.components = components

    def __repr__(self):
        return '<Cue [{start_time}->{end_time}] "{text}">'.format(start_time=self.start_time, end_time=self.end_time,
                                                                  text=''.join(repr(c) for c in self.components))


if __name__ == '__main__':
    cue = Cue(time(), time(1), [Bold('Hello')])
    print(repr(cue))  # parenthesized so it runs on both Python 2 and 3
82
libs/subliminal/subtitles/subrip.py
Normal file
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
import re
from datetime import time
from io import StringIO

from subliminal.subtitles import Cue


index_re = re.compile(r'(?P<index>\d+)')
timing_re = re.compile(r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}),(?P<milliseconds>\d{3})')


class SubripReadError(Exception):
    pass


class SubripReadIndexError(SubripReadError):
    pass


class SubripReader(object):
    INDEX = 1
    TIMINGS = 2
    TEXT = 3

    def __init__(self):
        self.state = self.INDEX

    def read(self, content):
        pass

    def read_line(self, line):
        if self.state == self.INDEX:
            # in this state an index line is expected; anything else is an error
            if not index_re.match(line):
                raise SubripReadIndexError


def read_cue(stream):
    """Attempt to parse a complete Cue from the stream."""
    # skip blank lines
    line = ''
    while not line:
        line = stream.readline()
        if not line:  # end of stream
            raise SubripReadError
        line = line.strip()

    # parse index
    if not index_re.match(line):
        raise SubripReadIndexError

    # parse timings
    line = stream.readline()
    if '-->' not in line:
        raise SubripReadError
    timings = line.split('-->')
    if len(timings) != 2:
        raise SubripReadError

    # parse start time
    match = timing_re.match(timings[0].strip())
    if not match:
        raise SubripReadError
    groups = {k: int(v) for k, v in match.groupdict().items()}
    # datetime.time has no millisecond argument, so convert to microseconds
    start_time = time(groups['hour'], groups['minute'], groups['second'], groups['milliseconds'] * 1000)

    # parse end time
    match = timing_re.match(timings[1].strip())
    if not match:
        raise SubripReadError
    groups = {k: int(v) for k, v in match.groupdict().items()}
    end_time = time(groups['hour'], groups['minute'], groups['second'], groups['milliseconds'] * 1000)

    # read the cue text until the next blank line
    components = []
    line = stream.readline()
    while line.strip():
        components.append(line.strip())
        line = stream.readline()

    return Cue(start_time, end_time, components)


class SubripSubtitle(object):
    def __init__(self):
        self.cues = []


if __name__ == '__main__':
    # minimal smoke test with an in-memory SRT fragment
    stream = StringIO(u'1\n00:00:01,000 --> 00:00:02,500\nHello\n\n')
    print(repr(read_cue(stream)))
21
libs/subliminal2.7/__init__.py
Normal file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
__title__ = 'subliminal'
__version__ = '2.1.0.dev'
__short_version__ = '.'.join(__version__.split('.')[:2])
__author__ = 'Antoine Bertin'
__license__ = 'MIT'
__copyright__ = 'Copyright 2016, Antoine Bertin'

import logging

from .core import (AsyncProviderPool, ProviderPool, check_video, download_best_subtitles, download_subtitles,
                   list_subtitles, refine, save_subtitles, scan_video, scan_videos)
from .cache import region
from .exceptions import Error, ProviderError
from .extensions import provider_manager, refiner_manager
from .providers import Provider
from .score import compute_score, get_scores
from .subtitle import SUBTITLE_EXTENSIONS, Subtitle
from .video import VIDEO_EXTENSIONS, Episode, Movie, Video

logging.getLogger(__name__).addHandler(logging.NullHandler())
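The re-exports above form subliminal's high-level API. A hedged sketch of the canonical flow (illustrative only; the path is a placeholder and the cache region must be configured before use):

# Illustrative only -- not part of the diff:
from babelfish import Language

region.configure('dogpile.cache.memory')               # any dogpile backend works
video = scan_video('/data/The.Movie.2016.1080p.mkv')   # placeholder path
best = download_best_subtitles([video], {Language('eng')})
save_subtitles(video, best[video])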
16
libs/subliminal2.7/cache.py
Normal file
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
import datetime

from dogpile.cache import make_region

#: Expiration time for show caching
SHOW_EXPIRATION_TIME = datetime.timedelta(weeks=3).total_seconds()

#: Expiration time for episode caching
EPISODE_EXPIRATION_TIME = datetime.timedelta(days=3).total_seconds()

#: Expiration time for refiner searches
REFINER_EXPIRATION_TIME = datetime.timedelta(weeks=1).total_seconds()


region = make_region()
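A hedged sketch of how these expiration constants are consumed (the decorator is standard dogpile.cache API; the function name and body are hypothetical):

# Illustrative only -- not part of the diff:
region.configure('dogpile.cache.memory')

@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def get_show_id(series):  # hypothetical helper
    # stand-in for an expensive provider search
    return hash(series) % 10000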
458
libs/subliminal2.7/cli.py
Normal file
@@ -0,0 +1,458 @@
# -*- coding: utf-8 -*-
"""
Subliminal uses `click <http://click.pocoo.org>`_ to provide a powerful :abbr:`CLI (command-line interface)`.

"""
from __future__ import division
from collections import defaultdict
from datetime import timedelta
import glob
import json
import logging
import os
import re

from appdirs import AppDirs
from babelfish import Error as BabelfishError, Language
import click
from dogpile.cache.backends.file import AbstractFileLock
from dogpile.util.readwrite_lock import ReadWriteMutex
from six.moves import configparser

from subliminal import (AsyncProviderPool, Episode, Movie, Video, __version__, check_video, compute_score, get_scores,
                        provider_manager, refine, refiner_manager, region, save_subtitles, scan_video, scan_videos)
from subliminal.core import ARCHIVE_EXTENSIONS, search_external_subtitles

logger = logging.getLogger(__name__)


class MutexLock(AbstractFileLock):
    """:class:`MutexLock` is a thread-based rw lock based on :class:`dogpile.core.ReadWriteMutex`."""
    def __init__(self, filename):
        self.mutex = ReadWriteMutex()

    def acquire_read_lock(self, wait):
        ret = self.mutex.acquire_read_lock(wait)
        return wait or ret

    def acquire_write_lock(self, wait):
        ret = self.mutex.acquire_write_lock(wait)
        return wait or ret

    def release_read_lock(self):
        return self.mutex.release_read_lock()

    def release_write_lock(self):
        return self.mutex.release_write_lock()


class Config(object):
    """A :class:`~configparser.ConfigParser` wrapper to store configuration.

    Interaction with the configuration is done with the properties.

    :param str path: path to the configuration file.

    """
    def __init__(self, path):
        #: Path to the configuration file
        self.path = path

        #: The underlying configuration object
        self.config = configparser.SafeConfigParser()
        self.config.add_section('general')
        self.config.set('general', 'languages', json.dumps(['en']))
        self.config.set('general', 'providers', json.dumps(sorted([p.name for p in provider_manager])))
        self.config.set('general', 'refiners', json.dumps(sorted([r.name for r in refiner_manager])))
        self.config.set('general', 'single', str(0))
        self.config.set('general', 'embedded_subtitles', str(1))
        self.config.set('general', 'age', str(int(timedelta(weeks=2).total_seconds())))
        self.config.set('general', 'hearing_impaired', str(1))
        self.config.set('general', 'min_score', str(0))

    def read(self):
        """Read the configuration from :attr:`path`"""
        self.config.read(self.path)

    def write(self):
        """Write the configuration to :attr:`path`"""
        with open(self.path, 'w') as f:
            self.config.write(f)

    @property
    def languages(self):
        return {Language.fromietf(l) for l in json.loads(self.config.get('general', 'languages'))}

    @languages.setter
    def languages(self, value):
        self.config.set('general', 'languages', json.dumps(sorted([str(l) for l in value])))

    @property
    def providers(self):
        return json.loads(self.config.get('general', 'providers'))

    @providers.setter
    def providers(self, value):
        self.config.set('general', 'providers', json.dumps(sorted([p.lower() for p in value])))

    @property
    def refiners(self):
        return json.loads(self.config.get('general', 'refiners'))

    @refiners.setter
    def refiners(self, value):
        self.config.set('general', 'refiners', json.dumps([r.lower() for r in value]))

    @property
    def single(self):
        return self.config.getboolean('general', 'single')

    @single.setter
    def single(self, value):
        self.config.set('general', 'single', str(int(value)))

    @property
    def embedded_subtitles(self):
        return self.config.getboolean('general', 'embedded_subtitles')

    @embedded_subtitles.setter
    def embedded_subtitles(self, value):
        self.config.set('general', 'embedded_subtitles', str(int(value)))

    @property
    def age(self):
        return timedelta(seconds=self.config.getint('general', 'age'))

    @age.setter
    def age(self, value):
        self.config.set('general', 'age', str(int(value.total_seconds())))

    @property
    def hearing_impaired(self):
        return self.config.getboolean('general', 'hearing_impaired')

    @hearing_impaired.setter
    def hearing_impaired(self, value):
        self.config.set('general', 'hearing_impaired', str(int(value)))

    @property
    def min_score(self):
        return self.config.getfloat('general', 'min_score')

    @min_score.setter
    def min_score(self, value):
        self.config.set('general', 'min_score', str(value))

    @property
    def provider_configs(self):
        rv = {}
        for provider in provider_manager:
            if self.config.has_section(provider.name):
                rv[provider.name] = {k: v for k, v in self.config.items(provider.name)}
        return rv

    @provider_configs.setter
    def provider_configs(self, value):
        # loop over provider configurations
        for provider, config in value.items():
            # create the corresponding section if necessary
            if not self.config.has_section(provider):
                self.config.add_section(provider)

            # add config options
            for k, v in config.items():
                self.config.set(provider, k, v)


class LanguageParamType(click.ParamType):
    """:class:`~click.ParamType` for languages that returns a :class:`~babelfish.language.Language`"""
    name = 'language'

    def convert(self, value, param, ctx):
        try:
            return Language.fromietf(value)
        except BabelfishError:
            self.fail('%s is not a valid language' % value)

LANGUAGE = LanguageParamType()


class AgeParamType(click.ParamType):
    """:class:`~click.ParamType` for age strings that returns a :class:`~datetime.timedelta`

    An age string is in the form `number + identifier` with possible identifiers:

    * ``w`` for weeks
    * ``d`` for days
    * ``h`` for hours

    The form can be specified multiple times but only with that identifier ordering. For example:

    * ``1w2d4h`` for 1 week, 2 days and 4 hours
    * ``2w`` for 2 weeks
    * ``3w6h`` for 3 weeks and 6 hours

    """
    name = 'age'

    def convert(self, value, param, ctx):
        match = re.match(r'^(?:(?P<weeks>\d+?)w)?(?:(?P<days>\d+?)d)?(?:(?P<hours>\d+?)h)?$', value)
        if not match:
            self.fail('%s is not a valid age' % value)

        return timedelta(**{k: int(v) for k, v in match.groupdict(0).items()})

AGE = AgeParamType()

PROVIDER = click.Choice(sorted(provider_manager.names()))

REFINER = click.Choice(sorted(refiner_manager.names()))

dirs = AppDirs('subliminal')
cache_file = 'subliminal.dbm'
config_file = 'config.ini'


@click.group(context_settings={'max_content_width': 100}, epilog='Suggestions and bug reports are greatly appreciated: '
             'https://github.com/Diaoul/subliminal/')
@click.option('--addic7ed', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='Addic7ed configuration.')
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
              help='OpenSubtitles configuration.')
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
              show_default=True, expose_value=True, help='Path to the cache directory.')
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
@click.version_option(__version__)
@click.pass_context
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
    """Subtitles, faster than your thoughts."""
    # create cache directory
    try:
        os.makedirs(cache_dir)
    except OSError:
        if not os.path.isdir(cache_dir):
            raise

    # configure cache
    region.configure('dogpile.cache.dbm', expiration_time=timedelta(days=30),
                     arguments={'filename': os.path.join(cache_dir, cache_file), 'lock_factory': MutexLock})

    # configure logging
    if debug:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logging.getLogger('subliminal').addHandler(handler)
        logging.getLogger('subliminal').setLevel(logging.DEBUG)

    # provider configs
    ctx.obj = {'provider_configs': {}}
    if addic7ed:
        ctx.obj['provider_configs']['addic7ed'] = {'username': addic7ed[0], 'password': addic7ed[1]}
    if legendastv:
        ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
    if opensubtitles:
        ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}


@subliminal.command()
@click.option('--clear-subliminal', is_flag=True, help='Clear subliminal\'s cache. Use this ONLY if your cache is '
              'corrupted or if you experience issues.')
@click.pass_context
def cache(ctx, clear_subliminal):
    """Cache management."""
    if clear_subliminal:
        for file in glob.glob(os.path.join(ctx.parent.params['cache_dir'], cache_file) + '*'):
            os.remove(file)
        click.echo('Subliminal\'s cache cleared.')
    else:
        click.echo('Nothing done.')


@subliminal.command()
@click.option('-l', '--language', type=LANGUAGE, required=True, multiple=True, help='Language as IETF code, '
              'e.g. en, pt-BR (can be used multiple times).')
@click.option('-p', '--provider', type=PROVIDER, multiple=True, help='Provider to use (can be used multiple times).')
@click.option('-r', '--refiner', type=REFINER, multiple=True, help='Refiner to use (can be used multiple times).')
@click.option('-a', '--age', type=AGE, help='Filter videos newer than AGE, e.g. 12h, 1w2d.')
@click.option('-d', '--directory', type=click.STRING, metavar='DIR', help='Directory where to save subtitles, '
              'default is next to the video file.')
@click.option('-e', '--encoding', type=click.STRING, metavar='ENC', help='Subtitle file encoding, default is to '
              'preserve original encoding.')
@click.option('-s', '--single', is_flag=True, default=False, help='Save subtitle without language code in the file '
              'name, i.e. use .srt extension. Do not use this unless your media player requires it.')
@click.option('-f', '--force', is_flag=True, default=False, help='Force download even if a subtitle already exists.')
@click.option('-hi', '--hearing-impaired', is_flag=True, default=False, help='Prefer hearing impaired subtitles.')
@click.option('-m', '--min-score', type=click.IntRange(0, 100), default=0, help='Minimum score for a subtitle '
              'to be downloaded (0 to 100).')
@click.option('-w', '--max-workers', type=click.IntRange(1, 50), default=None, help='Maximum number of threads to use.')
@click.option('-z/-Z', '--archives/--no-archives', default=True, show_default=True, help='Scan archives for videos '
              '(supported extensions: %s).' % ', '.join(ARCHIVE_EXTENSIONS))
@click.option('-v', '--verbose', count=True, help='Increase verbosity.')
@click.argument('path', type=click.Path(), required=True, nargs=-1)
@click.pass_obj
def download(obj, provider, refiner, language, age, directory, encoding, single, force, hearing_impaired, min_score,
             max_workers, archives, verbose, path):
    """Download best subtitles.

    PATH can be a directory containing videos, a video file path or a video file name. It can be used multiple times.

    If an existing subtitle is detected (external or embedded) in the correct language, the download is skipped for
    the associated video.

    """
    # process parameters
    language = set(language)

    # scan videos
    videos = []
    ignored_videos = []
    errored_paths = []
    with click.progressbar(path, label='Collecting videos', item_show_func=lambda p: p or '') as bar:
        for p in bar:
            logger.debug('Collecting path %s', p)

            # non-existing
            if not os.path.exists(p):
                try:
                    video = Video.fromname(p)
                except:
                    logger.exception('Unexpected error while collecting non-existing path %s', p)
                    errored_paths.append(p)
                    continue
                if not force:
                    video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values())
                refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
                videos.append(video)
                continue

            # directories
            if os.path.isdir(p):
                try:
                    scanned_videos = scan_videos(p, age=age, archives=archives)
                except:
                    logger.exception('Unexpected error while collecting directory path %s', p)
                    errored_paths.append(p)
                    continue
                for video in scanned_videos:
                    if not force:
                        video.subtitle_languages |= set(search_external_subtitles(video.name,
                                                                                  directory=directory).values())
                    if check_video(video, languages=language, age=age, undefined=single):
                        refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
                        videos.append(video)
                    else:
                        ignored_videos.append(video)
                continue

            # other inputs
            try:
                video = scan_video(p)
            except:
                logger.exception('Unexpected error while collecting path %s', p)
                errored_paths.append(p)
                continue
            if not force:
                video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values())
            if check_video(video, languages=language, age=age, undefined=single):
                refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
                videos.append(video)
            else:
                ignored_videos.append(video)

    # output errored paths
    if verbose > 0:
        for p in errored_paths:
            click.secho('%s errored' % p, fg='red')

    # output ignored videos
    if verbose > 1:
        for video in ignored_videos:
            click.secho('%s ignored - subtitles: %s / age: %d day%s' % (
                os.path.split(video.name)[1],
                ', '.join(str(s) for s in video.subtitle_languages) or 'none',
                video.age.days,
                's' if video.age.days > 1 else ''
            ), fg='yellow')

    # report collected videos
    click.echo('%s video%s collected / %s video%s ignored / %s error%s' % (
        click.style(str(len(videos)), bold=True, fg='green' if videos else None),
        's' if len(videos) > 1 else '',
        click.style(str(len(ignored_videos)), bold=True, fg='yellow' if ignored_videos else None),
        's' if len(ignored_videos) > 1 else '',
        click.style(str(len(errored_paths)), bold=True, fg='red' if errored_paths else None),
        's' if len(errored_paths) > 1 else '',
    ))

    # exit if no video collected
    if not videos:
        return

    # download best subtitles
    downloaded_subtitles = defaultdict(list)
    with AsyncProviderPool(max_workers=max_workers, providers=provider, provider_configs=obj['provider_configs']) as p:
        with click.progressbar(videos, label='Downloading subtitles',
                               item_show_func=lambda v: os.path.split(v.name)[1] if v is not None else '') as bar:
            for v in bar:
                scores = get_scores(v)
                subtitles = p.download_best_subtitles(p.list_subtitles(v, language - v.subtitle_languages),
                                                      v, language, min_score=scores['hash'] * min_score / 100,
                                                      hearing_impaired=hearing_impaired, only_one=single)
                downloaded_subtitles[v] = subtitles

        if p.discarded_providers:
            click.secho('Some providers have been discarded due to unexpected errors: %s' %
                        ', '.join(p.discarded_providers), fg='yellow')

    # save subtitles
    total_subtitles = 0
    for v, subtitles in downloaded_subtitles.items():
        saved_subtitles = save_subtitles(v, subtitles, single=single, directory=directory, encoding=encoding)
        total_subtitles += len(saved_subtitles)

        if verbose > 0:
            click.echo('%s subtitle%s downloaded for %s' % (click.style(str(len(saved_subtitles)), bold=True),
                                                            's' if len(saved_subtitles) > 1 else '',
                                                            os.path.split(v.name)[1]))

        if verbose > 1:
            for s in saved_subtitles:
                matches = s.get_matches(v)
                score = compute_score(s, v)

                # score color
                score_color = None
                scores = get_scores(v)
                if isinstance(v, Movie):
                    if score < scores['title']:
                        score_color = 'red'
                    elif score < scores['title'] + scores['year'] + scores['release_group']:
                        score_color = 'yellow'
                    else:
                        score_color = 'green'
                elif isinstance(v, Episode):
                    if score < scores['series'] + scores['season'] + scores['episode']:
                        score_color = 'red'
                    elif score < scores['series'] + scores['season'] + scores['episode'] + scores['release_group']:
                        score_color = 'yellow'
                    else:
                        score_color = 'green'

                # scale score from 0 to 100 taking out preferences
                scaled_score = score
                if s.hearing_impaired == hearing_impaired:
                    scaled_score -= scores['hearing_impaired']
                scaled_score *= 100 / scores['hash']

                # echo some nice colored output
                click.echo(' - [{score}] {language} subtitle from {provider_name} (match on {matches})'.format(
                    score=click.style('{:5.1f}'.format(scaled_score), fg=score_color, bold=score >= scores['hash']),
                    language=s.language.name if s.language.country is None else '%s (%s)' % (s.language.name,
                                                                                             s.language.country.name),
                    provider_name=s.provider_name,
                    matches=', '.join(sorted(matches, key=scores.get, reverse=True))
                ))

    if verbose == 0:
        click.echo('Downloaded %s subtitle%s' % (click.style(str(total_subtitles), bold=True),
                                                 's' if total_subtitles > 1 else ''))
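The group and commands above can be exercised without a shell via click's test runner; an illustrative sketch (not part of the diff):

# Illustrative only -- not part of the diff:
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(subliminal, ['download', '--help'])
print(result.output)  # renders the option list declared above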
0
libs/subliminal2.7/converters/__init__.py
Normal file
32
libs/subliminal2.7/converters/addic7ed.py
Normal file
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter, language_converters


class Addic7edConverter(LanguageReverseConverter):
    def __init__(self):
        self.name_converter = language_converters['name']
        self.from_addic7ed = {u'Català': ('cat',), 'Chinese (Simplified)': ('zho',), 'Chinese (Traditional)': ('zho',),
                              'Euskera': ('eus',), 'Galego': ('glg',), 'Greek': ('ell',), 'Malay': ('msa',),
                              'Portuguese (Brazilian)': ('por', 'BR'), 'Serbian (Cyrillic)': ('srp', None, 'Cyrl'),
                              'Serbian (Latin)': ('srp',), 'Spanish (Latin America)': ('spa',),
                              'Spanish (Spain)': ('spa',)}
        self.to_addic7ed = {('cat',): 'Català', ('zho',): 'Chinese (Simplified)', ('eus',): 'Euskera',
                            ('glg',): 'Galego', ('ell',): 'Greek', ('msa',): 'Malay',
                            ('por', 'BR'): 'Portuguese (Brazilian)', ('srp', None, 'Cyrl'): 'Serbian (Cyrillic)'}
        self.codes = self.name_converter.codes | set(self.from_addic7ed.keys())

    def convert(self, alpha3, country=None, script=None):
        if (alpha3, country, script) in self.to_addic7ed:
            return self.to_addic7ed[(alpha3, country, script)]
        if (alpha3, country) in self.to_addic7ed:
            return self.to_addic7ed[(alpha3, country)]
        if (alpha3,) in self.to_addic7ed:
            return self.to_addic7ed[(alpha3,)]

        return self.name_converter.convert(alpha3, country, script)

    def reverse(self, addic7ed):
        if addic7ed in self.from_addic7ed:
            return self.from_addic7ed[addic7ed]

        return self.name_converter.reverse(addic7ed)
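An illustrative round trip through the converter above (not part of the diff): entries hit the hand-maintained tables first and fall back to babelfish's name converter otherwise.

# Illustrative only -- not part of the diff:
converter = Addic7edConverter()
assert converter.convert('por', 'BR') == 'Portuguese (Brazilian)'
assert converter.reverse('Euskera') == ('eus',)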
27
libs/subliminal2.7/converters/legendastv.py
Normal file
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter

from ..exceptions import ConfigurationError


class LegendasTVConverter(LanguageReverseConverter):
    def __init__(self):
        self.from_legendastv = {1: ('por', 'BR'), 2: ('eng',), 3: ('spa',), 4: ('fra',), 5: ('deu',), 6: ('jpn',),
                                7: ('dan',), 8: ('nor',), 9: ('swe',), 10: ('por',), 11: ('ara',), 12: ('ces',),
                                13: ('zho',), 14: ('kor',), 15: ('bul',), 16: ('ita',), 17: ('pol',)}
        self.to_legendastv = {v: k for k, v in self.from_legendastv.items()}
        self.codes = set(self.from_legendastv.keys())

    def convert(self, alpha3, country=None, script=None):
        if (alpha3, country) in self.to_legendastv:
            return self.to_legendastv[(alpha3, country)]
        if (alpha3,) in self.to_legendastv:
            return self.to_legendastv[(alpha3,)]

        raise ConfigurationError('Unsupported language code for legendastv: %s, %s, %s' % (alpha3, country, script))

    def reverse(self, legendastv):
        if legendastv in self.from_legendastv:
            return self.from_legendastv[legendastv]

        raise ConfigurationError('Unsupported language number for legendastv: %s' % legendastv)
23
libs/subliminal2.7/converters/shooter.py
Normal file
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter

from ..exceptions import ConfigurationError


class ShooterConverter(LanguageReverseConverter):
    def __init__(self):
        self.from_shooter = {'chn': ('zho',), 'eng': ('eng',)}
        self.to_shooter = {v: k for k, v in self.from_shooter.items()}
        self.codes = set(self.from_shooter.keys())

    def convert(self, alpha3, country=None, script=None):
        if (alpha3,) in self.to_shooter:
            return self.to_shooter[(alpha3,)]

        raise ConfigurationError('Unsupported language for shooter: %s, %s, %s' % (alpha3, country, script))

    def reverse(self, shooter):
        if shooter in self.from_shooter:
            return self.from_shooter[shooter]

        raise ConfigurationError('Unsupported language code for shooter: %s' % shooter)
26
libs/subliminal2.7/converters/thesubdb.py
Normal file
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter

from ..exceptions import ConfigurationError


class TheSubDBConverter(LanguageReverseConverter):
    def __init__(self):
        self.from_thesubdb = {'en': ('eng',), 'es': ('spa',), 'fr': ('fra',), 'it': ('ita',), 'nl': ('nld',),
                              'pl': ('pol',), 'pt': ('por', 'BR'), 'ro': ('ron',), 'sv': ('swe',), 'tr': ('tur',)}
        self.to_thesubdb = {v: k for k, v in self.from_thesubdb.items()}
        self.codes = set(self.from_thesubdb.keys())

    def convert(self, alpha3, country=None, script=None):
        if (alpha3, country) in self.to_thesubdb:
            return self.to_thesubdb[(alpha3, country)]
        if (alpha3,) in self.to_thesubdb:
            return self.to_thesubdb[(alpha3,)]

        raise ConfigurationError('Unsupported language for thesubdb: %s, %s, %s' % (alpha3, country, script))

    def reverse(self, thesubdb):
        if thesubdb in self.from_thesubdb:
            return self.from_thesubdb[thesubdb]

        raise ConfigurationError('Unsupported language code for thesubdb: %s' % thesubdb)
25
libs/subliminal2.7/converters/tvsubtitles.py
Normal file
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter, language_converters


class TVsubtitlesConverter(LanguageReverseConverter):
    def __init__(self):
        self.alpha2_converter = language_converters['alpha2']
        self.from_tvsubtitles = {'br': ('por', 'BR'), 'ua': ('ukr',), 'gr': ('ell',), 'cn': ('zho',), 'jp': ('jpn',),
                                 'cz': ('ces',)}
        self.to_tvsubtitles = {v: k for k, v in self.from_tvsubtitles.items()}
        self.codes = self.alpha2_converter.codes | set(self.from_tvsubtitles.keys())

    def convert(self, alpha3, country=None, script=None):
        if (alpha3, country) in self.to_tvsubtitles:
            return self.to_tvsubtitles[(alpha3, country)]
        if (alpha3,) in self.to_tvsubtitles:
            return self.to_tvsubtitles[(alpha3,)]

        return self.alpha2_converter.convert(alpha3, country, script)

    def reverse(self, tvsubtitles):
        if tvsubtitles in self.from_tvsubtitles:
            return self.from_tvsubtitles[tvsubtitles]

        return self.alpha2_converter.reverse(tvsubtitles)
777
libs/subliminal2.7/core.py
Normal file
@@ -0,0 +1,777 @@
# -*- coding: utf-8 -*-
|
||||
from collections import defaultdict
|
||||
|
||||
import platform
|
||||
is_windows_special_path = False
|
||||
|
||||
if platform.system() == "Windows":
|
||||
try:
|
||||
__file__.decode("ascii")
|
||||
except UnicodeDecodeError:
|
||||
is_windows_special_path = True
|
||||
|
||||
if not is_windows_special_path:
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
else:
|
||||
ThreadPoolExecutor = object
|
||||
|
||||
|
||||
from datetime import datetime
|
||||
import io
|
||||
import itertools
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
import socket
|
||||
|
||||
from babelfish import Language, LanguageReverseError
|
||||
from guessit import guessit
|
||||
from six.moves.xmlrpc_client import ProtocolError
|
||||
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
|
||||
from zipfile import BadZipfile
|
||||
from ssl import SSLError
|
||||
import requests
|
||||
|
||||
from .exceptions import ServiceUnavailable
|
||||
from .extensions import provider_manager, refiner_manager
|
||||
from .score import compute_score as default_compute_score
|
||||
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
|
||||
from .utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
|
||||
from .video import VIDEO_EXTENSIONS, Episode, Movie, Video
|
||||
|
||||
#: Supported archive extensions
|
||||
ARCHIVE_EXTENSIONS = ('.rar',)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProviderPool(object):
|
||||
"""A pool of providers with the same API as a single :class:`~subliminal.providers.Provider`.
|
||||
|
||||
It has a few extra features:
|
||||
|
||||
* Lazy loads providers when needed and supports the `with` statement to :meth:`terminate`
|
||||
the providers on exit.
|
||||
* Automatically discard providers on failure.
|
||||
|
||||
:param list providers: name of providers to use, if not all.
|
||||
:param dict provider_configs: provider configuration as keyword arguments per provider name to pass when
|
||||
instanciating the :class:`~subliminal.providers.Provider`.
|
||||
|
||||
"""
|
||||
def __init__(self, providers=None, provider_configs=None):
|
||||
#: Name of providers to use
|
||||
self.providers = providers or provider_manager.names()
|
||||
|
||||
#: Provider configuration
|
||||
self.provider_configs = provider_configs or {}
|
||||
|
||||
#: Initialized providers
|
||||
self.initialized_providers = {}
|
||||
|
||||
#: Discarded providers
|
||||
self.discarded_providers = set()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.terminate()
|
||||
|
||||
def __getitem__(self, name):
|
||||
if name not in self.providers:
|
||||
raise KeyError
|
||||
if name not in self.initialized_providers:
|
||||
logger.info('Initializing provider %s', name)
|
||||
provider = provider_manager[name].plugin(**self.provider_configs.get(name, {}))
|
||||
provider.initialize()
|
||||
self.initialized_providers[name] = provider
|
||||
|
||||
return self.initialized_providers[name]
|
||||
|
||||
def __delitem__(self, name):
|
||||
if name not in self.initialized_providers:
|
||||
raise KeyError(name)
|
||||
|
||||
try:
|
||||
logger.info('Terminating provider %s', name)
|
||||
self.initialized_providers[name].terminate()
|
||||
except (requests.Timeout, socket.timeout):
|
||||
logger.error('Provider %r timed out, improperly terminated', name)
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
|
||||
except:
|
||||
logger.exception('Provider %r terminated unexpectedly', name)
|
||||
|
||||
del self.initialized_providers[name]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.initialized_providers)
|
||||
|
||||
def list_subtitles_provider(self, provider, video, languages):
|
||||
"""List subtitles with a single provider.
|
||||
|
||||
The video and languages are checked against the provider.
|
||||
|
||||
:param str provider: name of the provider.
|
||||
:param video: video to list subtitles for.
|
||||
:type video: :class:`~subliminal.video.Video`
|
||||
:param languages: languages to search for.
|
||||
:type languages: set of :class:`~babelfish.language.Language`
|
||||
:return: found subtitles.
|
||||
:rtype: list of :class:`~subliminal.subtitle.Subtitle` or None
|
||||
|
||||
"""
|
||||
# check video validity
|
||||
if not provider_manager[provider].plugin.check(video):
|
||||
logger.info('Skipping provider %r: not a valid video', provider)
|
||||
return []
|
||||
|
||||
# check supported languages
|
||||
provider_languages = provider_manager[provider].plugin.languages & languages
|
||||
if not provider_languages:
|
||||
logger.info('Skipping provider %r: no language to search for', provider)
|
||||
return []
|
||||
|
||||
# list subtitles
|
||||
logger.info('Listing subtitles with provider %r and languages %r', provider, provider_languages)
|
||||
try:
|
||||
return self[provider].list_subtitles(video, provider_languages)
|
||||
except (requests.Timeout, socket.timeout):
|
||||
logger.error('Provider %r timed out', provider)
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r', provider, e.response.status_code)
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r', provider, e.args[0])
|
||||
except:
|
||||
logger.exception('Unexpected error in provider %r', provider)
|
||||
|
||||
def list_subtitles(self, video, languages):
|
||||
"""List subtitles.
|
||||
|
||||
:param video: video to list subtitles for.
|
||||
:type video: :class:`~subliminal.video.Video`
|
||||
:param languages: languages to search for.
|
||||
:type languages: set of :class:`~babelfish.language.Language`
|
||||
:return: found subtitles.
|
||||
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
|
||||
|
||||
"""
|
||||
subtitles = []
|
||||
|
||||
for name in self.providers:
|
||||
# check discarded providers
|
||||
if name in self.discarded_providers:
|
||||
logger.debug('Skipping discarded provider %r', name)
|
||||
continue
|
||||
|
||||
# list subtitles
|
||||
provider_subtitles = self.list_subtitles_provider(name, video, languages)
|
||||
if provider_subtitles is None:
|
||||
logger.info('Discarding provider %s', name)
|
||||
self.discarded_providers.add(name)
|
||||
continue
|
||||
|
||||
# add the subtitles
|
||||
subtitles.extend(provider_subtitles)
|
||||
|
||||
return subtitles
|
||||
|
||||
def download_subtitle(self, subtitle):
|
||||
"""Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`.
|
||||
|
||||
:param subtitle: subtitle to download.
|
||||
:type subtitle: :class:`~subliminal.subtitle.Subtitle`
|
||||
:return: `True` if the subtitle has been successfully downloaded, `False` otherwise.
|
||||
:rtype: bool
|
||||
|
||||
"""
|
||||
# check discarded providers
|
||||
if subtitle.provider_name in self.discarded_providers:
|
||||
logger.warning('Provider %r is discarded', subtitle.provider_name)
|
||||
return False
|
||||
|
||||
logger.info('Downloading subtitle %r', subtitle)
|
||||
try:
|
||||
self[subtitle.provider_name].download_subtitle(subtitle)
|
||||
except (requests.Timeout, socket.timeout):
|
||||
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
|
||||
e.response.status_code)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except (BadRarFile, BadZipfile):
|
||||
logger.error('Bad archive for %r', subtitle)
|
||||
return False
|
||||
except:
|
||||
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
|
||||
# check subtitle validity
|
||||
if not subtitle.is_valid():
|
||||
logger.error('Invalid subtitle')
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
    def download_best_subtitles(self, subtitles, video, languages, min_score=0, hearing_impaired=False, only_one=False,
                                compute_score=None):
        """Download the best matching subtitles.

        :param subtitles: the subtitles to use.
        :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
        :param video: video to download subtitles for.
        :type video: :class:`~subliminal.video.Video`
        :param languages: languages to download.
        :type languages: set of :class:`~babelfish.language.Language`
        :param int min_score: minimum score for a subtitle to be downloaded.
        :param bool hearing_impaired: hearing impaired preference.
        :param bool only_one: download only one subtitle, not one per language.
        :param compute_score: function that takes `subtitle` and `video` as positional arguments,
            `hearing_impaired` as keyword argument and returns the score.
        :return: downloaded subtitles.
        :rtype: list of :class:`~subliminal.subtitle.Subtitle`

        """
        compute_score = compute_score or default_compute_score

        # sort subtitles by score
        scored_subtitles = sorted([(s, compute_score(s, video, hearing_impaired=hearing_impaired))
                                  for s in subtitles], key=operator.itemgetter(1), reverse=True)

        # download best subtitles, falling back on the next on error
        downloaded_subtitles = []
        for subtitle, score in scored_subtitles:
            # check score
            if score < min_score:
                logger.info('Score %d is below min_score (%d)', score, min_score)
                break

            # check downloaded languages
            if subtitle.language in set(s.language for s in downloaded_subtitles):
                logger.debug('Skipping subtitle: %r already downloaded', subtitle.language)
                continue

            # download
            if self.download_subtitle(subtitle):
                downloaded_subtitles.append(subtitle)

            # stop when all languages are downloaded
            if set(s.language for s in downloaded_subtitles) == languages:
                logger.debug('All languages downloaded')
                break

            # stop if only one subtitle is requested
            if only_one:
                logger.debug('Only one subtitle downloaded')
                break

        return downloaded_subtitles
    def terminate(self):
        """Terminate all the :attr:`initialized_providers`."""
        logger.debug('Terminating initialized providers')
        for name in list(self.initialized_providers):
            del self[name]

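To make the pool API above concrete, here is a minimal usage sketch of ProviderPool with a custom scoring function, assuming the upstream `subliminal` package layout. The scorer, the cache backend and the file path are illustrative assumptions, not part of this diff:

from babelfish import Language
from subliminal import ProviderPool, region, scan_video

region.configure('dogpile.cache.memory')  # assumed cache backend; any dogpile.cache backend works

def match_count_score(subtitle, video, hearing_impaired=None):
    # hypothetical scorer: rank subtitles by the number of matched properties only
    return len(subtitle.get_matches(video))

video = scan_video('/data/Show.S01E01.720p.mkv')  # illustrative path
with ProviderPool() as pool:
    subtitles = pool.list_subtitles(video, {Language('eng')})
    pool.download_best_subtitles(subtitles, video, {Language('eng')}, compute_score=match_count_score)
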
class AsyncProviderPool(ProviderPool):
    """Subclass of :class:`ProviderPool` with asynchronous support for :meth:`~ProviderPool.list_subtitles`.

    :param int max_workers: maximum number of threads to use. If `None`, :attr:`max_workers` will be set
        to the number of :attr:`~ProviderPool.providers`.

    """
    def __init__(self, max_workers=None, *args, **kwargs):
        super(AsyncProviderPool, self).__init__(*args, **kwargs)

        #: Maximum number of threads to use
        self.max_workers = max_workers or len(self.providers)

    def list_subtitles_provider(self, provider, video, languages):
        return provider, super(AsyncProviderPool, self).list_subtitles_provider(provider, video, languages)

    def list_subtitles(self, video, languages):
        subtitles = []

        with ThreadPoolExecutor(self.max_workers) as executor:
            for provider, provider_subtitles in executor.map(self.list_subtitles_provider, self.providers,
                                                             itertools.repeat(video, len(self.providers)),
                                                             itertools.repeat(languages, len(self.providers))):
                # discard provider that failed
                if provider_subtitles is None:
                    logger.info('Discarding provider %s', provider)
                    self.discarded_providers.add(provider)
                    continue

                # add subtitles
                subtitles.extend(provider_subtitles)

        return subtitles

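A short sketch of the same flow with AsyncProviderPool, which fans the provider queries out to one thread each; the worker count, cache backend and path below are arbitrary assumptions:

from babelfish import Language
from subliminal import AsyncProviderPool, region, scan_video

region.configure('dogpile.cache.memory')  # assumed cache backend

video = scan_video('/data/Show.S01E02.720p.mkv')  # illustrative path
with AsyncProviderPool(max_workers=4) as pool:
    # each provider is queried in its own thread; failed providers are discarded
    subtitles = pool.list_subtitles(video, {Language('eng')})
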
def check_video(video, languages=None, age=None, undefined=False):
    """Perform some checks on the `video`.

    All the checks are optional. Return `False` if any of these checks fails:

    * `languages` already exist in `video`'s :attr:`~subliminal.video.Video.subtitle_languages`.
    * `video` is older than `age`.
    * `video` has an `undefined` language in :attr:`~subliminal.video.Video.subtitle_languages`.

    :param video: video to check.
    :type video: :class:`~subliminal.video.Video`
    :param languages: desired languages.
    :type languages: set of :class:`~babelfish.language.Language`
    :param datetime.timedelta age: maximum age of the video.
    :param bool undefined: fail on existing undefined language.
    :return: `True` if the video passes the checks, `False` otherwise.
    :rtype: bool

    """
    # language test
    if languages and not (languages - video.subtitle_languages):
        logger.debug('All languages %r exist', languages)
        return False

    # age test
    if age and video.age > age:
        logger.debug('Video is older than %r', age)
        return False

    # undefined test
    if undefined and Language('und') in video.subtitle_languages:
        logger.debug('Undefined language found')
        return False

    return True

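A quick sketch of using check_video as a pre-filter; the path and age threshold are illustrative assumptions:

from datetime import timedelta
from babelfish import Language
from subliminal import check_video, scan_video

video = scan_video('/data/Movie.2016.1080p.mkv')  # illustrative path
# only process videos that lack an English subtitle and are at most one week old
if check_video(video, languages={Language('eng')}, age=timedelta(weeks=1)):
    print('video needs subtitles')
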
def search_external_subtitles(path, directory=None):
    """Search for external subtitles from a video `path` and their associated language.

    Unless `directory` is provided, search will be made in the same directory as the video file.

    :param str path: path to the video.
    :param str directory: directory to search for subtitles.
    :return: found subtitles with their languages.
    :rtype: dict

    """
    # split path
    dirpath, filename = os.path.split(path)
    dirpath = dirpath or '.'
    fileroot, fileext = os.path.splitext(filename)

    # search for subtitles
    subtitles = {}
    for p in os.listdir(directory or dirpath):
        # keep only valid subtitle filenames
        if not p.startswith(fileroot) or not p.endswith(SUBTITLE_EXTENSIONS):
            continue

        # extract the potential language code
        language = Language('und')
        language_code = p[len(fileroot):-len(os.path.splitext(p)[1])].replace(fileext, '').replace('_', '-')[1:]
        if language_code:
            try:
                language = Language.fromietf(language_code)
            except (ValueError, LanguageReverseError):
                logger.error('Cannot parse language code %r', language_code)

        subtitles[p] = language

    logger.debug('Found subtitles %r', subtitles)

    return subtitles

def scan_video(path):
    """Scan a video from a `path`.

    :param str path: existing path to the video.
    :return: the scanned video.
    :rtype: :class:`~subliminal.video.Video`

    """
    # check for non-existing path
    if not os.path.exists(path):
        raise ValueError('Path does not exist')

    # check video extension
    if not path.endswith(VIDEO_EXTENSIONS):
        raise ValueError('%r is not a valid video extension' % os.path.splitext(path)[1])

    dirpath, filename = os.path.split(path)
    logger.info('Scanning video %r in %r', filename, dirpath)

    # guess
    video = Video.fromguess(path, guessit(path))

    # size and hashes
    video.size = os.path.getsize(path)
    if video.size > 10485760:
        logger.debug('Size is %d', video.size)
        video.hashes['opensubtitles'] = hash_opensubtitles(path)
        video.hashes['shooter'] = hash_shooter(path)
        video.hashes['thesubdb'] = hash_thesubdb(path)
        video.hashes['napiprojekt'] = hash_napiprojekt(path)
        logger.debug('Computed hashes %r', video.hashes)
    else:
        logger.warning('Size is lower than 10MB: hashes not computed')

    return video

def scan_archive(path):
    """Scan an archive from a `path`.

    :param str path: existing path to the archive.
    :return: the scanned video.
    :rtype: :class:`~subliminal.video.Video`

    """
    # check for non-existing path
    if not os.path.exists(path):
        raise ValueError('Path does not exist')

    # check video extension
    if not path.endswith(ARCHIVE_EXTENSIONS):
        raise ValueError('%r is not a valid archive extension' % os.path.splitext(path)[1])

    dirpath, filename = os.path.split(path)
    logger.info('Scanning archive %r in %r', filename, dirpath)

    # rar extension
    if filename.endswith('.rar'):
        rar = RarFile(path)

        # filter on video extensions
        rar_filenames = [f for f in rar.namelist() if f.endswith(VIDEO_EXTENSIONS)]

        # no video found
        if not rar_filenames:
            raise ValueError('No video in archive')

        # more than one video found
        if len(rar_filenames) > 1:
            raise ValueError('More than one video in archive')

        # guess
        rar_filename = rar_filenames[0]
        rar_filepath = os.path.join(dirpath, rar_filename)
        video = Video.fromguess(rar_filepath, guessit(rar_filepath))

        # size
        video.size = rar.getinfo(rar_filename).file_size
    else:
        raise ValueError('Unsupported extension %r' % os.path.splitext(path)[1])

    return video

def scan_videos(path, age=None, archives=True):
    """Scan `path` for videos and their subtitles.

    See :func:`refine` to find additional information for the video.

    :param str path: existing directory path to scan.
    :param datetime.timedelta age: maximum age of the video or archive.
    :param bool archives: scan videos in archives.
    :return: the scanned videos.
    :rtype: list of :class:`~subliminal.video.Video`

    """
    # check for non-existing path
    if not os.path.exists(path):
        raise ValueError('Path does not exist')

    # check for non-directory path
    if not os.path.isdir(path):
        raise ValueError('Path is not a directory')

    # walk the path
    videos = []
    for dirpath, dirnames, filenames in os.walk(path):
        logger.debug('Walking directory %r', dirpath)

        # remove badly encoded and hidden dirnames
        for dirname in list(dirnames):
            if dirname.startswith('.'):
                logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath)
                dirnames.remove(dirname)

        # scan for videos
        for filename in filenames:
            # filter on videos and archives
            if not (filename.endswith(VIDEO_EXTENSIONS) or archives and filename.endswith(ARCHIVE_EXTENSIONS)):
                continue

            # skip hidden files
            if filename.startswith('.'):
                logger.debug('Skipping hidden filename %r in %r', filename, dirpath)
                continue

            # reconstruct the file path
            filepath = os.path.join(dirpath, filename)

            # skip links
            if os.path.islink(filepath):
                logger.debug('Skipping link %r in %r', filename, dirpath)
                continue

            # skip old files
            try:
                file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath))
            except ValueError:
                logger.warning('Could not get age of file %r in %r', filename, dirpath)
                continue
            else:
                if age and datetime.utcnow() - file_age > age:
                    logger.debug('Skipping old file %r in %r', filename, dirpath)
                    continue

            # scan
            if filename.endswith(VIDEO_EXTENSIONS):  # video
                try:
                    video = scan_video(filepath)
                except ValueError:  # pragma: no cover
                    logger.exception('Error scanning video')
                    continue
            elif archives and filename.endswith(ARCHIVE_EXTENSIONS):  # archive
                try:
                    video = scan_archive(filepath)
                except (NotRarFile, RarCannotExec, ValueError):  # pragma: no cover
                    logger.exception('Error scanning archive')
                    continue
            else:  # pragma: no cover
                raise ValueError('Unsupported file %r' % filename)

            videos.append(video)

    return videos

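For reference, a minimal scan_videos call; the library path and age cutoff are assumptions:

from datetime import timedelta
from subliminal import scan_videos

# scan a library, skipping files older than two weeks and videos inside archives
videos = scan_videos('/data/tv', age=timedelta(weeks=2), archives=False)
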
def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
    """Refine a video using :ref:`refiners`.

    .. note::

        Exceptions raised in refiners are silently passed and logged.

    :param video: the video to refine.
    :type video: :class:`~subliminal.video.Video`
    :param tuple episode_refiners: refiners to use for episodes.
    :param tuple movie_refiners: refiners to use for movies.
    :param \*\*kwargs: additional parameters for the :func:`~subliminal.refiners.refine` functions.

    """
    refiners = ()
    if isinstance(video, Episode):
        refiners = episode_refiners or ('metadata', 'tvdb', 'omdb')
    elif isinstance(video, Movie):
        refiners = movie_refiners or ('metadata', 'omdb')
    for refiner in refiners:
        logger.info('Refining video with %s', refiner)
        try:
            refiner_manager[refiner].plugin(video, **kwargs)
        except:
            logger.error('Failed to refine video %r', video.name)
            logger.debug('Refiner exception:', exc_info=True)

def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
    """List subtitles.

    The `videos` must pass the `languages` check of :func:`check_video`.

    :param videos: videos to list subtitles for.
    :type videos: set of :class:`~subliminal.video.Video`
    :param languages: languages to search for.
    :type languages: set of :class:`~babelfish.language.Language`
    :param pool_class: class to use as provider pool.
    :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
    :param \*\*kwargs: additional parameters for the provided `pool_class` constructor.
    :return: found subtitles per video.
    :rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle`

    """
    listed_subtitles = defaultdict(list)

    # check videos
    checked_videos = []
    for video in videos:
        if not check_video(video, languages=languages):
            logger.info('Skipping video %r', video)
            continue
        checked_videos.append(video)

    # return immediately if no video passed the checks
    if not checked_videos:
        return listed_subtitles

    # list subtitles
    with pool_class(**kwargs) as pool:
        for video in checked_videos:
            logger.info('Listing subtitles for %r', video)
            subtitles = pool.list_subtitles(video, languages - video.subtitle_languages)
            listed_subtitles[video].extend(subtitles)
            logger.info('Found %d subtitle(s)', len(subtitles))

    return listed_subtitles

def download_subtitles(subtitles, pool_class=ProviderPool, **kwargs):
    """Download :attr:`~subliminal.subtitle.Subtitle.content` of `subtitles`.

    :param subtitles: subtitles to download.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param pool_class: class to use as provider pool.
    :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
    :param \*\*kwargs: additional parameters for the provided `pool_class` constructor.

    """
    with pool_class(**kwargs) as pool:
        for subtitle in subtitles:
            logger.info('Downloading subtitle %r', subtitle)
            pool.download_subtitle(subtitle)

def download_best_subtitles(videos, languages, min_score=0, hearing_impaired=False, only_one=False, compute_score=None,
                            pool_class=ProviderPool, **kwargs):
    """List and download the best matching subtitles.

    The `videos` must pass the `languages` and `undefined` (`only_one`) checks of :func:`check_video`.

    :param videos: videos to download subtitles for.
    :type videos: set of :class:`~subliminal.video.Video`
    :param languages: languages to download.
    :type languages: set of :class:`~babelfish.language.Language`
    :param int min_score: minimum score for a subtitle to be downloaded.
    :param bool hearing_impaired: hearing impaired preference.
    :param bool only_one: download only one subtitle, not one per language.
    :param compute_score: function that takes `subtitle` and `video` as positional arguments,
        `hearing_impaired` as keyword argument and returns the score.
    :param pool_class: class to use as provider pool.
    :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
    :param \*\*kwargs: additional parameters for the provided `pool_class` constructor.
    :return: downloaded subtitles per video.
    :rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle`

    """
    downloaded_subtitles = defaultdict(list)

    # check videos
    checked_videos = []
    for video in videos:
        if not check_video(video, languages=languages, undefined=only_one):
            logger.info('Skipping video %r', video)
            continue
        checked_videos.append(video)

    # return immediately if no video passed the checks
    if not checked_videos:
        return downloaded_subtitles

    # download best subtitles
    with pool_class(**kwargs) as pool:
        for video in checked_videos:
            logger.info('Downloading best subtitles for %r', video)
            subtitles = pool.download_best_subtitles(pool.list_subtitles(video, languages - video.subtitle_languages),
                                                     video, languages, min_score=min_score,
                                                     hearing_impaired=hearing_impaired, only_one=only_one,
                                                     compute_score=compute_score)
            logger.info('Downloaded %d subtitle(s)', len(subtitles))
            downloaded_subtitles[video].extend(subtitles)

    return downloaded_subtitles

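Putting the helpers above together, a hedged end-to-end sketch: scan a directory, fetch the best English and French subtitles, then persist them next to each video. The paths, the min_score value and the memory cache backend are illustrative; pool_class=AsyncProviderPool could be passed to parallelize the listing step.

from babelfish import Language
from subliminal import download_best_subtitles, region, save_subtitles, scan_videos

region.configure('dogpile.cache.memory')  # assumed backend; configure once per process

videos = scan_videos('/data/tv', archives=False)  # illustrative path
subtitles = download_best_subtitles(videos, {Language('eng'), Language('fra')}, min_score=100)
for video in videos:
    save_subtitles(video, subtitles[video])
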
def save_subtitles(video, subtitles, single=False, directory=None, encoding=None):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles
    with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param video: video of the subtitles.
    :type video: :class:`~subliminal.video.Video`
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :param str encoding: encoding in which to save the subtitles, default is to keep original encoding.
    :return: the saved subtitles.
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    """
    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue

        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved', subtitle)
            continue

        # create subtitle path
        subtitle_path = get_subtitle_path(video.name, None if single else subtitle.language)
        if directory is not None:
            subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1])

        # save content as is or in the specified encoding
        logger.info('Saving %r to %r', subtitle, subtitle_path)
        if encoding is None:
            with io.open(subtitle_path, 'wb') as f:
                f.write(subtitle.content)
        else:
            with io.open(subtitle_path, 'w', encoding=encoding) as f:
                f.write(subtitle.text)
        saved_subtitles.append(subtitle)

        # check single
        if single:
            break

    return saved_subtitles
29
libs/subliminal2.7/exceptions.py
Normal file
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
class Error(Exception):
    """Base class for exceptions in subliminal."""
    pass


class ProviderError(Error):
    """Exception raised by providers."""
    pass


class ConfigurationError(ProviderError):
    """Exception raised by providers when badly configured."""
    pass


class AuthenticationError(ProviderError):
    """Exception raised by providers when authentication failed."""
    pass


class ServiceUnavailable(ProviderError):
    """Exception raised when status is '503 Service Unavailable'."""
    pass


class DownloadLimitExceeded(ProviderError):
    """Exception raised by providers when download limit is exceeded."""
    pass
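Since ConfigurationError, AuthenticationError, ServiceUnavailable and DownloadLimitExceeded all derive from ProviderError, callers can handle specific failures first and fall back to the base class. A minimal sketch, where `provider` and `subtitle` are assumed to come from a prior listing:

from subliminal.exceptions import AuthenticationError, DownloadLimitExceeded, ProviderError

try:
    provider.download_subtitle(subtitle)  # assumed objects from a prior listing
except DownloadLimitExceeded:
    pass  # back off until the provider's quota resets
except AuthenticationError:
    pass  # bad credentials: fix configuration rather than retrying
except ProviderError:
    pass  # any other provider-side failure
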
105
libs/subliminal2.7/extensions.py
Normal file
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
from pkg_resources import EntryPoint

from stevedore import ExtensionManager


class RegistrableExtensionManager(ExtensionManager):
    """:class:`~stevedore.extensions.ExtensionManager` with support for registration.

    It allows loading of internal extensions without setup and registering/unregistering additional extensions.

    Loading is done in this order:

    * Entry point extensions
    * Internal extensions
    * Registered extensions

    :param str namespace: namespace argument for :class:`~stevedore.extensions.ExtensionManager`.
    :param list internal_extensions: internal extensions to use with entry point syntax.
    :param \*\*kwargs: additional parameters for the :class:`~stevedore.extensions.ExtensionManager` constructor.

    """
    def __init__(self, namespace, internal_extensions, **kwargs):
        #: Registered extensions with entry point syntax
        self.registered_extensions = []

        #: Internal extensions with entry point syntax
        self.internal_extensions = internal_extensions

        super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)

    def list_entry_points(self):
        # copy of default extensions
        eps = list(super(RegistrableExtensionManager, self).list_entry_points())

        # internal extensions
        for iep in self.internal_extensions:
            ep = EntryPoint.parse(iep)
            if ep.name not in [e.name for e in eps]:
                eps.append(ep)

        # registered extensions
        for rep in self.registered_extensions:
            ep = EntryPoint.parse(rep)
            if ep.name not in [e.name for e in eps]:
                eps.append(ep)

        return eps

    def register(self, entry_point):
        """Register an extension.

        :param str entry_point: extension to register (entry point syntax).
        :raise: ValueError if already registered.

        """
        if entry_point in self.registered_extensions:
            raise ValueError('Extension already registered')

        ep = EntryPoint.parse(entry_point)
        if ep.name in self.names():
            raise ValueError('An extension with the same name already exists')

        ext = self._load_one_plugin(ep, False, (), {}, False)
        self.extensions.append(ext)
        if self._extensions_by_name is not None:
            self._extensions_by_name[ext.name] = ext
        self.registered_extensions.insert(0, entry_point)

    def unregister(self, entry_point):
        """Unregister an extension.

        :param str entry_point: extension to unregister (entry point syntax).
        :raise: ValueError if not registered.

        """
        if entry_point not in self.registered_extensions:
            raise ValueError('Extension not registered')

        ep = EntryPoint.parse(entry_point)
        self.registered_extensions.remove(entry_point)
        if self._extensions_by_name is not None:
            del self._extensions_by_name[ep.name]
        for i, ext in enumerate(self.extensions):
            if ext.name == ep.name:
                del self.extensions[i]
                break


#: Provider manager
provider_manager = RegistrableExtensionManager('subliminal.providers', [
    'addic7ed = subliminal.providers.addic7ed:Addic7edProvider',
    'legendastv = subliminal.providers.legendastv:LegendasTVProvider',
    'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
    'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
    'shooter = subliminal.providers.shooter:ShooterProvider',
    'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
    'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
])

#: Refiner manager
refiner_manager = RegistrableExtensionManager('subliminal.refiners', [
    'metadata = subliminal.refiners.metadata:refine',
    'omdb = subliminal.refiners.omdb:refine',
    'tvdb = subliminal.refiners.tvdb:refine'
])
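A sketch of registering and unregistering a third-party provider through the manager above; the entry point module and class are hypothetical:

from subliminal.extensions import provider_manager

# entry point syntax is '<name> = <module>:<class>'
entry_point = 'myprovider = myapp.providers.myprovider:MyProvider'  # hypothetical module
provider_manager.register(entry_point)
assert 'myprovider' in provider_manager.names()
provider_manager.unregister(entry_point)
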
164
libs/subliminal2.7/providers/__init__.py
Normal file
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
import logging

from bs4 import BeautifulSoup, FeatureNotFound
from six.moves.xmlrpc_client import SafeTransport

from ..video import Episode, Movie

logger = logging.getLogger(__name__)


class TimeoutSafeTransport(SafeTransport):
    """Timeout support for ``xmlrpc.client.SafeTransport``."""
    def __init__(self, timeout, *args, **kwargs):
        SafeTransport.__init__(self, *args, **kwargs)
        self.timeout = timeout

    def make_connection(self, host):
        c = SafeTransport.make_connection(self, host)
        c.timeout = self.timeout

        return c


class ParserBeautifulSoup(BeautifulSoup):
    """A ``bs4.BeautifulSoup`` that picks the first parser available in `parsers`.

    :param markup: markup for the ``bs4.BeautifulSoup``.
    :param list parsers: parser names, in order of preference.

    """
    def __init__(self, markup, parsers, **kwargs):
        # reject features
        if set(parsers).intersection({'fast', 'permissive', 'strict', 'xml', 'html', 'html5'}):
            raise ValueError('Features not allowed, only parser names')

        # reject some kwargs
        if 'features' in kwargs:
            raise ValueError('Cannot use features kwarg')
        if 'builder' in kwargs:
            raise ValueError('Cannot use builder kwarg')

        # pick the first parser available
        for parser in parsers:
            try:
                super(ParserBeautifulSoup, self).__init__(markup, parser, **kwargs)
                return
            except FeatureNotFound:
                pass

        raise FeatureNotFound


class Provider(object):
    """Base class for providers.

    If any configuration is possible for the provider, like credentials, it must take place during instantiation.

    :raise: :class:`~subliminal.exceptions.ConfigurationError` if there is a configuration error

    """
    #: Supported set of :class:`~babelfish.language.Language`
    languages = set()

    #: Supported video types
    video_types = (Episode, Movie)

    #: Required hash, if any
    required_hash = None

    #: Subtitle class to use
    subtitle_class = None

    def __enter__(self):
        self.initialize()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.terminate()

    def initialize(self):
        """Initialize the provider.

        Must be called when starting to work with the provider. This is the place for network initialization
        or login operations.

        .. note::
            This is called automatically when entering the `with` statement

        """
        raise NotImplementedError

    def terminate(self):
        """Terminate the provider.

        Must be called when done with the provider. This is the place for network shutdown or logout operations.

        .. note::
            This is called automatically when exiting the `with` statement

        """
        raise NotImplementedError

    @classmethod
    def check(cls, video):
        """Check if the `video` can be processed.

        The `video` is considered invalid if not an instance of :attr:`video_types` or if the :attr:`required_hash` is
        not present in :attr:`~subliminal.video.Video.hashes` attribute of the `video`.

        :param video: the video to check.
        :type video: :class:`~subliminal.video.Video`
        :return: `True` if the `video` is valid, `False` otherwise.
        :rtype: bool

        """
        if not isinstance(video, cls.video_types):
            return False
        if cls.required_hash is not None and cls.required_hash not in video.hashes:
            return False

        return True

    def query(self, *args, **kwargs):
        """Query the provider for subtitles.

        Arguments should match as much as possible the actual parameters for querying the provider

        :return: found subtitles.
        :rtype: list of :class:`~subliminal.subtitle.Subtitle`
        :raise: :class:`~subliminal.exceptions.ProviderError`

        """
        raise NotImplementedError

    def list_subtitles(self, video, languages):
        """List subtitles for the `video` with the given `languages`.

        This will call the :meth:`query` method internally. The parameters passed to the :meth:`query` method may
        vary depending on the amount of information available in the `video`.

        :param video: video to list subtitles for.
        :type video: :class:`~subliminal.video.Video`
        :param languages: languages to search for.
        :type languages: set of :class:`~babelfish.language.Language`
        :return: found subtitles.
        :rtype: list of :class:`~subliminal.subtitle.Subtitle`
        :raise: :class:`~subliminal.exceptions.ProviderError`

        """
        raise NotImplementedError

    def download_subtitle(self, subtitle):
        """Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`.

        :param subtitle: subtitle to download.
        :type subtitle: :class:`~subliminal.subtitle.Subtitle`
        :raise: :class:`~subliminal.exceptions.ProviderError`

        """
        raise NotImplementedError

    def __repr__(self):
        return '<%s [%r]>' % (self.__class__.__name__, self.video_types)
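To illustrate the Provider contract, a minimal do-nothing subclass (hypothetical, for shape only): subclasses fill in languages, video_types and the four methods, while initialize/terminate are driven by the `with` statement.

from babelfish import Language
from subliminal.providers import Provider
from subliminal.video import Movie

class NullProvider(Provider):
    """Hypothetical provider that never finds anything."""
    languages = {Language('eng')}
    video_types = (Movie,)

    def initialize(self):
        pass  # e.g. open a requests.Session and log in

    def terminate(self):
        pass  # e.g. log out and close the session

    def list_subtitles(self, video, languages):
        return []  # a real provider would call self.query() here

    def download_subtitle(self, subtitle):
        raise NotImplementedError  # nothing to download
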
321
libs/subliminal2.7/providers/addic7ed.py
Normal file
@@ -0,0 +1,321 @@
# -*- coding: utf-8 -*-
import logging
import re

from babelfish import Language, language_converters
from guessit import guessit
from requests import Session

from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group
from ..video import Episode

logger = logging.getLogger(__name__)

language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')

# Series cell matching regex
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)

#: Series header parsing regex
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$')


class Addic7edSubtitle(Subtitle):
    """Addic7ed Subtitle."""
    provider_name = 'addic7ed'

    def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
                 download_link):
        super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
        self.series = series
        self.season = season
        self.episode = episode
        self.title = title
        self.year = year
        self.version = version
        self.download_link = download_link

    @property
    def id(self):
        return self.download_link

    def get_matches(self, video):
        matches = set()

        # series name
        if video.series and sanitize(self.series) in (
                sanitize(name) for name in [video.series] + video.alternative_series):
            matches.add('series')
        # season
        if video.season and self.season == video.season:
            matches.add('season')
        # episode
        if video.episode and self.episode == video.episode:
            matches.add('episode')
        # title of the episode
        if video.title and sanitize(self.title) == sanitize(video.title):
            matches.add('title')
        # year
        if video.original_series and self.year is None or video.year and video.year == self.year:
            matches.add('year')
        # release_group
        if (video.release_group and self.version and
                any(r in sanitize_release_group(self.version)
                    for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
            matches.add('release_group')
        # resolution
        if video.resolution and self.version and video.resolution in self.version.lower():
            matches.add('resolution')
        # format
        if video.format and self.version and video.format.lower() in self.version.lower():
            matches.add('format')
        # other properties
        matches |= guess_matches(video, guessit(self.version), partial=True)

        return matches


class Addic7edProvider(Provider):
    """Addic7ed Provider."""
    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg',
        'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus',
        'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
    ]}
    video_types = (Episode,)
    server_url = 'http://www.addic7ed.com/'
    subtitle_class = Addic7edSubtitle

    def __init__(self, username=None, password=None):
        if any((username, password)) and not all((username, password)):
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

        # login
        if self.username and self.password:
            logger.info('Logging in')
            data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
            r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)

            if r.status_code != 302:
                raise AuthenticationError(self.username)

            logger.debug('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url + 'logout.php', timeout=10)
            r.raise_for_status()
            logger.debug('Logged out')
            self.logged_in = False

        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows.php` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict

        """
        # get the show page
        logger.info('Getting show ids')
        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()

        # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
        # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
        # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
        show_cells = re.findall(show_cells_re, r.content)
        if show_cells:
            soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
        else:
            # If RegEx fails, fall back to original r.content and use 'html.parser'
            soup = ParserBeautifulSoup(r.content, ['html.parser'])

        # populate the show ids
        show_ids = {}
        for show in soup.select('td.version > h3 > a[href^="/show/"]'):
            show_ids[sanitize(show.text)] = int(show['href'][6:])
        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if found.
        :rtype: int

        """
        # addic7ed doesn't support search with quotes
        series = series.replace('\'', ' ')

        # build the params
        series_year = '%s %d' % (series, year) if year is not None else series
        params = {'search': series_year, 'Submit': 'Search'}

        # make the search
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get the suggestion
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None
        if not sanitize(suggestion[0].i.text.replace('\'', ' ')) == sanitize(series_year):
            logger.warning('Show id not found: suggestion does not match')
            return None
        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id

    def get_show_id(self, series, year=None, country_code=None):
        """Get the best matching show id for `series`, `year` and `country_code`.

        First search in the result of :meth:`_get_show_ids` and fallback on a search with :meth:`_search_show_id`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :param country_code: country code of the series, if any.
        :type country_code: str
        :return: the show id, if found.
        :rtype: int

        """
        series_sanitized = sanitize(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower()))

        # attempt with year
        if not show_id and year:
            logger.debug('Getting show id with year')
            show_id = show_ids.get('%s %d' % (series_sanitized, year))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        if not show_id:
            logger.warning('Series %s not found in show ids', series)
            show_id = self._search_show_id(series)

        return show_id

    def query(self, show_id, series, season, year=None, country=None):
        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows
        match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
        series = match.group('series')
        year = int(match.group('year')) if match.group('year') else None
        subtitles = []
        for row in soup.select('tr.epeven'):
            cells = row('td')

            # ignore incomplete subtitles
            status = cells[5].text
            if status != 'Completed':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # read the item
            language = Language.fromaddic7ed(cells[3].text)
            hearing_impaired = bool(cells[6].text)
            page_link = self.server_url + cells[2].a['href'][1:]
            season = int(cells[0].text)
            episode = int(cells[1].text)
            title = cells[2].text
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year,
                                           version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        # lookup show_id
        titles = [video.series] + video.alternative_series
        show_id = None
        for title in titles:
            show_id = self.get_show_id(title, video.year)
            if show_id is not None:
                break

        # query for subtitles with the show_id
        if show_id is not None:
            subtitles = [s for s in self.query(show_id, title, video.season, video.year)
                         if s.language in languages and s.episode == video.episode]
            if subtitles:
                return subtitles
        else:
            logger.error('No show id found for %r (%r)', video.series, {'year': video.year})

        return []

    def download_subtitle(self, subtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug('Unable to download subtitle. No data returned from provider')
            return

        # detect download limit exceeded
        if r.headers['Content-Type'] == 'text/html':
            raise DownloadLimitExceeded

        subtitle.content = fix_line_ending(r.content)
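A usage sketch for the provider above, assuming `video` is an Episode obtained from scan_video; the credentials are placeholders and optional (passing only one of them raises ConfigurationError):

from babelfish import Language
from subliminal.providers.addic7ed import Addic7edProvider

with Addic7edProvider(username='user', password='secret') as provider:  # placeholder credentials
    subtitles = provider.list_subtitles(video, {Language('eng')})  # `video`: an assumed Episode
    if subtitles:
        provider.download_subtitle(subtitles[0])
        print(len(subtitles[0].content))
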
522
libs/subliminal2.7/providers/legendastv.py
Normal file
@@ -0,0 +1,522 @@
# -*- coding: utf-8 -*-
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from babelfish import Language, language_converters
|
||||
from datetime import datetime, timedelta
|
||||
from dogpile.cache.api import NO_VALUE
|
||||
from guessit import guessit
|
||||
import pytz
|
||||
import rarfile
|
||||
from rarfile import RarFile, is_rarfile
|
||||
from requests import Session
|
||||
from zipfile import ZipFile, is_zipfile
|
||||
|
||||
from . import ParserBeautifulSoup, Provider
|
||||
from .. import __short_version__
|
||||
from ..cache import SHOW_EXPIRATION_TIME, region
|
||||
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable
|
||||
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
|
||||
from ..video import Episode, Movie
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
language_converters.register('legendastv = subliminal.converters.legendastv:LegendasTVConverter')
|
||||
|
||||
# Configure :mod:`rarfile` to use the same path separator as :mod:`zipfile`
|
||||
rarfile.PATH_SEP = '/'
|
||||
|
||||
#: Conversion map for types
|
||||
type_map = {'M': 'movie', 'S': 'episode', 'C': 'episode'}
|
||||
|
||||
#: BR title season parsing regex
|
||||
season_re = re.compile(r' - (?P<season>\d+)(\xaa|a|st|nd|rd|th) (temporada|season)', re.IGNORECASE)
|
||||
|
||||
#: Downloads parsing regex
|
||||
downloads_re = re.compile(r'(?P<downloads>\d+) downloads')
|
||||
|
||||
#: Rating parsing regex
|
||||
rating_re = re.compile(r'nota (?P<rating>\d+)')
|
||||
|
||||
#: Timestamp parsing regex
|
||||
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
|
||||
|
||||
#: Title with year/country regex
|
||||
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
|
||||
|
||||
#: Cache key for releases
|
||||
releases_key = __name__ + ':releases|{archive_id}|{archive_name}'
|
||||
|
||||
|
||||
class LegendasTVArchive(object):
|
||||
"""LegendasTV Archive.
|
||||
|
||||
:param str id: identifier.
|
||||
:param str name: name.
|
||||
:param bool pack: contains subtitles for multiple episodes.
|
||||
:param bool pack: featured.
|
||||
:param str link: link.
|
||||
:param int downloads: download count.
|
||||
:param int rating: rating (0-10).
|
||||
:param timestamp: timestamp.
|
||||
:type timestamp: datetime.datetime
|
||||
"""
|
||||
|
||||
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
|
||||
#: Identifier
|
||||
self.id = id
|
||||
|
||||
#: Name
|
||||
self.name = name
|
||||
|
||||
#: Pack
|
||||
self.pack = pack
|
||||
|
||||
#: Featured
|
||||
self.featured = featured
|
||||
|
||||
#: Link
|
||||
self.link = link
|
||||
|
||||
#: Download count
|
||||
self.downloads = downloads
|
||||
|
||||
#: Rating (0-10)
|
||||
self.rating = rating
|
||||
|
||||
#: Timestamp
|
||||
self.timestamp = timestamp
|
||||
|
||||
#: Compressed content as :class:`rarfile.RarFile` or :class:`zipfile.ZipFile`
|
||||
self.content = None
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s [%s] %r>' % (self.__class__.__name__, self.id, self.name)
|
||||
|
||||
|
||||
class LegendasTVSubtitle(Subtitle):
|
||||
"""LegendasTV Subtitle."""
|
||||
|
||||
provider_name = 'legendastv'
|
||||
|
||||
def __init__(self, language, type, title, year, imdb_id, season, archive, name):
|
||||
super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link)
|
||||
self.type = type
|
||||
self.title = title
|
||||
self.year = year
|
||||
self.imdb_id = imdb_id
|
||||
self.season = season
|
||||
self.archive = archive
|
||||
self.name = name
|
||||
|
||||
@property
|
||||
def id(self):
|
||||
return '%s-%s' % (self.archive.id, self.name.lower())
|
||||
|
||||
def get_matches(self, video, hearing_impaired=False):
|
||||
matches = set()
|
||||
|
||||
# episode
|
||||
if isinstance(video, Episode) and self.type == 'episode':
|
||||
# series
|
||||
if video.series and (sanitize(self.title) in (
|
||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
||||
matches.add('series')
|
||||
|
||||
# year
|
||||
if video.original_series and self.year is None or video.year and video.year == self.year:
|
||||
matches.add('year')
|
||||
|
||||
# imdb_id
|
||||
if video.series_imdb_id and self.imdb_id == video.series_imdb_id:
|
||||
matches.add('series_imdb_id')
|
||||
|
||||
# movie
|
||||
elif isinstance(video, Movie) and self.type == 'movie':
|
||||
# title
|
||||
if video.title and (sanitize(self.title) in (
|
||||
sanitize(name) for name in [video.title] + video.alternative_titles)):
|
||||
matches.add('title')
|
||||
|
||||
# year
|
||||
if video.year and self.year == video.year:
|
||||
matches.add('year')
|
||||
|
||||
# imdb_id
|
||||
if video.imdb_id and self.imdb_id == video.imdb_id:
|
||||
matches.add('imdb_id')
|
||||
|
||||
# name
|
||||
matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
class LegendasTVProvider(Provider):
|
||||
"""LegendasTV Provider.
|
||||
|
||||
:param str username: username.
|
||||
:param str password: password.
|
||||
"""
|
||||
|
||||
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
|
||||
server_url = 'http://legendas.tv/'
|
||||
subtitle_class = LegendasTVSubtitle
|
||||
|
||||
def __init__(self, username=None, password=None):
|
||||
|
||||
# Provider needs UNRAR installed. If not available raise ConfigurationError
|
||||
try:
|
||||
rarfile.custom_check(rarfile.UNRAR_TOOL)
|
||||
except rarfile.RarExecError:
|
||||
raise ConfigurationError('UNRAR tool not available')
|
||||
|
||||
if any((username, password)) and not all((username, password)):
|
||||
raise ConfigurationError('Username and password must be specified')
|
||||
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.logged_in = False
|
||||
self.session = None
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
|
||||
|
||||
# login
|
||||
if self.username and self.password:
|
||||
logger.info('Logging in')
|
||||
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
|
||||
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
|
||||
raise_for_status(r)
|
||||
|
||||
soup = ParserBeautifulSoup(r.content, ['html.parser'])
|
||||
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
|
||||
raise AuthenticationError(self.username)
|
||||
|
||||
logger.debug('Logged in')
|
||||
self.logged_in = True
|
||||
|
||||
def terminate(self):
|
||||
# logout
|
||||
if self.logged_in:
|
||||
logger.info('Logging out')
|
||||
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
|
||||
raise_for_status(r)
|
||||
logger.debug('Logged out')
|
||||
self.logged_in = False
|
||||
|
||||
self.session.close()
|
||||
|
||||
@staticmethod
|
||||
def is_valid_title(title, title_id, sanitized_title, season, year):
|
||||
"""Check if is a valid title."""
|
||||
sanitized_result = sanitize(title['title'])
|
||||
if sanitized_result != sanitized_title:
|
||||
logger.debug("Mismatched title, discarding title %d (%s)",
|
||||
title_id, sanitized_result)
|
||||
return
|
||||
|
||||
# episode type
|
||||
if season:
|
||||
# discard mismatches on type
|
||||
if title['type'] != 'episode':
|
||||
logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
|
||||
return
|
||||
|
||||
# discard mismatches on season
|
||||
if 'season' not in title or title['season'] != season:
|
||||
logger.debug('Mismatched season %s, discarding title %d (%s)',
|
||||
title.get('season'), title_id, sanitized_result)
|
||||
return
|
||||
# movie type
|
||||
else:
|
||||
# discard mismatches on type
|
||||
if title['type'] != 'movie':
|
||||
logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
|
||||
return
|
||||
|
||||
# discard mismatches on year
|
||||
if year is not None and 'year' in title and title['year'] != year:
|
||||
logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
|
||||
return
|
||||
return True
|
||||
|
||||
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
|
||||
def search_titles(self, title, season, title_year):
|
||||
"""Search for titles matching the `title`.
|
||||
|
||||
For episodes, each season has it own title
|
||||
:param str title: the title to search for.
|
||||
:param int season: season of the title
|
||||
:param int title_year: year of the title
|
||||
:return: found titles.
|
||||
:rtype: dict
|
||||
"""
|
||||
titles = {}
|
||||
sanitized_titles = [sanitize(title)]
|
||||
ignore_characters = {'\'', '.'}
|
||||
if any(c in title for c in ignore_characters):
|
||||
sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
|
||||
|
||||
for sanitized_title in sanitized_titles:
|
||||
# make the query
|
||||
if season:
|
||||
logger.info('Searching episode title %r for season %r', sanitized_title, season)
|
||||
else:
|
||||
logger.info('Searching movie title %r', sanitized_title)
|
||||
|
||||
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
|
||||
raise_for_status(r)
|
||||
results = json.loads(r.text)
|
||||
|
||||
# loop over results
|
||||
for result in results:
|
||||
source = result['_source']
|
||||
|
||||
# extract id
|
||||
title_id = int(source['id_filme'])
|
||||
|
||||
# extract type
|
||||
title = {'type': type_map[source['tipo']]}
|
||||
|
||||
# extract title, year and country
|
||||
name, year, country = title_re.match(source['dsc_nome']).groups()
|
||||
title['title'] = name
|
||||
|
||||
# extract imdb_id
|
||||
if source['id_imdb'] != '0':
|
||||
if not source['id_imdb'].startswith('tt'):
|
||||
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
|
||||
else:
|
||||
title['imdb_id'] = source['id_imdb']
|
||||
|
||||
# extract season
|
||||
if title['type'] == 'episode':
|
||||
if source['temporada'] and source['temporada'].isdigit():
|
||||
title['season'] = int(source['temporada'])
|
||||
else:
|
||||
                    match = season_re.search(source['dsc_nome_br'])
                    if match:
                        title['season'] = int(match.group('season'))
                    else:
                        logger.debug('No season detected for title %d (%s)', title_id, name)

                # extract year
                if year:
                    title['year'] = int(year)
                elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
                    # year is based on the season air date, hence the adjustment
                    title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1

                # add the title only if it is valid
                # check against the title without ignored chars
                if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
                    titles[title_id] = title

        logger.debug('Found %d titles', len(titles))

        return titles

    @region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
    def get_archives(self, title_id, language_code, title_type, season, episode):
        """Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`.

        :param int title_id: title id.
        :param int language_code: language code.
        :param str title_type: episode or movie.
        :param int season: season.
        :param int episode: episode.
        :return: the archives.
        :rtype: list of :class:`LegendasTVArchive`

        """
        archives = []
        page = 0
        while True:
            # get the archive page
            url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
                language=language_code, page=page, title=title_id)
            r = self.session.get(url)
            raise_for_status(r)

            # parse the results
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
                # create the archive
                archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
                                            archive_soup.a.text,
                                            'pack' in archive_soup.parent['class'],
                                            'destaque' in archive_soup.parent['class'],
                                            self.server_url + archive_soup.a['href'][1:])
                # clean the name of path separators and pack flags
                clean_name = archive.name.replace('/', '-')
                if archive.pack and clean_name.startswith('(p)'):
                    clean_name = clean_name[3:]

                # guess from the name
                guess = guessit(clean_name, {'type': title_type})

                # episode
                if season and episode:
                    # discard mismatches on episode in non-pack archives

                    # guessit may return an int for a single episode or a list for a multi-episode
                    # release, so normalize both sides to sets before comparing
                    if not archive.pack and 'episode' in guess:
                        wanted_episode = set(episode) if isinstance(episode, list) else {episode}
                        archive_episode = set(guess['episode']) if isinstance(guess['episode'], list) else {guess['episode']}

                        if not wanted_episode.intersection(archive_episode):
                            logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
                            continue

                # extract the text containing downloads, rating and timestamp
                data_text = archive_soup.find('p', class_='data').text

                # match downloads
                archive.downloads = int(downloads_re.search(data_text).group('downloads'))

                # match rating
                match = rating_re.search(data_text)
                if match:
                    archive.rating = int(match.group('rating'))

                # match the timestamp and validate it
                time_data = {k: int(v) for k, v in timestamp_re.search(data_text).groupdict().items()}
                archive.timestamp = pytz.timezone('America/Sao_Paulo').localize(datetime(**time_data))
                if archive.timestamp > datetime.utcnow().replace(tzinfo=pytz.utc):
                    raise ProviderError('Archive timestamp is in the future')

                # add the archive
                logger.info('Found archive for title %d and language %d at page %s: %s',
                            title_id, language_code, page, archive)
                archives.append(archive)

            # stop on the last page
            if soup.find('a', attrs={'class': 'load_more'}, string='carregar mais') is None:
                break

            # increment the page count
            page += 1

        logger.debug('Found %d archives', len(archives))

        return archives
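
    # A hedged sketch of the multi-episode filter above (episode numbers invented):
    # wanted_episode = {5}; archive_episode = {4, 5, 6} -> intersection is {5}, archive kept
    # wanted_episode = {7}; archive_episode = {4, 5, 6} -> intersection is empty, archive discarded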
|
||||
|
||||
def download_archive(self, archive):
|
||||
"""Download an archive's :attr:`~LegendasTVArchive.content`.
|
||||
|
||||
:param archive: the archive to download :attr:`~LegendasTVArchive.content` of.
|
||||
:type archive: :class:`LegendasTVArchive`
|
||||
|
||||
"""
|
||||
logger.info('Downloading archive %s', archive.id)
|
||||
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
|
||||
raise_for_status(r)
|
||||
|
||||
# open the archive
|
||||
archive_stream = io.BytesIO(r.content)
|
||||
if is_rarfile(archive_stream):
|
||||
logger.debug('Identified rar archive')
|
||||
archive.content = RarFile(archive_stream)
|
||||
elif is_zipfile(archive_stream):
|
||||
logger.debug('Identified zip archive')
|
||||
archive.content = ZipFile(archive_stream)
|
||||
else:
|
||||
raise ValueError('Not a valid archive')
|
||||
|
||||
def query(self, language, title, season=None, episode=None, year=None):
|
||||
# search for titles
|
||||
titles = self.search_titles(title, season, year)
|
||||
|
||||
subtitles = []
|
||||
# iterate over titles
|
||||
for title_id, t in titles.items():
|
||||
|
||||
logger.info('Getting archives for title %d and language %d', title_id, language.legendastv)
|
||||
archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode)
|
||||
if not archives:
|
||||
logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
|
||||
|
||||
# iterate over title's archives
|
||||
for a in archives:
|
||||
|
||||
# compute an expiration time based on the archive timestamp
|
||||
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
|
||||
|
||||
# attempt to get the releases from the cache
|
||||
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
|
||||
releases = region.get(cache_key, expiration_time=expiration_time)
|
||||
|
||||
# the releases are not in cache or cache is expired
|
||||
if releases == NO_VALUE:
|
||||
logger.info('Releases not found in cache')
|
||||
|
||||
# download archive
|
||||
self.download_archive(a)
|
||||
|
||||
# extract the releases
|
||||
releases = []
|
||||
for name in a.content.namelist():
|
||||
# discard the legendastv file
|
||||
if name.startswith('Legendas.tv'):
|
||||
continue
|
||||
|
||||
# discard hidden files
|
||||
if os.path.split(name)[-1].startswith('.'):
|
||||
continue
|
||||
|
||||
# discard non-subtitle files
|
||||
if not name.lower().endswith(SUBTITLE_EXTENSIONS):
|
||||
continue
|
||||
|
||||
releases.append(name)
|
||||
|
||||
# cache the releases
|
||||
region.set(cache_key, releases)
|
||||
|
||||
# iterate over releases
|
||||
for r in releases:
|
||||
subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
|
||||
t.get('season'), a, r)
|
||||
logger.debug('Found subtitle %r', subtitle)
|
||||
subtitles.append(subtitle)
|
||||
|
||||
return subtitles
|
||||
|
||||
def list_subtitles(self, video, languages):
|
||||
season = episode = None
|
||||
if isinstance(video, Episode):
|
||||
titles = [video.series] + video.alternative_series
|
||||
season = video.season
|
||||
episode = video.episode
|
||||
else:
|
||||
titles = [video.title] + video.alternative_titles
|
||||
|
||||
for title in titles:
|
||||
subtitles = [s for l in languages for s in
|
||||
self.query(l, title, season=season, episode=episode, year=video.year)]
|
||||
if subtitles:
|
||||
return subtitles
|
||||
|
||||
return []
|
||||
|
||||
def download_subtitle(self, subtitle):
|
||||
# download archive in case we previously hit the releases cache and didn't download it
|
||||
if subtitle.archive.content is None:
|
||||
self.download_archive(subtitle.archive)
|
||||
|
||||
# extract subtitle's content
|
||||
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
|
||||
|
||||
|
||||
def raise_for_status(r):
|
||||
# When site is under maintaince and http status code 200.
|
||||
if 'Em breve estaremos de volta' in r.text:
|
||||
raise ServiceUnavailable
|
||||
else:
|
||||
r.raise_for_status()
108
libs/subliminal2.7/providers/napiprojekt.py
Normal file
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
import logging

from babelfish import Language
from requests import Session

from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle

logger = logging.getLogger(__name__)


def get_subhash(hash):
    """Get a second hash based on napiprojekt's hash.

    :param str hash: napiprojekt's hash.
    :return: the subhash.
    :rtype: str

    """
    idx = [0xe, 0x3, 0x6, 0x8, 0x2]
    mul = [2, 2, 5, 4, 3]
    add = [0, 0xd, 0x10, 0xb, 0x5]

    b = []
    for i in range(len(idx)):
        a = add[i]
        m = mul[i]
        i = idx[i]
        t = a + int(hash[i], 16)
        v = int(hash[t:t + 2], 16)
        b.append(('%x' % (v * m))[-1])

    return ''.join(b)
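
# A hedged worked example of get_subhash (the input hash is made up): each round reads one
# nibble of the hash, offsets it, reads a byte at that offset, multiplies it and keeps the
# last hex digit.
# round 0: hash[0xe] == 'e' -> 14, t = 0 + 14, hash[14:16] == 'ef' -> 239, 239 * 2 = 478 = 0x1de -> 'e'
# the remaining rounds yield '2', '3', '0' and '8', so:
# get_subhash('0123456789abcdef0123456789abcdef') == 'e2308'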

class NapiProjektSubtitle(Subtitle):
    """NapiProjekt Subtitle."""
    provider_name = 'napiprojekt'

    def __init__(self, language, hash):
        super(NapiProjektSubtitle, self).__init__(language)
        self.hash = hash
        self.content = None

    @property
    def id(self):
        return self.hash

    def get_matches(self, video):
        matches = set()

        # hash
        if 'napiprojekt' in video.hashes and video.hashes['napiprojekt'] == self.hash:
            matches.add('hash')

        return matches


class NapiProjektProvider(Provider):
    """NapiProjekt Provider."""
    languages = {Language.fromalpha2(l) for l in ['pl']}
    required_hash = 'napiprojekt'
    server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
    subtitle_class = NapiProjektSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    def query(self, language, hash):
        params = {
            'v': 'dreambox',
            'kolejka': 'false',
            'nick': '',
            'pass': '',
            'napios': 'Linux',
            'l': language.alpha2.upper(),
            'f': hash,
            't': get_subhash(hash)}
        logger.info('Searching subtitle %r', params)
        r = self.session.get(self.server_url, params=params, timeout=10)
        r.raise_for_status()

        # handle subtitles not found and errors
        if r.content[:4] == b'NPc0':
            logger.debug('No subtitles found')
            return None

        subtitle = self.subtitle_class(language, hash)
        subtitle.content = r.content
        logger.debug('Found subtitle %r', subtitle)

        return subtitle

    def list_subtitles(self, video, languages):
        return [s for s in [self.query(l, video.hashes['napiprojekt']) for l in languages] if s is not None]

    def download_subtitle(self, subtitle):
        # there is no download step, the content is already filled from listing subtitles
        pass
297
libs/subliminal2.7/providers/opensubtitles.py
Normal file
@@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-
import base64
import logging
import os
import re
import zlib

from babelfish import Language, language_converters
from guessit import guessit
from six.moves.xmlrpc_client import ServerProxy

from . import Provider, TimeoutSafeTransport
from .. import __short_version__
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError,
                          ServiceUnavailable)
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie

logger = logging.getLogger(__name__)


class OpenSubtitlesSubtitle(Subtitle):
    """OpenSubtitles Subtitle."""
    provider_name = 'opensubtitles'
    series_re = re.compile(r'^"(?P<series_name>.*)" (?P<series_title>.*)$')

    def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
                 movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
        super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired,
                                                    page_link=page_link, encoding=encoding)
        self.subtitle_id = subtitle_id
        self.matched_by = matched_by
        self.movie_kind = movie_kind
        self.hash = hash
        self.movie_name = movie_name
        self.movie_release_name = movie_release_name
        self.movie_year = movie_year
        self.movie_imdb_id = movie_imdb_id
        self.series_season = series_season
        self.series_episode = series_episode
        self.filename = filename

    @property
    def id(self):
        return str(self.subtitle_id)

    @property
    def series_name(self):
        return self.series_re.match(self.movie_name).group('series_name')

    @property
    def series_title(self):
        return self.series_re.match(self.movie_name).group('series_title')

    def get_matches(self, video):
        matches = set()

        # episode
        if isinstance(video, Episode) and self.movie_kind == 'episode':
            # tag match, assume series, year, season and episode match
            if self.matched_by == 'tag':
                if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
                    matches |= {'series', 'year', 'season', 'episode'}
            # series
            if video.series and sanitize(self.series_name) == sanitize(video.series):
                matches.add('series')
            # year
            if video.original_series and self.movie_year is None or video.year and video.year == self.movie_year:
                matches.add('year')
            # season
            if video.season and self.series_season == video.season:
                matches.add('season')
            # episode
            if video.episode and self.series_episode == video.episode:
                matches.add('episode')
            # title
            if video.title and sanitize(self.series_title) == sanitize(video.title):
                matches.add('title')
            # guess
            matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'episode'}))
            matches |= guess_matches(video, guessit(self.filename, {'type': 'episode'}))
            # hash
            if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']:
                if 'series' in matches and 'season' in matches and 'episode' in matches:
                    matches.add('hash')
                else:
                    logger.debug('Match on hash discarded')
        # movie
        elif isinstance(video, Movie) and self.movie_kind == 'movie':
            # tag match, assume title and year match
            if self.matched_by == 'tag':
                if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
                    matches |= {'title', 'year'}
            # title
            if video.title and sanitize(self.movie_name) == sanitize(video.title):
                matches.add('title')
            # year
            if video.year and self.movie_year == video.year:
                matches.add('year')
            # guess
            matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'movie'}))
            matches |= guess_matches(video, guessit(self.filename, {'type': 'movie'}))
            # hash
            if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']:
                if 'title' in matches:
                    matches.add('hash')
                else:
                    logger.debug('Match on hash discarded')
        else:
            logger.info('%r is not a valid movie_kind', self.movie_kind)
            return matches

        # imdb_id
        if video.imdb_id and self.movie_imdb_id == video.imdb_id:
            matches.add('imdb_id')

        return matches


class OpenSubtitlesProvider(Provider):
    """OpenSubtitles Provider.

    :param str username: username.
    :param str password: password.

    """
    languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
    subtitle_class = OpenSubtitlesSubtitle

    def __init__(self, username=None, password=None):
        self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
        if any((username, password)) and not all((username, password)):
            raise ConfigurationError('Username and password must be specified')
        # None values are not allowed when logging in, so replace them with ''
        self.username = username or ''
        self.password = password or ''
        self.token = None

    def initialize(self):
        logger.info('Logging in')
        response = checked(self.server.LogIn(self.username, self.password, 'eng',
                                             'subliminal v%s' % __short_version__))
        self.token = response['token']
        logger.debug('Logged in with token %r', self.token)

    def terminate(self):
        logger.info('Logging out')
        checked(self.server.LogOut(self.token))
        self.server.close()
        self.token = None
        logger.debug('Logged out')

    def no_operation(self):
        logger.debug('No operation')
        checked(self.server.NoOperation(self.token))

    def query(self, languages, hash=None, size=None, imdb_id=None, query=None, season=None, episode=None, tag=None):
        # fill the search criteria
        criteria = []
        if hash and size:
            criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
        if imdb_id:
            if season and episode:
                criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
            else:
                criteria.append({'imdbid': imdb_id[2:]})
        if tag:
            criteria.append({'tag': tag})
        if query and season and episode:
            criteria.append({'query': query.replace('\'', ''), 'season': season, 'episode': episode})
        elif query:
            criteria.append({'query': query.replace('\'', '')})
        if not criteria:
            raise ValueError('Not enough information')

        # add the language
        for criterion in criteria:
            criterion['sublanguageid'] = ','.join(sorted(l.opensubtitles for l in languages))

        # query the server
        logger.info('Searching subtitles %r', criteria)
        response = checked(self.server.SearchSubtitles(self.token, criteria))
        subtitles = []

        # exit if no data
        if not response['data']:
            logger.debug('No subtitles found')
            return subtitles

        # loop over subtitle items
        for subtitle_item in response['data']:
            # read the item
            language = Language.fromopensubtitles(subtitle_item['SubLanguageID'])
            hearing_impaired = bool(int(subtitle_item['SubHearingImpaired']))
            page_link = subtitle_item['SubtitlesLink']
            subtitle_id = int(subtitle_item['IDSubtitleFile'])
            matched_by = subtitle_item['MatchedBy']
            movie_kind = subtitle_item['MovieKind']
            hash = subtitle_item['MovieHash']
            movie_name = subtitle_item['MovieName']
            movie_release_name = subtitle_item['MovieReleaseName']
            movie_year = int(subtitle_item['MovieYear']) if subtitle_item['MovieYear'] else None
            movie_imdb_id = 'tt' + subtitle_item['IDMovieImdb']
            series_season = int(subtitle_item['SeriesSeason']) if subtitle_item['SeriesSeason'] else None
            series_episode = int(subtitle_item['SeriesEpisode']) if subtitle_item['SeriesEpisode'] else None
            filename = subtitle_item['SubFileName']
            encoding = subtitle_item.get('SubEncoding') or None

            subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
                                           hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
                                           series_season, series_episode, filename, encoding)
            logger.debug('Found subtitle %r by %s', subtitle, matched_by)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        season = episode = None
        if isinstance(video, Episode):
            query = video.series
            season = video.season
            episode = video.episode
        else:
            query = video.title

        return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id,
                          query=query, season=season, episode=episode, tag=os.path.basename(video.name))

    def download_subtitle(self, subtitle):
        logger.info('Downloading subtitle %r', subtitle)
        response = checked(self.server.DownloadSubtitles(self.token, [str(subtitle.subtitle_id)]))
        subtitle.content = fix_line_ending(zlib.decompress(base64.b64decode(response['data'][0]['data']), 47))


class OpenSubtitlesError(ProviderError):
    """Base class for non-generic :class:`OpenSubtitlesProvider` exceptions."""
    pass


class Unauthorized(OpenSubtitlesError, AuthenticationError):
    """Exception raised when status is '401 Unauthorized'."""
    pass


class NoSession(OpenSubtitlesError, AuthenticationError):
    """Exception raised when status is '406 No session'."""
    pass


class DownloadLimitReached(OpenSubtitlesError, DownloadLimitExceeded):
    """Exception raised when status is '407 Download limit reached'."""
    pass


class InvalidImdbid(OpenSubtitlesError):
    """Exception raised when status is '413 Invalid ImdbID'."""
    pass


class UnknownUserAgent(OpenSubtitlesError, AuthenticationError):
    """Exception raised when status is '414 Unknown User Agent'."""
    pass


class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
    """Exception raised when status is '415 Disabled user agent'."""
    pass


def checked(response):
    """Check a response status before returning it.

    :param response: a response from an XML-RPC call to OpenSubtitles.
    :return: the response.
    :raise: :class:`OpenSubtitlesError`

    """
    status_code = int(response['status'][:3])
    if status_code == 401:
        raise Unauthorized
    if status_code == 406:
        raise NoSession
    if status_code == 407:
        raise DownloadLimitReached
    if status_code == 413:
        raise InvalidImdbid
    if status_code == 414:
        raise UnknownUserAgent
    if status_code == 415:
        raise DisabledUserAgent
    if status_code == 503:
        raise ServiceUnavailable
    if status_code != 200:
        raise OpenSubtitlesError(response['status'])

    return response
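
# Since every XML-RPC call goes through checked(), its behaviour is easy to exercise with
# hand-made response dicts, a hedged sketch (payloads invented):
# checked({'status': '200 OK', 'token': 'abc'}) returns the dict unchanged
# checked({'status': '407 Download limit reached'}) raises DownloadLimitReached
# checked({'status': '999 Unknown'}) raises OpenSubtitlesError('999 Unknown')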
197
libs/subliminal2.7/providers/podnapisi.py
Normal file
@@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
import io
import logging
import re

from babelfish import Language, language_converters
from guessit import guessit
try:
    from lxml import etree
except ImportError:
    try:
        import xml.etree.cElementTree as etree
    except ImportError:
        import xml.etree.ElementTree as etree
from requests import Session
from zipfile import ZipFile

from . import Provider
from .. import __short_version__
from ..exceptions import ProviderError
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie

logger = logging.getLogger(__name__)


class PodnapisiSubtitle(Subtitle):
    """Podnapisi Subtitle."""
    provider_name = 'podnapisi'

    def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
                 year=None):
        super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
        self.pid = pid
        self.releases = releases
        self.title = title
        self.season = season
        self.episode = episode
        self.year = year

    @property
    def id(self):
        return self.pid

    def get_matches(self, video):
        matches = set()

        # episode
        if isinstance(video, Episode):
            # series
            if video.series and (sanitize(self.title) in (
                    sanitize(name) for name in [video.series] + video.alternative_series)):
                matches.add('series')
            # year
            if video.original_series and self.year is None or video.year and video.year == self.year:
                matches.add('year')
            # season
            if video.season and self.season == video.season:
                matches.add('season')
            # episode
            if video.episode and self.episode == video.episode:
                matches.add('episode')
            # guess
            for release in self.releases:
                matches |= guess_matches(video, guessit(release, {'type': 'episode'}))
        # movie
        elif isinstance(video, Movie):
            # title
            if video.title and (sanitize(self.title) in (
                    sanitize(name) for name in [video.title] + video.alternative_titles)):
                matches.add('title')
            # year
            if video.year and self.year == video.year:
                matches.add('year')
            # guess
            for release in self.releases:
                matches |= guess_matches(video, guessit(release, {'type': 'movie'}))

        return matches


class PodnapisiProvider(Provider):
    """Podnapisi Provider."""
    languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
                 {Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
    server_url = 'https://www.podnapisi.net/subtitles/'
    subtitle_class = PodnapisiSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    def query(self, language, keyword, season=None, episode=None, year=None):
        # set the parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652
        params = {'sXML': 1, 'sL': str(language), 'sK': keyword}
        is_episode = False
        if season and episode:
            is_episode = True
            params['sTS'] = season
            params['sTE'] = episode
        if year:
            params['sY'] = year

        # loop over paginated results
        logger.info('Searching subtitles %r', params)
        subtitles = []
        pids = set()
        while True:
            # query the server
            r = self.session.get(self.server_url + 'search/old', params=params, timeout=10)
            r.raise_for_status()
            xml = etree.fromstring(r.content)

            # exit if no results
            if not int(xml.find('pagination/results').text):
                logger.debug('No subtitles found')
                break

            # loop over subtitles
            for subtitle_xml in xml.findall('subtitle'):
                # read the xml elements
                pid = subtitle_xml.find('pid').text
                # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
                if pid in pids:
                    continue

                language = Language.fromietf(subtitle_xml.find('language').text)
                hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
                page_link = subtitle_xml.find('url').text
                releases = []
                if subtitle_xml.find('release').text:
                    for release in subtitle_xml.find('release').text.split():
                        release = re.sub(r'\.+$', '', release)  # remove trailing dots
                        release = ''.join(filter(lambda x: ord(x) < 128, release))  # remove non-ascii characters
                        releases.append(release)
                title = subtitle_xml.find('title').text
                season = int(subtitle_xml.find('tvSeason').text)
                episode = int(subtitle_xml.find('tvEpisode').text)
                year = int(subtitle_xml.find('year').text)

                if is_episode:
                    subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
                                                   season=season, episode=episode, year=year)
                else:
                    subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
                                                   year=year)

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)
                pids.add(pid)

            # stop on the last page
            if int(xml.find('pagination/current').text) >= int(xml.find('pagination/count').text):
                break

            # increment the current page
            params['page'] = int(xml.find('pagination/current').text) + 1
            logger.debug('Getting page %d', params['page'])

        return subtitles
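
    # A hedged sketch of the parameters built above for an episode search (values invented):
    # query(Language('eng'), 'Breaking Bad', season=1, episode=1, year=2008) sends
    # {'sXML': 1, 'sL': 'en', 'sK': 'Breaking Bad', 'sTS': 1, 'sTE': 1, 'sY': 2008}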

    def list_subtitles(self, video, languages):
        season = episode = None
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
            season = video.season
            episode = video.episode
        else:
            titles = [video.title] + video.alternative_titles

        for title in titles:
            subtitles = [s for l in languages for s in
                         self.query(l, title, season=season, episode=episode, year=video.year)]
            if subtitles:
                return subtitles

        return []

    def download_subtitle(self, subtitle):
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.pid + '/download', params={'container': 'zip'}, timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
83
libs/subliminal2.7/providers/shooter.py
Normal file
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
import json
import logging
import os

from babelfish import Language, language_converters
from requests import Session

from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle, fix_line_ending

logger = logging.getLogger(__name__)

language_converters.register('shooter = subliminal.converters.shooter:ShooterConverter')


class ShooterSubtitle(Subtitle):
    """Shooter Subtitle."""
    provider_name = 'shooter'

    def __init__(self, language, hash, download_link):
        super(ShooterSubtitle, self).__init__(language)
        self.hash = hash
        self.download_link = download_link

    @property
    def id(self):
        return self.download_link

    def get_matches(self, video):
        matches = set()

        # hash
        if 'shooter' in video.hashes and video.hashes['shooter'] == self.hash:
            matches.add('hash')

        return matches


class ShooterProvider(Provider):
    """Shooter Provider."""
    languages = {Language(l) for l in ['eng', 'zho']}
    server_url = 'https://www.shooter.cn/api/subapi.php'
    subtitle_class = ShooterSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    def query(self, language, filename, hash=None):
        # query the server
        params = {'filehash': hash, 'pathinfo': os.path.realpath(filename), 'format': 'json', 'lang': language.shooter}
        logger.debug('Searching subtitles %r', params)
        r = self.session.post(self.server_url, params=params, timeout=10)
        r.raise_for_status()

        # handle subtitles not found
        if r.content == b'\xff':
            logger.debug('No subtitles found')
            return []

        # parse the subtitles
        results = json.loads(r.text)
        subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']]

        return subtitles
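
    # A hedged sketch of the JSON shape the comprehension above flattens (sample data invented):
    # results = [{'Files': [{'Ext': 'srt', 'Link': 'https://dl.example/a.srt'}]},
    #            {'Files': [{'Ext': 'srt', 'Link': 'https://dl.example/b.srt'}]}]
    # -> one ShooterSubtitle per 'Link', two in total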

    def list_subtitles(self, video, languages):
        return [s for l in languages for s in self.query(l, video.name, video.hashes.get('shooter'))]

    def download_subtitle(self, subtitle):
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link, timeout=10)
        r.raise_for_status()

        subtitle.content = fix_line_ending(r.content)
243
libs/subliminal2.7/providers/subscenter.py
Normal file
@@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
import bisect
from collections import defaultdict
import io
import json
import logging
import zipfile

from babelfish import Language
from guessit import guessit
from requests import Session

from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie

logger = logging.getLogger(__name__)


class SubsCenterSubtitle(Subtitle):
    """SubsCenter Subtitle."""
    provider_name = 'subscenter'

    def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id,
                 subtitle_key, subtitle_version, downloaded, releases):
        super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
        self.series = series
        self.season = season
        self.episode = episode
        self.title = title
        self.subtitle_id = subtitle_id
        self.subtitle_key = subtitle_key
        self.subtitle_version = subtitle_version
        self.downloaded = downloaded
        self.releases = releases

    @property
    def id(self):
        return str(self.subtitle_id)

    def get_matches(self, video):
        matches = set()

        # episode
        if isinstance(video, Episode):
            # series
            if video.series and sanitize(self.series) == sanitize(video.series):
                matches.add('series')
            # season
            if video.season and self.season == video.season:
                matches.add('season')
            # episode
            if video.episode and self.episode == video.episode:
                matches.add('episode')
            # guess
            for release in self.releases:
                matches |= guess_matches(video, guessit(release, {'type': 'episode'}))
        # movie
        elif isinstance(video, Movie):
            # guess
            for release in self.releases:
                matches |= guess_matches(video, guessit(release, {'type': 'movie'}))

        # title
        if video.title and sanitize(self.title) == sanitize(video.title):
            matches.add('title')

        return matches


class SubsCenterProvider(Provider):
    """SubsCenter Provider."""
    languages = {Language.fromalpha2(l) for l in ['he']}
    server_url = 'http://www.subscenter.org/he/'
    subtitle_class = SubsCenterSubtitle

    def __init__(self, username=None, password=None):
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.session = None
        self.username = username
        self.password = password
        self.logged_in = False

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)

        # login
        if self.username is not None and self.password is not None:
            logger.debug('Logging in')
            url = self.server_url + 'subscenter/accounts/login/'

            # retrieve the CSRF token
            self.session.get(url)
            csrf_token = self.session.cookies['csrftoken']

            # actual login
            data = {'username': self.username, 'password': self.password, 'csrfmiddlewaretoken': csrf_token}
            r = self.session.post(url, data, allow_redirects=False, timeout=10)

            if r.status_code != 302:
                raise AuthenticationError(self.username)

            logger.info('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url + 'subscenter/accounts/logout/', timeout=10)
            r.raise_for_status()
            logger.info('Logged out')
            self.logged_in = False

        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_url_titles(self, title):
        """Search the URL titles by kind for the given `title`.

        :param str title: title to search for.
        :return: the URL titles by kind.
        :rtype: collections.defaultdict

        """
        # make the search
        logger.info('Searching title name for %r', title)
        r = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10)
        r.raise_for_status()

        # check for redirections
        if r.history and all([h.status_code == 302 for h in r.history]):
            logger.debug('Redirected to the subtitles page')
            links = [r.url]
        else:
            # get the suggestions (if needed)
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            links = [link.attrs['href'] for link in soup.select('#processes div.generalWindowTop a')]
            logger.debug('Found %d suggestions', len(links))

        url_titles = defaultdict(list)
        for link in links:
            parts = link.split('/')
            url_titles[parts[-3]].append(parts[-2])

        return url_titles
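
    # A hedged sketch of how a suggestion link is keyed by kind (the link is invented):
    # 'http://www.subscenter.org/he/subtitle/series/greys-anatomy/'.split('/') ends with
    # [..., 'series', 'greys-anatomy', ''], so parts[-3] == 'series' and parts[-2] == 'greys-anatomy',
    # giving url_titles == {'series': ['greys-anatomy']}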

    def query(self, title, season=None, episode=None):
        # search for the url title
        url_titles = self._search_url_titles(title)

        # episode
        if season and episode:
            if 'series' not in url_titles:
                logger.error('No URL title found for series %r', title)
                return []
            url_title = url_titles['series'][0]
            logger.debug('Using series title %r', url_title)
            url = self.server_url + 'cst/data/series/sb/{}/{}/{}/'.format(url_title, season, episode)
            page_link = self.server_url + 'subtitle/series/{}/{}/{}/'.format(url_title, season, episode)
        else:
            if 'movie' not in url_titles:
                logger.error('No URL title found for movie %r', title)
                return []
            url_title = url_titles['movie'][0]
            logger.debug('Using movie title %r', url_title)
            url = self.server_url + 'cst/data/movie/sb/{}/'.format(url_title)
            page_link = self.server_url + 'subtitle/movie/{}/'.format(url_title)

        # get the list of subtitles
        logger.debug('Getting the list of subtitles')
        r = self.session.get(url)
        r.raise_for_status()
        results = json.loads(r.text)

        # loop over the results
        subtitles = {}
        for language_code, language_data in results.items():
            for quality_data in language_data.values():
                for quality, subtitles_data in quality_data.items():
                    for subtitle_item in subtitles_data.values():
                        # read the item
                        language = Language.fromalpha2(language_code)
                        hearing_impaired = bool(subtitle_item['hearing_impaired'])
                        subtitle_id = subtitle_item['id']
                        subtitle_key = subtitle_item['key']
                        subtitle_version = subtitle_item['h_version']
                        downloaded = subtitle_item['downloaded']
                        release = subtitle_item['subtitle_version']

                        # add the release and increment the downloaded count if we already have the subtitle
                        if subtitle_id in subtitles:
                            logger.debug('Found additional release %r for subtitle %d', release, subtitle_id)
                            bisect.insort_left(subtitles[subtitle_id].releases, release)  # deterministic order
                            subtitles[subtitle_id].downloaded += downloaded
                            continue

                        # otherwise create it
                        subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
                                                       title, subtitle_id, subtitle_key, subtitle_version, downloaded,
                                                       [release])
                        logger.debug('Found subtitle %r', subtitle)
                        subtitles[subtitle_id] = subtitle

        return subtitles.values()

    def list_subtitles(self, video, languages):
        season = episode = None
        title = video.title

        if isinstance(video, Episode):
            title = video.series
            season = video.season
            episode = video.episode

        return [s for s in self.query(title, season, episode) if s.language in languages]

    def download_subtitle(self, subtitle):
        # download
        url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
        params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
        r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
        r.raise_for_status()

        # open the zip
        try:
            with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
                # remove some filenames from the namelist
                namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
                if len(namelist) > 1:
                    raise ProviderError('More than one file to unzip')

                subtitle.content = fix_line_ending(zf.read(namelist[0]))
        except zipfile.BadZipfile:
            # if no zip file was retrieved, the daily download limit has been exceeded
            raise ProviderError('Daily limit exceeded')
88
libs/subliminal2.7/providers/thesubdb.py
Normal file
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
import logging

from babelfish import Language, language_converters
from requests import Session

from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle, fix_line_ending

logger = logging.getLogger(__name__)

language_converters.register('thesubdb = subliminal.converters.thesubdb:TheSubDBConverter')


class TheSubDBSubtitle(Subtitle):
    """TheSubDB Subtitle."""
    provider_name = 'thesubdb'

    def __init__(self, language, hash):
        super(TheSubDBSubtitle, self).__init__(language)
        self.hash = hash

    @property
    def id(self):
        return self.hash + '-' + str(self.language)

    def get_matches(self, video):
        matches = set()

        # hash
        if 'thesubdb' in video.hashes and video.hashes['thesubdb'] == self.hash:
            matches.add('hash')

        return matches


class TheSubDBProvider(Provider):
    """TheSubDB Provider."""
    languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
    required_hash = 'thesubdb'
    server_url = 'http://api.thesubdb.com/'
    subtitle_class = TheSubDBSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = ('SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)' %
                                              __short_version__)

    def terminate(self):
        self.session.close()

    def query(self, hash):
        # make the query
        params = {'action': 'search', 'hash': hash}
        logger.info('Searching subtitles %r', params)
        r = self.session.get(self.server_url, params=params, timeout=10)

        # handle subtitles not found and errors
        if r.status_code == 404:
            logger.debug('No subtitles found')
            return []
        r.raise_for_status()

        # loop over languages
        subtitles = []
        for language_code in r.text.split(','):
            language = Language.fromthesubdb(language_code)

            subtitle = self.subtitle_class(language, hash)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles
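
    # A hedged sketch of the response format handled above (the body is invented):
    # r.text == 'en,pt' splits into two subtitles for the same hash, one per language code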

    def list_subtitles(self, video, languages):
        return [s for s in self.query(video.hashes['thesubdb']) if s.language in languages]

    def download_subtitle(self, subtitle):
        logger.info('Downloading subtitle %r', subtitle)
        params = {'action': 'download', 'hash': subtitle.hash, 'language': subtitle.language.alpha2}
        r = self.session.get(self.server_url, params=params, timeout=10)
        r.raise_for_status()

        subtitle.content = fix_line_ending(r.content)
226
libs/subliminal2.7/providers/tvsubtitles.py
Normal file
@@ -0,0 +1,226 @@
# -*- coding: utf-8 -*-
import io
import logging
import re
from zipfile import ZipFile

from babelfish import Language, language_converters
from guessit import guessit
from requests import Session

from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import EPISODE_EXPIRATION_TIME, SHOW_EXPIRATION_TIME, region
from ..exceptions import ProviderError
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group
from ..video import Episode

logger = logging.getLogger(__name__)

language_converters.register('tvsubtitles = subliminal.converters.tvsubtitles:TVsubtitlesConverter')

link_re = re.compile(r'^(?P<series>.+?)(?: \(?\d{4}\)?| \((?:US|UK)\))? \((?P<first_year>\d{4})-\d{4}\)$')
episode_id_re = re.compile(r'^episode-\d+\.html$')
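
# A hedged sketch of what link_re captures from the site's show links (titles invented):
# 'Lost (2004-2010)'            -> series='Lost', first_year='2004'
# 'The Office (US) (2005-2013)' -> series='The Office', first_year='2005'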

class TVsubtitlesSubtitle(Subtitle):
    """TVsubtitles Subtitle."""
    provider_name = 'tvsubtitles'

    def __init__(self, language, page_link, subtitle_id, series, season, episode, year, rip, release):
        super(TVsubtitlesSubtitle, self).__init__(language, page_link=page_link)
        self.subtitle_id = subtitle_id
        self.series = series
        self.season = season
        self.episode = episode
        self.year = year
        self.rip = rip
        self.release = release

    @property
    def id(self):
        return str(self.subtitle_id)

    def get_matches(self, video):
        matches = set()

        # series
        if video.series and (sanitize(self.series) in (
                sanitize(name) for name in [video.series] + video.alternative_series)):
            matches.add('series')
        # season
        if video.season and self.season == video.season:
            matches.add('season')
        # episode
        if video.episode and self.episode == video.episode:
            matches.add('episode')
        # year
        if video.original_series and self.year is None or video.year and video.year == self.year:
            matches.add('year')
        # release_group
        if (video.release_group and self.release and
                any(r in sanitize_release_group(self.release)
                    for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
            matches.add('release_group')
        # other properties
        if self.release:
            matches |= guess_matches(video, guessit(self.release, {'type': 'episode'}), partial=True)
        if self.rip:
            matches |= guess_matches(video, guessit(self.rip), partial=True)

        return matches


class TVsubtitlesProvider(Provider):
    """TVsubtitles Provider."""
    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por',
        'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho'
    ]}
    video_types = (Episode,)
    server_url = 'http://www.tvsubtitles.net/'
    subtitle_class = TVsubtitlesSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if any.
        :rtype: int

        """
        # make the search
        logger.info('Searching show id for %r', series)
        r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10)
        r.raise_for_status()

        # get the series out of the suggestions
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        show_id = None
        for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
            match = link_re.match(suggestion.text)
            if not match:
                logger.error('Failed to match %s', suggestion.text)
                continue

            if match.group('series').lower() == series.lower():
                if year is not None and int(match.group('first_year')) != year:
                    logger.debug('Year does not match')
                    continue
                show_id = int(suggestion['href'][8:-5])
                logger.debug('Found show id %d', show_id)
                break

        return show_id

    @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
    def get_episode_ids(self, show_id, season):
        """Get episode ids from the show id and the season.

        :param int show_id: show id.
        :param int season: season of the episode.
        :return: episode ids per episode number.
        :rtype: dict

        """
        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season), timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over episode rows
        episode_ids = {}
        for row in soup.select('table#table5 tr'):
            # skip rows that do not have a link to the episode page
            if not row('a', href=episode_id_re):
                continue

            # extract data from the cells
            cells = row('td')
            episode = int(cells[0].text.split('x')[1])
            episode_id = int(cells[1].a['href'][8:-5])
            episode_ids[episode] = episode_id

        if episode_ids:
            logger.debug('Found episode ids %r', episode_ids)
        else:
            logger.warning('No episode ids found')

        return episode_ids

    def query(self, show_id, series, season, episode, year=None):
        # get the episode ids
        episode_ids = self.get_episode_ids(show_id, season)
        if episode not in episode_ids:
            logger.error('Episode %d not found', episode)
            return []

        # get the episode page
        logger.info('Getting the page for episode %d', episode_ids[episode])
        r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows
        subtitles = []
        for row in soup.select('.subtitlen'):
            # read the item
            language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
            subtitle_id = int(row.parent['href'][10:-5])
            page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
            rip = row.find('p', title='rip').text.strip() or None
            release = row.find('h5').text.strip() or None

            subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
                                           release)
            logger.debug('Found subtitle %s', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        # look up the show_id
        titles = [video.series] + video.alternative_series
        show_id = None
        for title in titles:
            show_id = self.search_show_id(title, video.year)
            if show_id is not None:
                break

        # query for subtitles with the show_id
        if show_id is not None:
            subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
                         if s.language in languages and s.episode == video.episode]
            if subtitles:
                return subtitles
        else:
            logger.error('No show id found for %r (%r)', video.series, {'year': video.year})

        return []

    def download_subtitle(self, subtitle):
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + 'download-%d.html' % subtitle.subtitle_id, timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
12
libs/subliminal2.7/refiners/__init__.py
Normal file
@@ -0,0 +1,12 @@
"""
|
||||
Refiners enrich a :class:`~subliminal.video.Video` object by adding information to it.
|
||||
|
||||
A refiner is a simple function:
|
||||
|
||||
.. py:function:: refine(video, **kwargs)
|
||||
|
||||
:param video: the video to refine.
|
||||
:type video: :class:`~subliminal.video.Video`
|
||||
:param \*\*kwargs: additional parameters for refiners.
|
||||
|
||||
"""
99
libs/subliminal2.7/refiners/metadata.py
Normal file
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
import logging
import os

from babelfish import Error as BabelfishError, Language
from enzyme import MKV

logger = logging.getLogger(__name__)


def refine(video, embedded_subtitles=True, **kwargs):
    """Refine a video by searching its metadata.

    Several :class:`~subliminal.video.Video` attributes can be found:

    * :attr:`~subliminal.video.Video.resolution`
    * :attr:`~subliminal.video.Video.video_codec`
    * :attr:`~subliminal.video.Video.audio_codec`
    * :attr:`~subliminal.video.Video.subtitle_languages`

    :param bool embedded_subtitles: search for embedded subtitles.

    """
    # skip non-existing videos
    if not video.exists:
        return

    # check the extension
    extension = os.path.splitext(video.name)[1]
    if extension == '.mkv':
        with open(video.name, 'rb') as f:
            mkv = MKV(f)

        # main video track
        if mkv.video_tracks:
            video_track = mkv.video_tracks[0]

            # resolution
            if video_track.height in (480, 720, 1080):
                if video_track.interlaced:
                    video.resolution = '%di' % video_track.height
                else:
                    video.resolution = '%dp' % video_track.height
                logger.debug('Found resolution %s', video.resolution)

            # video codec
            if video_track.codec_id == 'V_MPEG4/ISO/AVC':
                video.video_codec = 'h264'
                logger.debug('Found video_codec %s', video.video_codec)
            elif video_track.codec_id == 'V_MPEG4/ISO/SP':
                video.video_codec = 'DivX'
                logger.debug('Found video_codec %s', video.video_codec)
            elif video_track.codec_id == 'V_MPEG4/ISO/ASP':
                video.video_codec = 'XviD'
                logger.debug('Found video_codec %s', video.video_codec)
        else:
            logger.warning('MKV has no video track')

        # main audio track
        if mkv.audio_tracks:
            audio_track = mkv.audio_tracks[0]
            # audio codec
            if audio_track.codec_id == 'A_AC3':
                video.audio_codec = 'AC3'
                logger.debug('Found audio_codec %s', video.audio_codec)
            elif audio_track.codec_id == 'A_DTS':
                video.audio_codec = 'DTS'
                logger.debug('Found audio_codec %s', video.audio_codec)
            elif audio_track.codec_id == 'A_AAC':
                video.audio_codec = 'AAC'
                logger.debug('Found audio_codec %s', video.audio_codec)
        else:
            logger.warning('MKV has no audio track')

        # subtitle tracks
        if mkv.subtitle_tracks:
            if embedded_subtitles:
                embedded_subtitle_languages = set()
                for st in mkv.subtitle_tracks:
                    if st.language:
                        try:
                            embedded_subtitle_languages.add(Language.fromalpha3b(st.language))
                        except BabelfishError:
                            logger.error('Embedded subtitle track language %r is not a valid language', st.language)
                            embedded_subtitle_languages.add(Language('und'))
                    elif st.name:
                        try:
                            embedded_subtitle_languages.add(Language.fromname(st.name))
                        except BabelfishError:
                            logger.debug('Embedded subtitle track name %r is not a valid language', st.name)
                            embedded_subtitle_languages.add(Language('und'))
                    else:
                        embedded_subtitle_languages.add(Language('und'))
                logger.debug('Found embedded subtitle %r', embedded_subtitle_languages)
                video.subtitle_languages |= embedded_subtitle_languages
        else:
            logger.debug('MKV has no subtitle track')
    else:
        logger.debug('Unsupported video extension %s', extension)
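
# A hedged usage sketch (the path is invented; refine() only does work for an existing MKV):
#
#     from subliminal import Video
#     video = Video.fromname('/media/Show.S01E01.720p.mkv')
#     refine(video, embedded_subtitles=False)  # fills resolution/codecs, skips subtitle languages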
187
libs/subliminal2.7/refiners/omdb.py
Normal file
@@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
import logging
import operator

import requests

from .. import __short_version__
from ..cache import REFINER_EXPIRATION_TIME, region
from ..video import Episode, Movie
from ..utils import sanitize

logger = logging.getLogger(__name__)


class OMDBClient(object):
    base_url = 'http://www.omdbapi.com'

    def __init__(self, version=1, session=None, headers=None, timeout=10):
        #: Session for the requests
        self.session = session or requests.Session()
        self.session.timeout = timeout
        self.session.headers.update(headers or {})
        self.session.params['r'] = 'json'
        self.session.params['v'] = version

    def get(self, id=None, title=None, type=None, year=None, plot='short', tomatoes=False):
        # build the params
        params = {}
        if id:
            params['i'] = id
        if title:
            params['t'] = title
        if not params:
            raise ValueError('At least id or title is required')
        params['type'] = type
        params['y'] = year
        params['plot'] = plot
        params['tomatoes'] = tomatoes

        # perform the request
        r = self.session.get(self.base_url, params=params)
        r.raise_for_status()

        # get the response as json
        j = r.json()

        # check the response status
        if j['Response'] == 'False':
            return None

        return j

    def search(self, title, type=None, year=None, page=1):
        # build the params
        params = {'s': title, 'type': type, 'y': year, 'page': page}

        # perform the request
        r = self.session.get(self.base_url, params=params)
        r.raise_for_status()

        # get the response as json
        j = r.json()

        # check the response status
        if j['Response'] == 'False':
            return None

        return j


omdb_client = OMDBClient(headers={'User-Agent': 'Subliminal/%s' % __short_version__})


@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def search(title, type, year):
    results = omdb_client.search(title, type, year)
    if not results:
        return None

    # fetch all paginated results
    all_results = results['Search']
    total_results = int(results['totalResults'])
    page = 1
    while total_results > page * 10:
        page += 1
        results = omdb_client.search(title, type, year, page=page)
        all_results.extend(results['Search'])

    return all_results
|
||||
|
||||
|
||||
def refine(video, **kwargs):
|
||||
"""Refine a video by searching `OMDb API <http://omdbapi.com/>`_.
|
||||
|
||||
Several :class:`~subliminal.video.Episode` attributes can be found:
|
||||
|
||||
* :attr:`~subliminal.video.Episode.series`
|
||||
* :attr:`~subliminal.video.Episode.year`
|
||||
* :attr:`~subliminal.video.Episode.series_imdb_id`
|
||||
|
||||
Similarly, for a :class:`~subliminal.video.Movie`:
|
||||
|
||||
* :attr:`~subliminal.video.Movie.title`
|
||||
* :attr:`~subliminal.video.Movie.year`
|
||||
* :attr:`~subliminal.video.Video.imdb_id`
|
||||
|
||||
"""
|
||||
if isinstance(video, Episode):
|
||||
# exit if the information is complete
|
||||
if video.series_imdb_id:
|
||||
logger.debug('No need to search')
|
||||
return
|
||||
|
||||
# search the series
|
||||
results = search(video.series, 'series', video.year)
|
||||
if not results:
|
||||
logger.warning('No results for series')
|
||||
return
|
||||
logger.debug('Found %d results', len(results))
|
||||
|
||||
# filter the results
|
||||
results = [r for r in results if sanitize(r['Title']) == sanitize(video.series)]
|
||||
if not results:
|
||||
logger.warning('No matching series found')
|
||||
return
|
||||
|
||||
# process the results
|
||||
found = False
|
||||
for result in sorted(results, key=operator.itemgetter('Year')):
|
||||
if video.original_series and video.year is None:
|
||||
logger.debug('Found result for original series without year')
|
||||
found = True
|
||||
break
|
||||
if video.year == int(result['Year'].split(u'\u2013')[0]):
|
||||
logger.debug('Found result with matching year')
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
logger.warning('No matching series found')
|
||||
return
|
||||
|
||||
# add series information
|
||||
logger.debug('Found series %r', result)
|
||||
video.series = result['Title']
|
||||
video.year = int(result['Year'].split(u'\u2013')[0])
|
||||
video.series_imdb_id = result['imdbID']
|
||||
|
||||
elif isinstance(video, Movie):
|
||||
# exit if the information is complete
|
||||
if video.imdb_id:
|
||||
return
|
||||
|
||||
# search the movie
|
||||
results = search(video.title, 'movie', video.year)
|
||||
if not results:
|
||||
logger.warning('No results')
|
||||
return
|
||||
logger.debug('Found %d results', len(results))
|
||||
|
||||
# filter the results
|
||||
results = [r for r in results if sanitize(r['Title']) == sanitize(video.title)]
|
||||
if not results:
|
||||
logger.warning('No matching movie found')
|
||||
return
|
||||
|
||||
# process the results
|
||||
found = False
|
||||
for result in results:
|
||||
if video.year is None:
|
||||
logger.debug('Found result for movie without year')
|
||||
found = True
|
||||
break
|
||||
if video.year == int(result['Year']):
|
||||
logger.debug('Found result with matching year')
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
logger.warning('No matching movie found')
|
||||
return
|
||||
|
||||
# add movie information
|
||||
logger.debug('Found movie %r', result)
|
||||
video.title = result['Title']
|
||||
video.year = int(result['Year'].split(u'\u2013')[0])
|
||||
video.imdb_id = result['imdbID']
|
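Editorial note: refine() mutates the video in place and returns nothing; candidates are filtered by sanitized title and matched on year before any attribute is written. A minimal sketch, assuming network access to omdbapi.com (the service now also requires an API key, which this vendored client predates; illustrative, not part of the commit):

# Editorial usage sketch; requires network access to omdbapi.com.
from subliminal.refiners.omdb import refine
from subliminal.video import Movie

movie = Movie('Man.of.Steel.2013.720p.BluRay.x264.mkv', 'Man of Steel', year=2013)
refine(movie)
print(movie.imdb_id)  # e.g. 'tt0770828' if OMDb returns a match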
351
libs/subliminal2.7/refiners/tvdb.py
Normal file
@@ -0,0 +1,351 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from functools import wraps
import logging
import re
import _strptime
import requests

from .. import __short_version__
from ..cache import REFINER_EXPIRATION_TIME, region
from ..utils import sanitize
from ..video import Episode

logger = logging.getLogger(__name__)

series_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')


def requires_auth(func):
    """Decorator for :class:`TVDBClient` methods that require authentication"""
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        if self.token is None or self.token_expired:
            self.login()
        elif self.token_needs_refresh:
            self.refresh_token()
        return func(self, *args, **kwargs)
    return wrapper


class TVDBClient(object):
    """TVDB REST API Client

    :param str apikey: API key to use.
    :param str username: username to use.
    :param str password: password to use.
    :param str language: language of the responses.
    :param session: session object to use.
    :type session: :class:`requests.sessions.Session` or compatible.
    :param dict headers: additional headers.
    :param int timeout: timeout for the requests.

    """
    #: Base URL of the API
    base_url = 'https://api.thetvdb.com'

    #: Token lifespan
    token_lifespan = timedelta(hours=1)

    #: Minimum token age before a :meth:`refresh_token` is triggered
    refresh_token_every = timedelta(minutes=30)

    def __init__(self, apikey=None, username=None, password=None, language='en', session=None, headers=None,
                 timeout=10):
        #: API key
        self.apikey = apikey

        #: Username
        self.username = username

        #: Password
        self.password = password

        #: Last token acquisition date
        self.token_date = datetime.utcnow() - self.token_lifespan

        #: Session for the requests
        self.session = session or requests.Session()
        self.session.timeout = timeout
        self.session.headers.update(headers or {})
        self.session.headers['Content-Type'] = 'application/json'
        self.session.headers['Accept-Language'] = language

    @property
    def language(self):
        return self.session.headers['Accept-Language']

    @language.setter
    def language(self, value):
        self.session.headers['Accept-Language'] = value

    @property
    def token(self):
        if 'Authorization' not in self.session.headers:
            return None
        return self.session.headers['Authorization'][7:]

    @property
    def token_expired(self):
        return datetime.utcnow() - self.token_date > self.token_lifespan

    @property
    def token_needs_refresh(self):
        return datetime.utcnow() - self.token_date > self.refresh_token_every

    def login(self):
        """Login"""
        # perform the request
        data = {'apikey': self.apikey, 'username': self.username, 'password': self.password}
        r = self.session.post(self.base_url + '/login', json=data)
        r.raise_for_status()

        # set the Authorization header
        self.session.headers['Authorization'] = 'Bearer ' + r.json()['token']

        # update token_date
        self.token_date = datetime.utcnow()

    def refresh_token(self):
        """Refresh token"""
        # perform the request
        r = self.session.get(self.base_url + '/refresh_token')
        r.raise_for_status()

        # set the Authorization header
        self.session.headers['Authorization'] = 'Bearer ' + r.json()['token']

        # update token_date
        self.token_date = datetime.utcnow()

    @requires_auth
    def search_series(self, name=None, imdb_id=None, zap2it_id=None):
        """Search series"""
        # perform the request
        params = {'name': name, 'imdbId': imdb_id, 'zap2itId': zap2it_id}
        r = self.session.get(self.base_url + '/search/series', params=params)
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()['data']

    @requires_auth
    def get_series(self, id):
        """Get series"""
        # perform the request
        r = self.session.get(self.base_url + '/series/{}'.format(id))
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()['data']

    @requires_auth
    def get_series_actors(self, id):
        """Get series actors"""
        # perform the request
        r = self.session.get(self.base_url + '/series/{}/actors'.format(id))
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()['data']

    @requires_auth
    def get_series_episodes(self, id, page=1):
        """Get series episodes"""
        # perform the request
        params = {'page': page}
        r = self.session.get(self.base_url + '/series/{}/episodes'.format(id), params=params)
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()

    @requires_auth
    def query_series_episodes(self, id, absolute_number=None, aired_season=None, aired_episode=None, dvd_season=None,
                              dvd_episode=None, imdb_id=None, page=1):
        """Query series episodes"""
        # perform the request
        params = {'absoluteNumber': absolute_number, 'airedSeason': aired_season, 'airedEpisode': aired_episode,
                  'dvdSeason': dvd_season, 'dvdEpisode': dvd_episode, 'imdbId': imdb_id, 'page': page}
        r = self.session.get(self.base_url + '/series/{}/episodes/query'.format(id), params=params)
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()

    @requires_auth
    def get_episode(self, id):
        """Get episode"""
        # perform the request
        r = self.session.get(self.base_url + '/episodes/{}'.format(id))
        if r.status_code == 404:
            return None
        r.raise_for_status()

        return r.json()['data']


#: Configured instance of :class:`TVDBClient`
tvdb_client = TVDBClient('5EC930FB90DA1ADA', headers={'User-Agent': 'Subliminal/%s' % __short_version__})


@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def search_series(name):
    """Search series.

    :param str name: name of the series.
    :return: the search results.
    :rtype: list

    """
    return tvdb_client.search_series(name)


@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def get_series(id):
    """Get series.

    :param int id: id of the series.
    :return: the series data.
    :rtype: dict

    """
    return tvdb_client.get_series(id)


@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def get_series_episode(series_id, season, episode):
    """Get an episode of a series.

    :param int series_id: id of the series.
    :param int season: season number of the episode.
    :param int episode: episode number of the episode.
    :return: the episode data.
    :rtype: dict

    """
    result = tvdb_client.query_series_episodes(series_id, aired_season=season, aired_episode=episode)
    if result:
        return tvdb_client.get_episode(result['data'][0]['id'])


def refine(video, **kwargs):
    """Refine a video by searching `TheTVDB <http://thetvdb.com/>`_.

    .. note::

        This refiner only works for instances of :class:`~subliminal.video.Episode`.

    Several attributes can be found:

    * :attr:`~subliminal.video.Episode.series`
    * :attr:`~subliminal.video.Episode.year`
    * :attr:`~subliminal.video.Episode.series_imdb_id`
    * :attr:`~subliminal.video.Episode.series_tvdb_id`
    * :attr:`~subliminal.video.Episode.title`
    * :attr:`~subliminal.video.Video.imdb_id`
    * :attr:`~subliminal.video.Episode.tvdb_id`

    """
    # only deal with Episode videos
    if not isinstance(video, Episode):
        logger.error('Cannot refine non-episode videos')
        return

    # exit if the information is complete
    if video.series_tvdb_id and video.tvdb_id:
        logger.debug('No need to search')
        return

    # search the series
    logger.info('Searching series %r', video.series)
    results = search_series(video.series.lower())
    if not results:
        logger.warning('No results for series')
        return
    logger.debug('Found %d results', len(results))

    # search for exact matches
    matching_results = []
    for result in results:
        matching_result = {}

        # use seriesName and aliases
        series_names = [result['seriesName']]
        series_names.extend(result['aliases'])

        # parse the original series as series + year or country
        original_match = series_re.match(result['seriesName']).groupdict()

        # parse series year
        series_year = None
        if result['firstAired']:
            series_year = datetime.strptime(result['firstAired'], '%Y-%m-%d').year

        # discard mismatches on year
        if video.year and series_year and video.year != series_year:
            logger.debug('Discarding series %r mismatch on year %d', result['seriesName'], series_year)
            continue

        # iterate over series names
        for series_name in series_names:
            # parse as series and year
            series, year, country = series_re.match(series_name).groups()
            if year:
                year = int(year)

            # discard mismatches on year
            if year and (video.original_series or video.year != year):
                logger.debug('Discarding series name %r mismatch on year %d', series, year)
                continue

            # match on sanitized series name
            if sanitize(series) == sanitize(video.series):
                logger.debug('Found exact match on series %r', series_name)
                matching_result['match'] = {'series': original_match['series'], 'year': series_year,
                                            'original_series': original_match['year'] is None}
                break

        # add the result on match
        if matching_result:
            matching_result['data'] = result
            matching_results.append(matching_result)

    # exit if we don't have exactly 1 matching result
    if not matching_results:
        logger.error('No matching series found')
        return
    if len(matching_results) > 1:
        logger.error('Multiple matches found')
        return

    # get the series
    matching_result = matching_results[0]
    series = get_series(matching_result['data']['id'])

    # add series information
    logger.debug('Found series %r', series)
    video.series = matching_result['match']['series']
    video.alternative_series.extend(series['aliases'])
    video.year = matching_result['match']['year']
    video.original_series = matching_result['match']['original_series']
    video.series_tvdb_id = series['id']
    video.series_imdb_id = series['imdbId'] or None

    # get the episode
    logger.info('Getting series episode %dx%d', video.season, video.episode)
    episode = get_series_episode(video.series_tvdb_id, video.season, video.episode)
    if not episode:
        logger.warning('No results for episode')
        return

    # add episode information
    logger.debug('Found episode %r', episode)
    video.tvdb_id = episode['id']
    video.title = episode['episodeName'] or None
    video.imdb_id = episode['imdbId'] or None
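Editorial note: @requires_auth logs in transparently on first use and refreshes the JWT once it is older than 30 minutes, so callers never hit /login directly. A minimal sketch of the refiner, assuming the legacy api.thetvdb.com endpoint is still reachable with the bundled API key (illustrative, not part of the commit):

# Editorial usage sketch; requires network access to api.thetvdb.com.
from subliminal.refiners.tvdb import refine
from subliminal.video import Episode

ep = Episode('Breaking.Bad.S01E01.720p.HDTV.x264.mkv', 'Breaking Bad', 1, 1)
refine(ep)  # first call triggers tvdb_client.login() via @requires_auth
print(ep.series_tvdb_id, ep.tvdb_id, ep.title)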
234
libs/subliminal2.7/score.py
Normal file
@@ -0,0 +1,234 @@
# -*- coding: utf-8 -*-
"""
This module provides the default implementation of the `compute_score` parameter in
:meth:`~subliminal.core.ProviderPool.download_best_subtitles` and :func:`~subliminal.core.download_best_subtitles`.

.. note::

    To avoid an unnecessary dependency on `sympy <http://www.sympy.org/>`_ and to keep subliminal's import time low,
    the resulting scores are hardcoded here and updated manually when the set of equations changes.

Available matches:

  * hash
  * title
  * year
  * series
  * season
  * episode
  * release_group
  * format
  * audio_codec
  * resolution
  * hearing_impaired
  * video_codec
  * series_imdb_id
  * imdb_id
  * tvdb_id

"""
from __future__ import division, print_function
import logging

from .video import Episode, Movie

logger = logging.getLogger(__name__)


#: Scores for episodes
episode_scores = {'hash': 359, 'series': 180, 'year': 90, 'season': 30, 'episode': 30, 'release_group': 15,
                  'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}

#: Scores for movies
movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
                'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}

#: Equivalent release groups
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})


def get_equivalent_release_groups(release_group):
    """Get all the equivalents of the given release group.

    :param str release_group: the release group to get the equivalents of.
    :return: the equivalent release groups.
    :rtype: set

    """
    for equivalent_release_group in equivalent_release_groups:
        if release_group in equivalent_release_group:
            return equivalent_release_group

    return {release_group}


def get_scores(video):
    """Get the scores dict for the given `video`.

    This will return either :data:`episode_scores` or :data:`movie_scores` based on the type of the `video`.

    :param video: the video to compute the score against.
    :type video: :class:`~subliminal.video.Video`
    :return: the scores dict.
    :rtype: dict

    """
    if isinstance(video, Episode):
        return episode_scores
    elif isinstance(video, Movie):
        return movie_scores

    raise ValueError('video must be an instance of Episode or Movie')


def compute_score(subtitle, video, hearing_impaired=None):
    """Compute the score of the `subtitle` against the `video` with `hearing_impaired` preference.

    :func:`compute_score` uses the :meth:`Subtitle.get_matches <subliminal.subtitle.Subtitle.get_matches>` method and
    applies the scores (either from :data:`episode_scores` or :data:`movie_scores`) after some processing.

    :param subtitle: the subtitle to compute the score of.
    :type subtitle: :class:`~subliminal.subtitle.Subtitle`
    :param video: the video to compute the score against.
    :type video: :class:`~subliminal.video.Video`
    :param bool hearing_impaired: hearing impaired preference.
    :return: score of the subtitle.
    :rtype: int

    """
    logger.info('Computing score of %r for video %r with %r', subtitle, video, dict(hearing_impaired=hearing_impaired))

    # get the scores dict
    scores = get_scores(video)
    logger.debug('Using scores %r', scores)

    # get the matches
    matches = subtitle.get_matches(video)
    logger.debug('Found matches %r', matches)

    # on hash match, discard everything else
    if 'hash' in matches:
        logger.debug('Keeping only hash match')
        matches &= {'hash'}

    # handle equivalent matches
    if isinstance(video, Episode):
        if 'title' in matches:
            logger.debug('Adding title match equivalent')
            matches.add('episode')
        if 'series_imdb_id' in matches:
            logger.debug('Adding series_imdb_id match equivalent')
            matches |= {'series', 'year'}
        if 'imdb_id' in matches:
            logger.debug('Adding imdb_id match equivalents')
            matches |= {'series', 'year', 'season', 'episode'}
        if 'tvdb_id' in matches:
            logger.debug('Adding tvdb_id match equivalents')
            matches |= {'series', 'year', 'season', 'episode'}
        if 'series_tvdb_id' in matches:
            logger.debug('Adding series_tvdb_id match equivalents')
            matches |= {'series', 'year'}
    elif isinstance(video, Movie):
        if 'imdb_id' in matches:
            logger.debug('Adding imdb_id match equivalents')
            matches |= {'title', 'year'}

    # handle hearing impaired
    if hearing_impaired is not None and subtitle.hearing_impaired == hearing_impaired:
        logger.debug('Matched hearing_impaired')
        matches.add('hearing_impaired')

    # compute the score
    score = sum(scores.get(match, 0) for match in matches)
    logger.info('Computed score %r with final matches %r', score, matches)

    # ensure score is within valid bounds
    assert 0 <= score <= scores['hash'] + scores['hearing_impaired']

    return score


def solve_episode_equations():
    from sympy import Eq, solve, symbols

    hash, series, year, season, episode, release_group = symbols('hash series year season episode release_group')
    format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec')
    hearing_impaired = symbols('hearing_impaired')

    equations = [
        # hash is best
        Eq(hash, series + year + season + episode + release_group + format + audio_codec + resolution + video_codec),

        # series counts for the most part in the total score
        Eq(series, year + season + episode + release_group + format + audio_codec + resolution + video_codec + 1),

        # year is the second most important part
        Eq(year, season + episode + release_group + format + audio_codec + resolution + video_codec + 1),

        # season is important too
        Eq(season, release_group + format + audio_codec + resolution + video_codec + 1),

        # episode is equally important to season
        Eq(episode, season),

        # release group is the next most wanted match
        Eq(release_group, format + audio_codec + resolution + video_codec + 1),

        # format counts as much as audio_codec, resolution and video_codec together
        Eq(format, audio_codec + resolution + video_codec),

        # audio_codec is more valuable than video_codec
        Eq(audio_codec, video_codec + 1),

        # resolution counts as much as video_codec
        Eq(resolution, video_codec),

        # video_codec is the least valuable match but counts more than the sum of all score-increasing matches
        Eq(video_codec, hearing_impaired + 1),

        # hearing impaired is only used for score increasing, so put it to 1
        Eq(hearing_impaired, 1),
    ]

    return solve(equations, [hash, series, year, season, episode, release_group, format, audio_codec, resolution,
                             hearing_impaired, video_codec])


def solve_movie_equations():
    from sympy import Eq, solve, symbols

    hash, title, year, release_group = symbols('hash title year release_group')
    format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec')
    hearing_impaired = symbols('hearing_impaired')

    equations = [
        # hash is best
        Eq(hash, title + year + release_group + format + audio_codec + resolution + video_codec),

        # title counts for the most part in the total score
        Eq(title, year + release_group + format + audio_codec + resolution + video_codec + 1),

        # year is the second most important part
        Eq(year, release_group + format + audio_codec + resolution + video_codec + 1),

        # release group is the next most wanted match
        Eq(release_group, format + audio_codec + resolution + video_codec + 1),

        # format counts as much as audio_codec, resolution and video_codec together
        Eq(format, audio_codec + resolution + video_codec),

        # audio_codec is more valuable than video_codec
        Eq(audio_codec, video_codec + 1),

        # resolution counts as much as video_codec
        Eq(resolution, video_codec),

        # video_codec is the least valuable match but counts more than the sum of all score-increasing matches
        Eq(video_codec, hearing_impaired + 1),

        # hearing impaired is only used for score increasing, so put it to 1
        Eq(hearing_impaired, 1),
    ]

    return solve(equations, [hash, title, year, release_group, format, audio_codec, resolution, hearing_impaired,
                             video_codec])
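Editorial note on the weights: the hardcoded scores are the solution of the equations above, so a single hash match (359 for episodes, 119 for movies) always outranks any combination of the other matches. For example, an episode subtitle matching series, year, season and episode scores 180 + 90 + 30 + 30 = 330, still below the 359 of a hash match. A minimal sketch with a stub subtitle (FakeSubtitle is a stand-in for a provider subtitle, not part of the commit):

# Editorial sketch; FakeSubtitle is a stand-in, not part of the committed diff.
from babelfish import Language
from subliminal.score import compute_score
from subliminal.subtitle import Subtitle
from subliminal.video import Episode

class FakeSubtitle(Subtitle):
    provider_name = 'fake'
    id = 'fake-1'  # shadows the abstract `id` property

    def get_matches(self, video):
        return {'series', 'year', 'season', 'episode'}

ep = Episode('Breaking.Bad.S01E01.mkv', 'Breaking Bad', 1, 1, year=2008)
print(compute_score(FakeSubtitle(Language('eng')), ep))  # 330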
261
libs/subliminal2.7/subtitle.py
Normal file
@@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-
import codecs
import logging
import os

import chardet
import pysrt

from .score import get_equivalent_release_groups
from .video import Episode, Movie
from .utils import sanitize, sanitize_release_group


logger = logging.getLogger(__name__)

#: Subtitle extensions
SUBTITLE_EXTENSIONS = ('.srt', '.sub', '.smi', '.txt', '.ssa', '.ass', '.mpl')


class Subtitle(object):
    """Base class for subtitle.

    :param language: language of the subtitle.
    :type language: :class:`~babelfish.language.Language`
    :param bool hearing_impaired: whether or not the subtitle is hearing impaired.
    :param page_link: URL of the web page from which the subtitle can be downloaded.
    :type page_link: str
    :param encoding: Text encoding of the subtitle.
    :type encoding: str

    """
    #: Name of the provider that returns that class of subtitle
    provider_name = ''

    def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None):
        #: Language of the subtitle
        self.language = language

        #: Whether or not the subtitle is hearing impaired
        self.hearing_impaired = hearing_impaired

        #: URL of the web page from which the subtitle can be downloaded
        self.page_link = page_link

        #: Content as bytes
        self.content = None

        #: Encoding to decode with when accessing :attr:`text`
        self.encoding = None

        # validate the encoding
        if encoding:
            try:
                self.encoding = codecs.lookup(encoding).name
            except (TypeError, LookupError):
                logger.debug('Unsupported encoding %s', encoding)

    @property
    def id(self):
        """Unique identifier of the subtitle"""
        raise NotImplementedError

    @property
    def text(self):
        """Content as string

        If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding`

        """
        if not self.content:
            return

        if self.encoding:
            return self.content.decode(self.encoding, errors='replace')

        return self.content.decode(self.guess_encoding(), errors='replace')

    def is_valid(self):
        """Check whether :attr:`text` is a valid SubRip format.

        :return: whether or not the subtitle is valid.
        :rtype: bool

        """
        if not self.text:
            return False

        try:
            pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
        except pysrt.Error as e:
            if e.args[0] < 80:
                return False

        return True

    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        logger.info('Guessing encoding for language %s', self.language)

        # always try utf-8 first
        encodings = ['utf-8']

        # add language-specific encodings
        if self.language.alpha3 == 'zho':
            encodings.extend(['gb18030', 'big5'])
        elif self.language.alpha3 == 'jpn':
            encodings.append('shift-jis')
        elif self.language.alpha3 == 'ara':
            encodings.append('windows-1256')
        elif self.language.alpha3 == 'heb':
            encodings.append('windows-1255')
        elif self.language.alpha3 == 'tur':
            encodings.extend(['iso-8859-9', 'windows-1254'])
        elif self.language.alpha3 == 'pol':
            # Eastern European Group 1
            encodings.extend(['windows-1250'])
        elif self.language.alpha3 == 'bul':
            # Eastern European Group 2
            encodings.extend(['windows-1251'])
        else:
            # Western European (windows-1252)
            encodings.append('latin-1')

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)
            except UnicodeDecodeError:
                pass
            else:
                logger.info('Guessed encoding %s', encoding)
                return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        return encoding

    def get_matches(self, video):
        """Get the matches against the `video`.

        :param video: the video to get the matches with.
        :type video: :class:`~subliminal.video.Video`
        :return: matches of the subtitle.
        :rtype: set

        """
        raise NotImplementedError

    def __hash__(self):
        return hash(self.provider_name + '-' + self.id)

    def __repr__(self):
        return '<%s %r [%s]>' % (self.__class__.__name__, self.id, self.language)


def get_subtitle_path(video_path, language=None, extension='.srt'):
    """Get the subtitle path using the `video_path` and `language`.

    :param str video_path: path to the video.
    :param language: language of the subtitle to put in the path.
    :type language: :class:`~babelfish.language.Language`
    :param str extension: extension of the subtitle.
    :return: path of the subtitle.
    :rtype: str

    """
    subtitle_root = os.path.splitext(video_path)[0]

    if language:
        subtitle_root += '.' + str(language)

    return subtitle_root + extension


def guess_matches(video, guess, partial=False):
    """Get matches between a `video` and a `guess`.

    If a guess is `partial`, the absence of information won't be counted as a match.

    :param video: the video.
    :type video: :class:`~subliminal.video.Video`
    :param guess: the guess.
    :type guess: dict
    :param bool partial: whether or not the guess is partial.
    :return: matches between the `video` and the `guess`.
    :rtype: set

    """
    matches = set()
    if isinstance(video, Episode):
        # series
        if video.series and 'title' in guess and sanitize(guess['title']) == sanitize(video.series):
            matches.add('series')
        # title
        if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title):
            matches.add('title')
        # season
        if video.season and 'season' in guess and guess['season'] == video.season:
            matches.add('season')
        # episode
        # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
        # Most providers only support single-ep, so make sure it contains only 1 episode
        # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
        if video.episode and 'episode' in guess:
            episode_guess = guess['episode']
            episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
            if episode == video.episode:
                matches.add('episode')
        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')
        # count "no year" as an information
        if not partial and video.original_series and 'year' not in guess:
            matches.add('year')
    elif isinstance(video, Movie):
        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')
        # title
        if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title):
            matches.add('title')
    # release_group
    if (video.release_group and 'release_group' in guess and
            sanitize_release_group(guess['release_group']) in
            get_equivalent_release_groups(sanitize_release_group(video.release_group))):
        matches.add('release_group')
    # resolution
    if video.resolution and 'screen_size' in guess and guess['screen_size'] == video.resolution:
        matches.add('resolution')
    # format
    if video.format and 'format' in guess and guess['format'].lower() == video.format.lower():
        matches.add('format')
    # video_codec
    if video.video_codec and 'video_codec' in guess and guess['video_codec'] == video.video_codec:
        matches.add('video_codec')
    # audio_codec
    if video.audio_codec and 'audio_codec' in guess and guess['audio_codec'] == video.audio_codec:
        matches.add('audio_codec')

    return matches


def fix_line_ending(content):
    """Fix line ending of `content` by changing it to \\n.

    :param bytes content: content of the subtitle.
    :return: the content with fixed line endings.
    :rtype: bytes

    """
    return content.replace(b'\r\n', b'\n')
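Editorial note: get_subtitle_path() does pure path arithmetic, and guess_matches() is what providers feed into compute_score(). A quick sketch (illustrative, not part of the commit):

# Editorial sketch; not part of the committed diff.
from babelfish import Language
from subliminal.subtitle import get_subtitle_path

print(get_subtitle_path('/media/Man of Steel (2013).mkv', Language('eng')))
# -> '/media/Man of Steel (2013).en.srt'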
152
libs/subliminal2.7/utils.py
Normal file
@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import hashlib
import os
import re
import struct


def hash_opensubtitles(video_path):
    """Compute a hash using OpenSubtitles' algorithm.

    :param str video_path: path of the video.
    :return: the hash.
    :rtype: str

    """
    bytesize = struct.calcsize(b'<q')
    with open(video_path, 'rb') as f:
        filesize = os.path.getsize(video_path)
        filehash = filesize
        if filesize < 65536 * 2:
            return
        for _ in range(65536 // bytesize):
            filebuffer = f.read(bytesize)
            (l_value,) = struct.unpack(b'<q', filebuffer)
            filehash += l_value
            filehash &= 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number
        f.seek(max(0, filesize - 65536), 0)
        for _ in range(65536 // bytesize):
            filebuffer = f.read(bytesize)
            (l_value,) = struct.unpack(b'<q', filebuffer)
            filehash += l_value
            filehash &= 0xFFFFFFFFFFFFFFFF
        returnedhash = '%016x' % filehash

    return returnedhash


def hash_thesubdb(video_path):
    """Compute a hash using TheSubDB's algorithm.

    :param str video_path: path of the video.
    :return: the hash.
    :rtype: str

    """
    readsize = 64 * 1024
    if os.path.getsize(video_path) < readsize:
        return
    with open(video_path, 'rb') as f:
        data = f.read(readsize)
        f.seek(-readsize, os.SEEK_END)
        data += f.read(readsize)

    return hashlib.md5(data).hexdigest()


def hash_napiprojekt(video_path):
    """Compute a hash using NapiProjekt's algorithm.

    :param str video_path: path of the video.
    :return: the hash.
    :rtype: str

    """
    readsize = 1024 * 1024 * 10
    with open(video_path, 'rb') as f:
        data = f.read(readsize)
    return hashlib.md5(data).hexdigest()


def hash_shooter(video_path):
    """Compute a hash using Shooter's algorithm.

    :param str video_path: path of the video.
    :return: the hash.
    :rtype: str

    """
    filesize = os.path.getsize(video_path)
    readsize = 4096
    if os.path.getsize(video_path) < readsize * 2:
        return None
    offsets = (readsize, filesize // 3 * 2, filesize // 3, filesize - readsize * 2)
    filehash = []
    with open(video_path, 'rb') as f:
        for offset in offsets:
            f.seek(offset)
            filehash.append(hashlib.md5(f.read(readsize)).hexdigest())
    return ';'.join(filehash)


def sanitize(string, ignore_characters=None):
    """Sanitize a string to strip special characters.

    :param str string: the string to sanitize.
    :param set ignore_characters: characters to ignore.
    :return: the sanitized string.
    :rtype: str

    """
    # only deal with strings
    if string is None:
        return

    ignore_characters = ignore_characters or set()

    # replace some characters with one space
    characters = {'-', ':', '(', ')', '.'} - ignore_characters
    if characters:
        string = re.sub(r'[%s]' % re.escape(''.join(characters)), ' ', string)

    # remove some characters
    characters = {'\''} - ignore_characters
    if characters:
        string = re.sub(r'[%s]' % re.escape(''.join(characters)), '', string)

    # replace multiple spaces with one
    string = re.sub(r'\s+', ' ', string)

    # strip and lower case
    return string.strip().lower()


def sanitize_release_group(string):
    """Sanitize a `release_group` string to remove content in square brackets.

    :param str string: the release group to sanitize.
    :return: the sanitized release group.
    :rtype: str

    """
    # only deal with strings
    if string is None:
        return

    # remove content in square brackets
    string = re.sub(r'\[\w+\]', '', string)

    # strip and upper case
    return string.strip().upper()


def timestamp(date):
    """Get the timestamp of the `date`, python2/3 compatible.

    :param datetime.datetime date: the UTC date.
    :return: the timestamp of the date.
    :rtype: float

    """
    return (date - datetime(1970, 1, 1)).total_seconds()
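Editorial note: sanitize() and sanitize_release_group() are the normalizers used throughout the matching code above, while the hash_* functions each implement one provider's fingerprint over fixed-size chunks of the file. A quick sketch (illustrative, not part of the commit):

# Editorial sketch; not part of the committed diff.
from subliminal.utils import sanitize, sanitize_release_group

print(sanitize("Marvel's Agents of S.H.I.E.L.D."))    # 'marvels agents of s h i e l d'
print(sanitize_release_group('x264-KILLERS[rarbg]'))  # 'X264-KILLERS'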
239
libs/subliminal2.7/video.py
Normal file
@@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-
from __future__ import division
from datetime import datetime, timedelta
import logging
import os

from guessit import guessit

logger = logging.getLogger(__name__)

#: Video extensions
VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.asx', '.avchd', '.avi', '.bik',
                    '.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
                    '.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
                    '.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
                    '.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
                    '.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
                    '.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')


class Video(object):
    """Base class for videos.

    Represent a video, existing or not.

    :param str name: name or path of the video.
    :param str format: format of the video (HDTV, WEB-DL, BluRay, ...).
    :param str release_group: release group of the video.
    :param str resolution: resolution of the video stream (480p, 720p, 1080p or 1080i).
    :param str video_codec: codec of the video stream.
    :param str audio_codec: codec of the main audio stream.
    :param str imdb_id: IMDb id of the video.
    :param dict hashes: hashes of the video file by provider names.
    :param int size: size of the video file in bytes.
    :param set subtitle_languages: existing subtitle languages.

    """
    def __init__(self, name, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None,
                 imdb_id=None, hashes=None, size=None, subtitle_languages=None):
        #: Name or path of the video
        self.name = name

        #: Format of the video (HDTV, WEB-DL, BluRay, ...)
        self.format = format

        #: Release group of the video
        self.release_group = release_group

        #: Resolution of the video stream (480p, 720p, 1080p or 1080i)
        self.resolution = resolution

        #: Codec of the video stream
        self.video_codec = video_codec

        #: Codec of the main audio stream
        self.audio_codec = audio_codec

        #: IMDb id of the video
        self.imdb_id = imdb_id

        #: Hashes of the video file by provider names
        self.hashes = hashes or {}

        #: Size of the video file in bytes
        self.size = size

        #: Existing subtitle languages
        self.subtitle_languages = subtitle_languages or set()

    @property
    def exists(self):
        """Test whether the video exists"""
        return os.path.exists(self.name)

    @property
    def age(self):
        """Age of the video"""
        if self.exists:
            return datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(self.name))

        return timedelta()

    @classmethod
    def fromguess(cls, name, guess):
        """Create an :class:`Episode` or a :class:`Movie` with the given `name` based on the `guess`.

        :param str name: name of the video.
        :param dict guess: guessed data.
        :raise: :class:`ValueError` if the `type` of the `guess` is invalid

        """
        if guess['type'] == 'episode':
            return Episode.fromguess(name, guess)

        if guess['type'] == 'movie':
            return Movie.fromguess(name, guess)

        raise ValueError('The guess must be an episode or a movie guess')

    @classmethod
    def fromname(cls, name):
        """Shortcut for :meth:`fromguess` with a `guess` guessed from the `name`.

        :param str name: name of the video.

        """
        return cls.fromguess(name, guessit(name))

    def __repr__(self):
        return '<%s [%r]>' % (self.__class__.__name__, self.name)

    def __hash__(self):
        return hash(self.name)


class Episode(Video):
    """Episode :class:`Video`.

    :param str series: series of the episode.
    :param int season: season number of the episode.
    :param int episode: episode number of the episode.
    :param str title: title of the episode.
    :param int year: year of the series.
    :param bool original_series: whether the series is the first with this name.
    :param int tvdb_id: TVDB id of the episode.
    :param list alternative_series: alternative names of the series.
    :param \*\*kwargs: additional parameters for the :class:`Video` constructor.

    """
    def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
                 series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
        super(Episode, self).__init__(name, **kwargs)

        #: Series of the episode
        self.series = series

        #: Season number of the episode
        self.season = season

        #: Episode number of the episode
        self.episode = episode

        #: Title of the episode
        self.title = title

        #: Year of series
        self.year = year

        #: The series is the first with this name
        self.original_series = original_series

        #: TVDB id of the episode
        self.tvdb_id = tvdb_id

        #: TVDB id of the series
        self.series_tvdb_id = series_tvdb_id

        #: IMDb id of the series
        self.series_imdb_id = series_imdb_id

        #: Alternative names of the series
        self.alternative_series = alternative_series or []

    @classmethod
    def fromguess(cls, name, guess):
        if guess['type'] != 'episode':
            raise ValueError('The guess must be an episode guess')

        if 'title' not in guess or 'episode' not in guess:
            raise ValueError('Insufficient data to process the guess')

        # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
        # Most providers only support single-ep, so make sure it contains only 1 episode
        # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
        episode_guess = guess.get('episode')
        episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess

        return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
                   year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
                   release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
                   video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))

    @classmethod
    def fromname(cls, name):
        return cls.fromguess(name, guessit(name, {'type': 'episode'}))

    def __repr__(self):
        if self.year is None:
            return '<%s [%r, %dx%d]>' % (self.__class__.__name__, self.series, self.season, self.episode)

        return '<%s [%r, %d, %dx%d]>' % (self.__class__.__name__, self.series, self.year, self.season, self.episode)


class Movie(Video):
    """Movie :class:`Video`.

    :param str title: title of the movie.
    :param int year: year of the movie.
    :param list alternative_titles: alternative titles of the movie.
    :param \*\*kwargs: additional parameters for the :class:`Video` constructor.

    """
    def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
        super(Movie, self).__init__(name, **kwargs)

        #: Title of the movie
        self.title = title

        #: Year of the movie
        self.year = year

        #: Alternative titles of the movie
        self.alternative_titles = alternative_titles or []

    @classmethod
    def fromguess(cls, name, guess):
        if guess['type'] != 'movie':
            raise ValueError('The guess must be a movie guess')

        if 'title' not in guess:
            raise ValueError('Insufficient data to process the guess')

        alternative_titles = []
        if 'alternative_title' in guess:
            alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))

        return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
                   resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
                   audio_codec=guess.get('audio_codec'), year=guess.get('year'),
                   alternative_titles=alternative_titles)

    @classmethod
    def fromname(cls, name):
        return cls.fromguess(name, guessit(name, {'type': 'movie'}))

    def __repr__(self):
        if self.year is None:
            return '<%s [%r]>' % (self.__class__.__name__, self.title)

        return '<%s [%r, %d]>' % (self.__class__.__name__, self.title, self.year)
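Editorial note: Video.fromname() delegates to guessit and then dispatches to Episode.fromguess or Movie.fromguess on the guessed type. A quick sketch (illustrative; the exact guess depends on the installed guessit version):

# Editorial sketch; not part of the committed diff.
from subliminal.video import Video

video = Video.fromname('Game.of.Thrones.S03E10.720p.WEB-DL.DD5.1.H.264-NTb.mkv')
print(video)  # e.g. <Episode ['Game of Thrones', 3x10]>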
402
libs/yaml/__init__.py
Normal file
@@ -0,0 +1,402 @@

from .error import *

from .tokens import *
from .events import *
from .nodes import *

from .loader import *
from .dumper import *

__version__ = '5.1.2'
try:
    from .cyaml import *
    __with_libyaml__ = True
except ImportError:
    __with_libyaml__ = False

import io

#------------------------------------------------------------------------------
# Warnings control
#------------------------------------------------------------------------------

# 'Global' warnings state:
_warnings_enabled = {
    'YAMLLoadWarning': True,
}

# Get or set global warnings' state
def warnings(settings=None):
    if settings is None:
        return _warnings_enabled

    if type(settings) is dict:
        for key in settings:
            if key in _warnings_enabled:
                _warnings_enabled[key] = settings[key]

# Warn when load() is called without Loader=...
class YAMLLoadWarning(RuntimeWarning):
    pass

def load_warning(method):
    if _warnings_enabled['YAMLLoadWarning'] is False:
        return

    import warnings

    message = (
        "calling yaml.%s() without Loader=... is deprecated, as the "
        "default Loader is unsafe. Please read "
        "https://msg.pyyaml.org/load for full details."
    ) % method

    warnings.warn(message, YAMLLoadWarning, stacklevel=3)

#------------------------------------------------------------------------------
def scan(stream, Loader=Loader):
    """
    Scan a YAML stream and produce scanning tokens.
    """
    loader = Loader(stream)
    try:
        while loader.check_token():
            yield loader.get_token()
    finally:
        loader.dispose()

def parse(stream, Loader=Loader):
    """
    Parse a YAML stream and produce parsing events.
    """
    loader = Loader(stream)
    try:
        while loader.check_event():
            yield loader.get_event()
    finally:
        loader.dispose()

def compose(stream, Loader=Loader):
    """
    Parse the first YAML document in a stream
    and produce the corresponding representation tree.
    """
    loader = Loader(stream)
    try:
        return loader.get_single_node()
    finally:
        loader.dispose()

def compose_all(stream, Loader=Loader):
    """
    Parse all YAML documents in a stream
    and produce corresponding representation trees.
    """
    loader = Loader(stream)
    try:
        while loader.check_node():
            yield loader.get_node()
    finally:
        loader.dispose()

def load(stream, Loader=None):
    """
    Parse the first YAML document in a stream
    and produce the corresponding Python object.
    """
    if Loader is None:
        load_warning('load')
        Loader = FullLoader

    loader = Loader(stream)
    try:
        return loader.get_single_data()
    finally:
        loader.dispose()

def load_all(stream, Loader=None):
    """
    Parse all YAML documents in a stream
    and produce corresponding Python objects.
    """
    if Loader is None:
        load_warning('load_all')
        Loader = FullLoader

    loader = Loader(stream)
    try:
        while loader.check_data():
            yield loader.get_data()
    finally:
        loader.dispose()

def full_load(stream):
    """
    Parse the first YAML document in a stream
    and produce the corresponding Python object.

    Resolve all tags except those known to be
    unsafe on untrusted input.
    """
    return load(stream, FullLoader)

def full_load_all(stream):
    """
    Parse all YAML documents in a stream
    and produce corresponding Python objects.

    Resolve all tags except those known to be
    unsafe on untrusted input.
    """
    return load_all(stream, FullLoader)

def safe_load(stream):
    """
    Parse the first YAML document in a stream
    and produce the corresponding Python object.

    Resolve only basic YAML tags. This is known
    to be safe for untrusted input.
    """
    return load(stream, SafeLoader)

def safe_load_all(stream):
    """
    Parse all YAML documents in a stream
    and produce corresponding Python objects.

    Resolve only basic YAML tags. This is known
    to be safe for untrusted input.
    """
    return load_all(stream, SafeLoader)

def unsafe_load(stream):
    """
    Parse the first YAML document in a stream
    and produce the corresponding Python object.

    Resolve all tags, even those known to be
    unsafe on untrusted input.
    """
    return load(stream, UnsafeLoader)

def unsafe_load_all(stream):
    """
    Parse all YAML documents in a stream
    and produce corresponding Python objects.

    Resolve all tags, even those known to be
    unsafe on untrusted input.
    """
    return load_all(stream, UnsafeLoader)
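Editorial note on the load family above: load()/load_all() without an explicit Loader fall back to FullLoader and emit a YAMLLoadWarning; safe_load()/safe_load_all() are the right calls for untrusted input. A quick sketch (illustrative, not part of the commit):

# Editorial sketch; not part of the committed diff.
import yaml

print(yaml.safe_load('retention:\n  days: 30\n'))     # {'retention': {'days': 30}}
print(list(yaml.safe_load_all('a: 1\n---\nb: 2\n')))  # [{'a': 1}, {'b': 2}]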
def emit(events, stream=None, Dumper=Dumper,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None):
|
||||
"""
|
||||
Emit YAML parsing events into a stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
stream = io.StringIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break)
|
||||
try:
|
||||
for event in events:
|
||||
dumper.emit(event)
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def serialize_all(nodes, stream=None, Dumper=Dumper,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None):
|
||||
"""
|
||||
Serialize a sequence of representation trees into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
if encoding is None:
|
||||
stream = io.StringIO()
|
||||
else:
|
||||
stream = io.BytesIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
encoding=encoding, version=version, tags=tags,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end)
|
||||
try:
|
||||
dumper.open()
|
||||
for node in nodes:
|
||||
dumper.serialize(node)
|
||||
dumper.close()
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def serialize(node, stream=None, Dumper=Dumper, **kwds):
|
||||
"""
|
||||
Serialize a representation tree into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return serialize_all([node], stream, Dumper=Dumper, **kwds)
|
||||
|
||||
def dump_all(documents, stream=None, Dumper=Dumper,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
"""
|
||||
Serialize a sequence of Python objects into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
if encoding is None:
|
||||
stream = io.StringIO()
|
||||
else:
|
||||
stream = io.BytesIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, default_style=default_style,
|
||||
default_flow_style=default_flow_style,
|
||||
canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
encoding=encoding, version=version, tags=tags,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end, sort_keys=sort_keys)
|
||||
try:
|
||||
dumper.open()
|
||||
for data in documents:
|
||||
dumper.represent(data)
|
||||
dumper.close()
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def dump(data, stream=None, Dumper=Dumper, **kwds):
|
||||
"""
|
||||
Serialize a Python object into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all([data], stream, Dumper=Dumper, **kwds)
|
||||
|
||||
def safe_dump_all(documents, stream=None, **kwds):
|
||||
"""
|
||||
Serialize a sequence of Python objects into a YAML stream.
|
||||
Produce only basic YAML tags.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all(documents, stream, Dumper=SafeDumper, **kwds)
|
||||
|
||||
def safe_dump(data, stream=None, **kwds):
|
||||
"""
|
||||
Serialize a Python object into a YAML stream.
|
||||
Produce only basic YAML tags.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all([data], stream, Dumper=SafeDumper, **kwds)
|
||||
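
# Editor's note: illustrative round-trip sketch, not part of the upstream file.
# safe_dump() mirrors safe_load(): it emits only basic-tag representations, so
# arbitrary class instances are rejected rather than serialized:
#
#     import yaml
#     text = yaml.safe_dump({'b': 2, 'a': 1})   # keys sorted by default
#     assert text == 'a: 1\nb: 2\n'
#     assert yaml.safe_load(text) == {'a': 1, 'b': 2}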

def add_implicit_resolver(tag, regexp, first=None,
        Loader=Loader, Dumper=Dumper):
    """
    Add an implicit scalar detector.
    If an implicit scalar value matches the given regexp,
    the corresponding tag is assigned to the scalar.
    first is a sequence of possible initial characters or None.
    """
    Loader.add_implicit_resolver(tag, regexp, first)
    Dumper.add_implicit_resolver(tag, regexp, first)

def add_path_resolver(tag, path, kind=None, Loader=Loader, Dumper=Dumper):
    """
    Add a path based resolver for the given tag.
    A path is a list of keys that forms a path
    to a node in the representation tree.
    Keys can be string values, integers, or None.
    """
    Loader.add_path_resolver(tag, path, kind)
    Dumper.add_path_resolver(tag, path, kind)

def add_constructor(tag, constructor, Loader=Loader):
    """
    Add a constructor for the given tag.
    Constructor is a function that accepts a Loader instance
    and a node object and produces the corresponding Python object.
    """
    Loader.add_constructor(tag, constructor)

def add_multi_constructor(tag_prefix, multi_constructor, Loader=Loader):
    """
    Add a multi-constructor for the given tag prefix.
    Multi-constructor is called for a node if its tag starts with tag_prefix.
    Multi-constructor accepts a Loader instance, a tag suffix,
    and a node object and produces the corresponding Python object.
    """
    Loader.add_multi_constructor(tag_prefix, multi_constructor)

def add_representer(data_type, representer, Dumper=Dumper):
    """
    Add a representer for the given type.
    Representer is a function accepting a Dumper instance
    and an instance of the given data type
    and producing the corresponding representation node.
    """
    Dumper.add_representer(data_type, representer)

def add_multi_representer(data_type, multi_representer, Dumper=Dumper):
    """
    Add a multi-representer for the given type.
    Multi-representer is a function accepting a Dumper instance
    and an instance of the given data type or subtype
    and producing the corresponding representation node.
    """
    Dumper.add_multi_representer(data_type, multi_representer)
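
# Editor's note: illustrative sketch of the registration hooks above, not part
# of the upstream file. A constructor/representer pair teaches both directions
# of a custom tag (the '!point' tag and helpers are made up for the example):
#
#     import yaml
#
#     def point_constructor(loader, node):
#         x, y = loader.construct_sequence(node)
#         return (x, y)
#
#     def point_representer(dumper, data):
#         return dumper.represent_sequence('!point', list(data))
#
#     yaml.add_constructor('!point', point_constructor)
#     yaml.add_representer(tuple, point_representer)
#     assert yaml.load('!point [1, 2]', Loader=yaml.Loader) == (1, 2)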

class YAMLObjectMetaclass(type):
    """
    The metaclass for YAMLObject.
    """
    def __init__(cls, name, bases, kwds):
        super(YAMLObjectMetaclass, cls).__init__(name, bases, kwds)
        if 'yaml_tag' in kwds and kwds['yaml_tag'] is not None:
            cls.yaml_loader.add_constructor(cls.yaml_tag, cls.from_yaml)
            cls.yaml_dumper.add_representer(cls, cls.to_yaml)

class YAMLObject(metaclass=YAMLObjectMetaclass):
    """
    An object that can dump itself to a YAML stream
    and load itself from a YAML stream.
    """

    __slots__ = ()  # no direct instantiation, so allow immutable subclasses

    yaml_loader = Loader
    yaml_dumper = Dumper

    yaml_tag = None
    yaml_flow_style = None

    @classmethod
    def from_yaml(cls, loader, node):
        """
        Convert a representation node to a Python object.
        """
        return loader.construct_yaml_object(node, cls)

    @classmethod
    def to_yaml(cls, dumper, data):
        """
        Convert a Python object to a representation node.
        """
        return dumper.represent_yaml_object(cls.yaml_tag, data, cls,
                flow_style=cls.yaml_flow_style)
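
# Editor's note: illustrative subclass sketch, not part of the upstream file.
# Defining yaml_tag on a YAMLObject subclass registers the constructor and
# representer automatically via the metaclass (the Monster class is made up):
#
#     import yaml
#
#     class Monster(yaml.YAMLObject):
#         yaml_tag = '!Monster'
#         def __init__(self, name, hp):
#             self.name = name
#             self.hp = hp
#
#     m = yaml.load('!Monster {name: Cave Troll, hp: 8}', Loader=yaml.Loader)
#     assert (m.name, m.hp) == ('Cave Troll', 8)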
139
libs/yaml/composer.py
Normal file
@ -0,0 +1,139 @@
__all__ = ['Composer', 'ComposerError']

from .error import MarkedYAMLError
from .events import *
from .nodes import *

class ComposerError(MarkedYAMLError):
    pass

class Composer:

    def __init__(self):
        self.anchors = {}

    def check_node(self):
        # Drop the STREAM-START event.
        if self.check_event(StreamStartEvent):
            self.get_event()

        # Are there more documents available?
        return not self.check_event(StreamEndEvent)

    def get_node(self):
        # Get the root node of the next document.
        if not self.check_event(StreamEndEvent):
            return self.compose_document()

    def get_single_node(self):
        # Drop the STREAM-START event.
        self.get_event()

        # Compose a document if the stream is not empty.
        document = None
        if not self.check_event(StreamEndEvent):
            document = self.compose_document()

        # Ensure that the stream contains no more documents.
        if not self.check_event(StreamEndEvent):
            event = self.get_event()
            raise ComposerError("expected a single document in the stream",
                    document.start_mark, "but found another document",
                    event.start_mark)

        # Drop the STREAM-END event.
        self.get_event()

        return document

    def compose_document(self):
        # Drop the DOCUMENT-START event.
        self.get_event()

        # Compose the root node.
        node = self.compose_node(None, None)

        # Drop the DOCUMENT-END event.
        self.get_event()

        self.anchors = {}
        return node

    def compose_node(self, parent, index):
        if self.check_event(AliasEvent):
            event = self.get_event()
            anchor = event.anchor
            if anchor not in self.anchors:
                raise ComposerError(None, None, "found undefined alias %r"
                        % anchor, event.start_mark)
            return self.anchors[anchor]
        event = self.peek_event()
        anchor = event.anchor
        if anchor is not None:
            if anchor in self.anchors:
                raise ComposerError("found duplicate anchor %r; first occurrence"
                        % anchor, self.anchors[anchor].start_mark,
                        "second occurrence", event.start_mark)
        self.descend_resolver(parent, index)
        if self.check_event(ScalarEvent):
            node = self.compose_scalar_node(anchor)
        elif self.check_event(SequenceStartEvent):
            node = self.compose_sequence_node(anchor)
        elif self.check_event(MappingStartEvent):
            node = self.compose_mapping_node(anchor)
        self.ascend_resolver()
        return node

    def compose_scalar_node(self, anchor):
        event = self.get_event()
        tag = event.tag
        if tag is None or tag == '!':
            tag = self.resolve(ScalarNode, event.value, event.implicit)
        node = ScalarNode(tag, event.value,
                event.start_mark, event.end_mark, style=event.style)
        if anchor is not None:
            self.anchors[anchor] = node
        return node

    def compose_sequence_node(self, anchor):
        start_event = self.get_event()
        tag = start_event.tag
        if tag is None or tag == '!':
            tag = self.resolve(SequenceNode, None, start_event.implicit)
        node = SequenceNode(tag, [],
                start_event.start_mark, None,
                flow_style=start_event.flow_style)
        if anchor is not None:
            self.anchors[anchor] = node
        index = 0
        while not self.check_event(SequenceEndEvent):
            node.value.append(self.compose_node(node, index))
            index += 1
        end_event = self.get_event()
        node.end_mark = end_event.end_mark
        return node

    def compose_mapping_node(self, anchor):
        start_event = self.get_event()
        tag = start_event.tag
        if tag is None or tag == '!':
            tag = self.resolve(MappingNode, None, start_event.implicit)
        node = MappingNode(tag, [],
                start_event.start_mark, None,
                flow_style=start_event.flow_style)
        if anchor is not None:
            self.anchors[anchor] = node
        while not self.check_event(MappingEndEvent):
            #key_event = self.peek_event()
            item_key = self.compose_node(node, None)
            #if item_key in node.value:
            #    raise ComposerError("while composing a mapping", start_event.start_mark,
            #            "found duplicate key", key_event.start_mark)
            item_value = self.compose_node(node, item_key)
            #node.value[item_key] = item_value
            node.value.append((item_key, item_value))
        end_event = self.get_event()
        node.end_mark = end_event.end_mark
        return node
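
# Editor's note: illustrative sketch, not part of the upstream file. The
# Composer turns the event stream into a node tree; yaml.compose() exposes it:
#
#     import yaml
#     node = yaml.compose('a: [1, 2]')
#     assert node.tag == 'tag:yaml.org,2002:map'
#     key, value = node.value[0]
#     assert key.value == 'a'
#     assert value.tag == 'tag:yaml.org,2002:seq'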
720
libs/yaml/constructor.py
Normal file
@ -0,0 +1,720 @@
__all__ = [
    'BaseConstructor',
    'SafeConstructor',
    'FullConstructor',
    'UnsafeConstructor',
    'Constructor',
    'ConstructorError'
]

from .error import *
from .nodes import *

import collections.abc, datetime, base64, binascii, re, sys, types

class ConstructorError(MarkedYAMLError):
    pass

class BaseConstructor:

    yaml_constructors = {}
    yaml_multi_constructors = {}

    def __init__(self):
        self.constructed_objects = {}
        self.recursive_objects = {}
        self.state_generators = []
        self.deep_construct = False

    def check_data(self):
        # Are there more documents available?
        return self.check_node()

    def get_data(self):
        # Construct and return the next document.
        if self.check_node():
            return self.construct_document(self.get_node())

    def get_single_data(self):
        # Ensure that the stream contains a single document and construct it.
        node = self.get_single_node()
        if node is not None:
            return self.construct_document(node)
        return None

    def construct_document(self, node):
        data = self.construct_object(node)
        while self.state_generators:
            state_generators = self.state_generators
            self.state_generators = []
            for generator in state_generators:
                for dummy in generator:
                    pass
        self.constructed_objects = {}
        self.recursive_objects = {}
        self.deep_construct = False
        return data

    def construct_object(self, node, deep=False):
        if node in self.constructed_objects:
            return self.constructed_objects[node]
        if deep:
            old_deep = self.deep_construct
            self.deep_construct = True
        if node in self.recursive_objects:
            raise ConstructorError(None, None,
                    "found unconstructable recursive node", node.start_mark)
        self.recursive_objects[node] = None
        constructor = None
        tag_suffix = None
        if node.tag in self.yaml_constructors:
            constructor = self.yaml_constructors[node.tag]
        else:
            for tag_prefix in self.yaml_multi_constructors:
                if node.tag.startswith(tag_prefix):
                    tag_suffix = node.tag[len(tag_prefix):]
                    constructor = self.yaml_multi_constructors[tag_prefix]
                    break
            else:
                if None in self.yaml_multi_constructors:
                    tag_suffix = node.tag
                    constructor = self.yaml_multi_constructors[None]
                elif None in self.yaml_constructors:
                    constructor = self.yaml_constructors[None]
                elif isinstance(node, ScalarNode):
                    constructor = self.__class__.construct_scalar
                elif isinstance(node, SequenceNode):
                    constructor = self.__class__.construct_sequence
                elif isinstance(node, MappingNode):
                    constructor = self.__class__.construct_mapping
        if tag_suffix is None:
            data = constructor(self, node)
        else:
            data = constructor(self, tag_suffix, node)
        if isinstance(data, types.GeneratorType):
            generator = data
            data = next(generator)
            if self.deep_construct:
                for dummy in generator:
                    pass
            else:
                self.state_generators.append(generator)
        self.constructed_objects[node] = data
        del self.recursive_objects[node]
        if deep:
            self.deep_construct = old_deep
        return data

    def construct_scalar(self, node):
        if not isinstance(node, ScalarNode):
            raise ConstructorError(None, None,
                    "expected a scalar node, but found %s" % node.id,
                    node.start_mark)
        return node.value

    def construct_sequence(self, node, deep=False):
        if not isinstance(node, SequenceNode):
            raise ConstructorError(None, None,
                    "expected a sequence node, but found %s" % node.id,
                    node.start_mark)
        return [self.construct_object(child, deep=deep)
                for child in node.value]

    def construct_mapping(self, node, deep=False):
        if not isinstance(node, MappingNode):
            raise ConstructorError(None, None,
                    "expected a mapping node, but found %s" % node.id,
                    node.start_mark)
        mapping = {}
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            if not isinstance(key, collections.abc.Hashable):
                raise ConstructorError("while constructing a mapping", node.start_mark,
                        "found unhashable key", key_node.start_mark)
            value = self.construct_object(value_node, deep=deep)
            mapping[key] = value
        return mapping

    def construct_pairs(self, node, deep=False):
        if not isinstance(node, MappingNode):
            raise ConstructorError(None, None,
                    "expected a mapping node, but found %s" % node.id,
                    node.start_mark)
        pairs = []
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            value = self.construct_object(value_node, deep=deep)
            pairs.append((key, value))
        return pairs

    @classmethod
    def add_constructor(cls, tag, constructor):
        if 'yaml_constructors' not in cls.__dict__:
            cls.yaml_constructors = cls.yaml_constructors.copy()
        cls.yaml_constructors[tag] = constructor

    @classmethod
    def add_multi_constructor(cls, tag_prefix, multi_constructor):
        if 'yaml_multi_constructors' not in cls.__dict__:
            cls.yaml_multi_constructors = cls.yaml_multi_constructors.copy()
        cls.yaml_multi_constructors[tag_prefix] = multi_constructor

class SafeConstructor(BaseConstructor):

    def construct_scalar(self, node):
        if isinstance(node, MappingNode):
            for key_node, value_node in node.value:
                if key_node.tag == 'tag:yaml.org,2002:value':
                    return self.construct_scalar(value_node)
        return super().construct_scalar(node)

    def flatten_mapping(self, node):
        merge = []
        index = 0
        while index < len(node.value):
            key_node, value_node = node.value[index]
            if key_node.tag == 'tag:yaml.org,2002:merge':
                del node.value[index]
                if isinstance(value_node, MappingNode):
                    self.flatten_mapping(value_node)
                    merge.extend(value_node.value)
                elif isinstance(value_node, SequenceNode):
                    submerge = []
                    for subnode in value_node.value:
                        if not isinstance(subnode, MappingNode):
                            raise ConstructorError("while constructing a mapping",
                                    node.start_mark,
                                    "expected a mapping for merging, but found %s"
                                    % subnode.id, subnode.start_mark)
                        self.flatten_mapping(subnode)
                        submerge.append(subnode.value)
                    submerge.reverse()
                    for value in submerge:
                        merge.extend(value)
                else:
                    raise ConstructorError("while constructing a mapping", node.start_mark,
                            "expected a mapping or list of mappings for merging, but found %s"
                            % value_node.id, value_node.start_mark)
            elif key_node.tag == 'tag:yaml.org,2002:value':
                key_node.tag = 'tag:yaml.org,2002:str'
                index += 1
            else:
                index += 1
        if merge:
            node.value = merge + node.value

    def construct_mapping(self, node, deep=False):
        if isinstance(node, MappingNode):
            self.flatten_mapping(node)
        return super().construct_mapping(node, deep=deep)

    def construct_yaml_null(self, node):
        self.construct_scalar(node)
        return None

    bool_values = {
        'yes': True,
        'no': False,
        'true': True,
        'false': False,
        'on': True,
        'off': False,
    }

    def construct_yaml_bool(self, node):
        value = self.construct_scalar(node)
        return self.bool_values[value.lower()]

    def construct_yaml_int(self, node):
        value = self.construct_scalar(node)
        value = value.replace('_', '')
        sign = +1
        if value[0] == '-':
            sign = -1
        if value[0] in '+-':
            value = value[1:]
        if value == '0':
            return 0
        elif value.startswith('0b'):
            return sign*int(value[2:], 2)
        elif value.startswith('0x'):
            return sign*int(value[2:], 16)
        elif value[0] == '0':
            return sign*int(value, 8)
        elif ':' in value:
            digits = [int(part) for part in value.split(':')]
            digits.reverse()
            base = 1
            value = 0
            for digit in digits:
                value += digit*base
                base *= 60
            return sign*value
        else:
            return sign*int(value)
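
    # Editor's note: illustrative sketch of the integer forms handled above,
    # not part of the upstream file. YAML 1.1 integers cover binary, hex,
    # octal and base-60 (sexagesimal) notation:
    #
    #     import yaml
    #     assert yaml.safe_load('0b101') == 5
    #     assert yaml.safe_load('0x1F') == 31
    #     assert yaml.safe_load('017') == 15      # leading zero means octal
    #     assert yaml.safe_load('1:30') == 90     # 1*60 + 30
    #     assert yaml.safe_load('-12_345') == -12345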

    inf_value = 1e300
    while inf_value != inf_value*inf_value:
        inf_value *= inf_value
    nan_value = -inf_value/inf_value   # Trying to make a quiet NaN (like C99).

    def construct_yaml_float(self, node):
        value = self.construct_scalar(node)
        value = value.replace('_', '').lower()
        sign = +1
        if value[0] == '-':
            sign = -1
        if value[0] in '+-':
            value = value[1:]
        if value == '.inf':
            return sign*self.inf_value
        elif value == '.nan':
            return self.nan_value
        elif ':' in value:
            digits = [float(part) for part in value.split(':')]
            digits.reverse()
            base = 1
            value = 0.0
            for digit in digits:
                value += digit*base
                base *= 60
            return sign*value
        else:
            return sign*float(value)

    def construct_yaml_binary(self, node):
        try:
            value = self.construct_scalar(node).encode('ascii')
        except UnicodeEncodeError as exc:
            raise ConstructorError(None, None,
                    "failed to convert base64 data into ascii: %s" % exc,
                    node.start_mark)
        try:
            if hasattr(base64, 'decodebytes'):
                return base64.decodebytes(value)
            else:
                return base64.decodestring(value)
        except binascii.Error as exc:
            raise ConstructorError(None, None,
                    "failed to decode base64 data: %s" % exc, node.start_mark)

    timestamp_regexp = re.compile(
            r'''^(?P<year>[0-9][0-9][0-9][0-9])
                -(?P<month>[0-9][0-9]?)
                -(?P<day>[0-9][0-9]?)
                (?:(?:[Tt]|[ \t]+)
                (?P<hour>[0-9][0-9]?)
                :(?P<minute>[0-9][0-9])
                :(?P<second>[0-9][0-9])
                (?:\.(?P<fraction>[0-9]*))?
                (?:[ \t]*(?P<tz>Z|(?P<tz_sign>[-+])(?P<tz_hour>[0-9][0-9]?)
                (?::(?P<tz_minute>[0-9][0-9]))?))?)?$''', re.X)

    def construct_yaml_timestamp(self, node):
        value = self.construct_scalar(node)
        match = self.timestamp_regexp.match(node.value)
        values = match.groupdict()
        year = int(values['year'])
        month = int(values['month'])
        day = int(values['day'])
        if not values['hour']:
            return datetime.date(year, month, day)
        hour = int(values['hour'])
        minute = int(values['minute'])
        second = int(values['second'])
        fraction = 0
        if values['fraction']:
            fraction = values['fraction'][:6]
            while len(fraction) < 6:
                fraction += '0'
            fraction = int(fraction)
        delta = None
        if values['tz_sign']:
            tz_hour = int(values['tz_hour'])
            tz_minute = int(values['tz_minute'] or 0)
            delta = datetime.timedelta(hours=tz_hour, minutes=tz_minute)
            if values['tz_sign'] == '-':
                delta = -delta
        data = datetime.datetime(year, month, day, hour, minute, second, fraction)
        if delta:
            data -= delta
        return data
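
    # Editor's note: illustrative sketch of the timestamp handling above, not
    # part of the upstream file. A timezone offset is folded into a naive
    # datetime by subtracting the delta:
    #
    #     import yaml, datetime
    #     assert yaml.safe_load('2019-04-06') == datetime.date(2019, 4, 6)
    #     assert yaml.safe_load('2019-04-06 08:30:00+02:00') == \
    #         datetime.datetime(2019, 4, 6, 6, 30)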

    def construct_yaml_omap(self, node):
        # Note: we do not check for duplicate keys, because it's too
        # CPU-expensive.
        omap = []
        yield omap
        if not isinstance(node, SequenceNode):
            raise ConstructorError("while constructing an ordered map", node.start_mark,
                    "expected a sequence, but found %s" % node.id, node.start_mark)
        for subnode in node.value:
            if not isinstance(subnode, MappingNode):
                raise ConstructorError("while constructing an ordered map", node.start_mark,
                        "expected a mapping of length 1, but found %s" % subnode.id,
                        subnode.start_mark)
            if len(subnode.value) != 1:
                raise ConstructorError("while constructing an ordered map", node.start_mark,
                        "expected a single mapping item, but found %d items" % len(subnode.value),
                        subnode.start_mark)
            key_node, value_node = subnode.value[0]
            key = self.construct_object(key_node)
            value = self.construct_object(value_node)
            omap.append((key, value))

    def construct_yaml_pairs(self, node):
        # Note: the same code as `construct_yaml_omap`.
        pairs = []
        yield pairs
        if not isinstance(node, SequenceNode):
            raise ConstructorError("while constructing pairs", node.start_mark,
                    "expected a sequence, but found %s" % node.id, node.start_mark)
        for subnode in node.value:
            if not isinstance(subnode, MappingNode):
                raise ConstructorError("while constructing pairs", node.start_mark,
                        "expected a mapping of length 1, but found %s" % subnode.id,
                        subnode.start_mark)
            if len(subnode.value) != 1:
                raise ConstructorError("while constructing pairs", node.start_mark,
                        "expected a single mapping item, but found %d items" % len(subnode.value),
                        subnode.start_mark)
            key_node, value_node = subnode.value[0]
            key = self.construct_object(key_node)
            value = self.construct_object(value_node)
            pairs.append((key, value))

    def construct_yaml_set(self, node):
        data = set()
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_yaml_str(self, node):
        return self.construct_scalar(node)

    def construct_yaml_seq(self, node):
        data = []
        yield data
        data.extend(self.construct_sequence(node))

    def construct_yaml_map(self, node):
        data = {}
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_yaml_object(self, node, cls):
        data = cls.__new__(cls)
        yield data
        if hasattr(data, '__setstate__'):
            state = self.construct_mapping(node, deep=True)
            data.__setstate__(state)
        else:
            state = self.construct_mapping(node)
            data.__dict__.update(state)

    def construct_undefined(self, node):
        raise ConstructorError(None, None,
                "could not determine a constructor for the tag %r" % node.tag,
                node.start_mark)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:null',
        SafeConstructor.construct_yaml_null)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:bool',
        SafeConstructor.construct_yaml_bool)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:int',
        SafeConstructor.construct_yaml_int)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:float',
        SafeConstructor.construct_yaml_float)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:binary',
        SafeConstructor.construct_yaml_binary)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:timestamp',
        SafeConstructor.construct_yaml_timestamp)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:omap',
        SafeConstructor.construct_yaml_omap)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:pairs',
        SafeConstructor.construct_yaml_pairs)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:set',
        SafeConstructor.construct_yaml_set)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:str',
        SafeConstructor.construct_yaml_str)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:seq',
        SafeConstructor.construct_yaml_seq)

SafeConstructor.add_constructor(
        'tag:yaml.org,2002:map',
        SafeConstructor.construct_yaml_map)

SafeConstructor.add_constructor(None,
        SafeConstructor.construct_undefined)

class FullConstructor(SafeConstructor):

    def construct_python_str(self, node):
        return self.construct_scalar(node)

    def construct_python_unicode(self, node):
        return self.construct_scalar(node)

    def construct_python_bytes(self, node):
        try:
            value = self.construct_scalar(node).encode('ascii')
        except UnicodeEncodeError as exc:
            raise ConstructorError(None, None,
                    "failed to convert base64 data into ascii: %s" % exc,
                    node.start_mark)
        try:
            if hasattr(base64, 'decodebytes'):
                return base64.decodebytes(value)
            else:
                return base64.decodestring(value)
        except binascii.Error as exc:
            raise ConstructorError(None, None,
                    "failed to decode base64 data: %s" % exc, node.start_mark)

    def construct_python_long(self, node):
        return self.construct_yaml_int(node)

    def construct_python_complex(self, node):
        return complex(self.construct_scalar(node))

    def construct_python_tuple(self, node):
        return tuple(self.construct_sequence(node))

    def find_python_module(self, name, mark, unsafe=False):
        if not name:
            raise ConstructorError("while constructing a Python module", mark,
                    "expected non-empty name appended to the tag", mark)
        if unsafe:
            try:
                __import__(name)
            except ImportError as exc:
                raise ConstructorError("while constructing a Python module", mark,
                        "cannot find module %r (%s)" % (name, exc), mark)
        if name not in sys.modules:
            raise ConstructorError("while constructing a Python module", mark,
                    "module %r is not imported" % name, mark)
        return sys.modules[name]

    def find_python_name(self, name, mark, unsafe=False):
        if not name:
            raise ConstructorError("while constructing a Python object", mark,
                    "expected non-empty name appended to the tag", mark)
        if '.' in name:
            module_name, object_name = name.rsplit('.', 1)
        else:
            module_name = 'builtins'
            object_name = name
        if unsafe:
            try:
                __import__(module_name)
            except ImportError as exc:
                raise ConstructorError("while constructing a Python object", mark,
                        "cannot find module %r (%s)" % (module_name, exc), mark)
        if module_name not in sys.modules:
            raise ConstructorError("while constructing a Python object", mark,
                    "module %r is not imported" % module_name, mark)
        module = sys.modules[module_name]
        if not hasattr(module, object_name):
            raise ConstructorError("while constructing a Python object", mark,
                    "cannot find %r in the module %r"
                    % (object_name, module.__name__), mark)
        return getattr(module, object_name)

    def construct_python_name(self, suffix, node):
        value = self.construct_scalar(node)
        if value:
            raise ConstructorError("while constructing a Python name", node.start_mark,
                    "expected the empty value, but found %r" % value, node.start_mark)
        return self.find_python_name(suffix, node.start_mark)

    def construct_python_module(self, suffix, node):
        value = self.construct_scalar(node)
        if value:
            raise ConstructorError("while constructing a Python module", node.start_mark,
                    "expected the empty value, but found %r" % value, node.start_mark)
        return self.find_python_module(suffix, node.start_mark)

    def make_python_instance(self, suffix, node,
            args=None, kwds=None, newobj=False, unsafe=False):
        if not args:
            args = []
        if not kwds:
            kwds = {}
        cls = self.find_python_name(suffix, node.start_mark)
        if not (unsafe or isinstance(cls, type)):
            raise ConstructorError("while constructing a Python instance", node.start_mark,
                    "expected a class, but found %r" % type(cls),
                    node.start_mark)
        if newobj and isinstance(cls, type):
            return cls.__new__(cls, *args, **kwds)
        else:
            return cls(*args, **kwds)

    def set_python_instance_state(self, instance, state):
        if hasattr(instance, '__setstate__'):
            instance.__setstate__(state)
        else:
            slotstate = {}
            if isinstance(state, tuple) and len(state) == 2:
                state, slotstate = state
            if hasattr(instance, '__dict__'):
                instance.__dict__.update(state)
            elif state:
                slotstate.update(state)
            # Note: the vendored copy called setattr(object, ...) here, which
            # silently set attributes on the builtin `object` type instead of
            # the instance being restored; fixed to target the instance.
            for key, value in slotstate.items():
                setattr(instance, key, value)

    def construct_python_object(self, suffix, node):
        # Format:
        #   !!python/object:module.name { ... state ... }
        instance = self.make_python_instance(suffix, node, newobj=True)
        yield instance
        deep = hasattr(instance, '__setstate__')
        state = self.construct_mapping(node, deep=deep)
        self.set_python_instance_state(instance, state)

    def construct_python_object_apply(self, suffix, node, newobj=False):
        # Format:
        #   !!python/object/apply       # (or !!python/object/new)
        #   args: [ ... arguments ... ]
        #   kwds: { ... keywords ... }
        #   state: ... state ...
        #   listitems: [ ... listitems ... ]
        #   dictitems: { ... dictitems ... }
        # or short format:
        #   !!python/object/apply [ ... arguments ... ]
        # The difference between !!python/object/apply and !!python/object/new
        # is how an object is created, check make_python_instance for details.
        if isinstance(node, SequenceNode):
            args = self.construct_sequence(node, deep=True)
            kwds = {}
            state = {}
            listitems = []
            dictitems = {}
        else:
            value = self.construct_mapping(node, deep=True)
            args = value.get('args', [])
            kwds = value.get('kwds', {})
            state = value.get('state', {})
            listitems = value.get('listitems', [])
            dictitems = value.get('dictitems', {})
        instance = self.make_python_instance(suffix, node, args, kwds, newobj)
        if state:
            self.set_python_instance_state(instance, state)
        if listitems:
            instance.extend(listitems)
        if dictitems:
            for key in dictitems:
                instance[key] = dictitems[key]
        return instance

    def construct_python_object_new(self, suffix, node):
        return self.construct_python_object_apply(suffix, node, newobj=True)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/none',
        FullConstructor.construct_yaml_null)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/bool',
        FullConstructor.construct_yaml_bool)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/str',
        FullConstructor.construct_python_str)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/unicode',
        FullConstructor.construct_python_unicode)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/bytes',
        FullConstructor.construct_python_bytes)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/int',
        FullConstructor.construct_yaml_int)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/long',
        FullConstructor.construct_python_long)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/float',
        FullConstructor.construct_yaml_float)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/complex',
        FullConstructor.construct_python_complex)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/list',
        FullConstructor.construct_yaml_seq)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/tuple',
        FullConstructor.construct_python_tuple)

FullConstructor.add_constructor(
        'tag:yaml.org,2002:python/dict',
        FullConstructor.construct_yaml_map)

FullConstructor.add_multi_constructor(
        'tag:yaml.org,2002:python/name:',
        FullConstructor.construct_python_name)

FullConstructor.add_multi_constructor(
        'tag:yaml.org,2002:python/module:',
        FullConstructor.construct_python_module)

FullConstructor.add_multi_constructor(
        'tag:yaml.org,2002:python/object:',
        FullConstructor.construct_python_object)

FullConstructor.add_multi_constructor(
        'tag:yaml.org,2002:python/object/apply:',
        FullConstructor.construct_python_object_apply)

FullConstructor.add_multi_constructor(
        'tag:yaml.org,2002:python/object/new:',
        FullConstructor.construct_python_object_new)

class UnsafeConstructor(FullConstructor):

    def find_python_module(self, name, mark):
        return super(UnsafeConstructor, self).find_python_module(name, mark, unsafe=True)

    def find_python_name(self, name, mark):
        return super(UnsafeConstructor, self).find_python_name(name, mark, unsafe=True)

    def make_python_instance(self, suffix, node, args=None, kwds=None, newobj=False):
        return super(UnsafeConstructor, self).make_python_instance(
            suffix, node, args, kwds, newobj, unsafe=True)

# Constructor is same as UnsafeConstructor. Need to leave this in place in case
# people have extended it directly.
class Constructor(UnsafeConstructor):
    pass
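
# Editor's note: illustrative sketch, not part of the upstream file. The
# python/* multi-constructors above are why unsafe_load() must never be fed
# untrusted input; a tagged document can execute arbitrary callables:
#
#     import yaml
#     doc = '!!python/object/apply:os.getcwd []'
#     yaml.unsafe_load(doc)   # calls os.getcwd() during loading
#     yaml.safe_load(doc)     # raises ConstructorError instead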
101
libs/yaml/cyaml.py
Normal file
@ -0,0 +1,101 @@
__all__ = [
    'CBaseLoader', 'CSafeLoader', 'CFullLoader', 'CUnsafeLoader', 'CLoader',
    'CBaseDumper', 'CSafeDumper', 'CDumper'
]

from _yaml import CParser, CEmitter

from .constructor import *

from .serializer import *
from .representer import *

from .resolver import *

class CBaseLoader(CParser, BaseConstructor, BaseResolver):

    def __init__(self, stream):
        CParser.__init__(self, stream)
        BaseConstructor.__init__(self)
        BaseResolver.__init__(self)

class CSafeLoader(CParser, SafeConstructor, Resolver):

    def __init__(self, stream):
        CParser.__init__(self, stream)
        SafeConstructor.__init__(self)
        Resolver.__init__(self)

class CFullLoader(CParser, FullConstructor, Resolver):

    def __init__(self, stream):
        CParser.__init__(self, stream)
        FullConstructor.__init__(self)
        Resolver.__init__(self)

class CUnsafeLoader(CParser, UnsafeConstructor, Resolver):

    def __init__(self, stream):
        CParser.__init__(self, stream)
        UnsafeConstructor.__init__(self)
        Resolver.__init__(self)

class CLoader(CParser, Constructor, Resolver):

    def __init__(self, stream):
        CParser.__init__(self, stream)
        Constructor.__init__(self)
        Resolver.__init__(self)

class CBaseDumper(CEmitter, BaseRepresenter, BaseResolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        CEmitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width, encoding=encoding,
                allow_unicode=allow_unicode, line_break=line_break,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        Representer.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)

class CSafeDumper(CEmitter, SafeRepresenter, Resolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        CEmitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width, encoding=encoding,
                allow_unicode=allow_unicode, line_break=line_break,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        SafeRepresenter.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)

class CDumper(CEmitter, Serializer, Representer, Resolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        CEmitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width, encoding=encoding,
                allow_unicode=allow_unicode, line_break=line_break,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        Representer.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)
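
# Editor's note: illustrative sketch, not part of the upstream file. The C
# classes require the libyaml extension module (_yaml); the conventional
# pattern is to fall back to the pure-Python loader when it is missing:
#
#     import yaml
#     try:
#         from yaml import CSafeLoader as SafeLoaderImpl
#     except ImportError:
#         from yaml import SafeLoader as SafeLoaderImpl
#     data = yaml.load('key: value', Loader=SafeLoaderImpl)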
62
libs/yaml/dumper.py
Normal file
@ -0,0 +1,62 @@
__all__ = ['BaseDumper', 'SafeDumper', 'Dumper']

from .emitter import *
from .serializer import *
from .representer import *
from .resolver import *

class BaseDumper(Emitter, Serializer, BaseRepresenter, BaseResolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        Representer.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)

class SafeDumper(Emitter, Serializer, SafeRepresenter, Resolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        SafeRepresenter.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)

class Dumper(Emitter, Serializer, Representer, Resolver):

    def __init__(self, stream,
            default_style=None, default_flow_style=False,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None, sort_keys=True):
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        Representer.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style, sort_keys=sort_keys)
        Resolver.__init__(self)
1137
libs/yaml/emitter.py
Normal file
File diff suppressed because it is too large
75
libs/yaml/error.py
Normal file
@ -0,0 +1,75 @@
__all__ = ['Mark', 'YAMLError', 'MarkedYAMLError']

class Mark:

    def __init__(self, name, index, line, column, buffer, pointer):
        self.name = name
        self.index = index
        self.line = line
        self.column = column
        self.buffer = buffer
        self.pointer = pointer

    def get_snippet(self, indent=4, max_length=75):
        if self.buffer is None:
            return None
        head = ''
        start = self.pointer
        while start > 0 and self.buffer[start-1] not in '\0\r\n\x85\u2028\u2029':
            start -= 1
            if self.pointer-start > max_length/2-1:
                head = ' ... '
                start += 5
                break
        tail = ''
        end = self.pointer
        while end < len(self.buffer) and self.buffer[end] not in '\0\r\n\x85\u2028\u2029':
            end += 1
            if end-self.pointer > max_length/2-1:
                tail = ' ... '
                end -= 5
                break
        snippet = self.buffer[start:end]
        return ' '*indent + head + snippet + tail + '\n' \
                + ' '*(indent+self.pointer-start+len(head)) + '^'

    def __str__(self):
        snippet = self.get_snippet()
        where = " in \"%s\", line %d, column %d" \
                % (self.name, self.line+1, self.column+1)
        if snippet is not None:
            where += ":\n"+snippet
        return where

class YAMLError(Exception):
    pass

class MarkedYAMLError(YAMLError):

    def __init__(self, context=None, context_mark=None,
            problem=None, problem_mark=None, note=None):
        self.context = context
        self.context_mark = context_mark
        self.problem = problem
        self.problem_mark = problem_mark
        self.note = note

    def __str__(self):
        lines = []
        if self.context is not None:
            lines.append(self.context)
        if self.context_mark is not None \
            and (self.problem is None or self.problem_mark is None
                    or self.context_mark.name != self.problem_mark.name
                    or self.context_mark.line != self.problem_mark.line
                    or self.context_mark.column != self.problem_mark.column):
            lines.append(str(self.context_mark))
        if self.problem is not None:
            lines.append(self.problem)
        if self.problem_mark is not None:
            lines.append(str(self.problem_mark))
        if self.note is not None:
            lines.append(self.note)
        return '\n'.join(lines)
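
# Editor's note: illustrative sketch, not part of the upstream file. YAMLError
# is the common base class to catch; marked errors carry Mark objects that
# render the offending line with a caret:
#
#     import yaml
#     try:
#         yaml.safe_load('a: [1, 2')
#     except yaml.YAMLError as exc:
#         print(exc)   # includes "line ..., column ..." and a snippet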
86
libs/yaml/events.py
Normal file
@ -0,0 +1,86 @@
# Abstract classes.

class Event(object):
    def __init__(self, start_mark=None, end_mark=None):
        self.start_mark = start_mark
        self.end_mark = end_mark
    def __repr__(self):
        attributes = [key for key in ['anchor', 'tag', 'implicit', 'value']
                if hasattr(self, key)]
        arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
                for key in attributes])
        return '%s(%s)' % (self.__class__.__name__, arguments)

class NodeEvent(Event):
    def __init__(self, anchor, start_mark=None, end_mark=None):
        self.anchor = anchor
        self.start_mark = start_mark
        self.end_mark = end_mark

class CollectionStartEvent(NodeEvent):
    def __init__(self, anchor, tag, implicit, start_mark=None, end_mark=None,
            flow_style=None):
        self.anchor = anchor
        self.tag = tag
        self.implicit = implicit
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.flow_style = flow_style

class CollectionEndEvent(Event):
    pass

# Implementations.

class StreamStartEvent(Event):
    def __init__(self, start_mark=None, end_mark=None, encoding=None):
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.encoding = encoding

class StreamEndEvent(Event):
    pass

class DocumentStartEvent(Event):
    def __init__(self, start_mark=None, end_mark=None,
            explicit=None, version=None, tags=None):
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.explicit = explicit
        self.version = version
        self.tags = tags

class DocumentEndEvent(Event):
    def __init__(self, start_mark=None, end_mark=None,
            explicit=None):
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.explicit = explicit

class AliasEvent(NodeEvent):
    pass

class ScalarEvent(NodeEvent):
    def __init__(self, anchor, tag, implicit, value,
            start_mark=None, end_mark=None, style=None):
        self.anchor = anchor
        self.tag = tag
        self.implicit = implicit
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.style = style

class SequenceStartEvent(CollectionStartEvent):
    pass

class SequenceEndEvent(CollectionEndEvent):
    pass

class MappingStartEvent(CollectionStartEvent):
    pass

class MappingEndEvent(CollectionEndEvent):
    pass
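
# Editor's note: illustrative sketch, not part of the upstream file.
# yaml.parse() yields these event objects; emit() consumes the same stream,
# so parse/emit round-trips a document at the event level:
#
#     import yaml
#     events = list(yaml.parse('a: 1'))
#     # StreamStartEvent, DocumentStartEvent, MappingStartEvent,
#     # ScalarEvent(value='a'), ScalarEvent(value='1'),
#     # MappingEndEvent, DocumentEndEvent, StreamEndEvent
#     text = yaml.emit(events)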
Some files were not shown because too many files have changed in this diff