add images for wiki 2.5

release 2.5.0.2241
Merge branch 'develop-2.1'
2018-02-14 17:43:41 -05:00 · 2018-02-14 16:19:23 +01:00 · 2018-02-14 16:18:04 +01:00 · 2018-02-14 16:10:17 +01:00 · 2018-02-13 15:32:55 +01:00 · 2018-02-13 15:06:20 +01:00
335 changed files with 29830 additions and 59086 deletions
@@ -55,4 +55,5 @@ docs/_build/
 # pycharm
 .idea

-icon.psd
+icon.psd
+main-icon.psd
@@ -1,4 +1,122 @@

+2.0.33.1871
+- core: normalize line endings in subtitles to LF (\n)
+- core: add subtitle storage lock to avoid race condition
+- core: be more verbose about subtitle storage addition
+- core: fix MPL2 newline parsing, which resulted in broken subtitles
+- core: encoding change: reduce log spam
+- submod: common: fix CM_starting_spacedots
+- opensubtitles: fix request/response handling
+
+
+
+2.0.33.1849
+- opensubtitles: add VIP server handling + preference; VIP benefits: 10€/year, ad-free subs, 1000 subs/day, no-cache VIP server, help SZ and subscribe via http://v.ht/osvip
+- opensubtitles: try to reuse previous token instead of logging in every time
+- core: add throttling between searches (10 seconds)
+- core: fix IETF handling for good
+- core: fix no subtitles being searched in certain situations (when an external subtitle without special tag exists)
+- core: add subtitle blacklist
+- core: fixes
+- core: fix detection of certain PMS media stream language tags ("FR" for example)
+- core: missing subtitles: correctly skip unwanted subtitle extensions
+- core: missing subtitles: honor "treat undefined as first language" option correctly
+- api: add blacklisting endpoints for quickly searching for new subtitles via bookmarklet
+- submod: colors: apply color mods at the end of processing modifications; fix color mods
+- submod: new remove_tags modification to remove all styling tags from subtitles
+- submod: HI: be more aggressive at handling brackets
+- submod: OCR: update en and hrv
+- submod: common: remove "torrent downloaded from ..." lines
+- submod: OCR: fix WholeWord handling, improving modification
+- submod: apply OCR fixes before HI
+- submod: OCR: fix broken HI tag colons (ANNOUNCER'. instead of ANNOUNCER:)
+- menu: advanced: speed up batch modifications
+- menu: add subtitle blacklist
+- menu: recently played: show only TV episodes and movies (music tracks were listed here as well)
+
+
+2.0.29.1767
+- core: fix internal subtitle storage issues
+- core: handle "embedded-forced" tag (futureproofing)
+- core: remove more garbage tags from release groups (nzbgeek, chamele0n, buymore, xpost, postbot)
+- submod: OCR fix: fix music icon = paragraph
+
+
+2.0.29.1756
+- core: don't fail on uppercase file extensions
+- core: don't re-download a subtitle if we already downloaded one, it still physically exists and external subtitles are configured to be ignored
+- core: fix VTT subtitle duplication
+- core: if forced subtitles not explicitly wanted, ignore existing forced subtitles when searching
+- core: add full IETF language support for `Treat languages with country attribute as ISO 639-1 (e.g. don't download pt-BR if pt subtitle exists)`-setting for embedded subtitles
+- menu: remove buggy dynamic permission-based channel icon introduced in 1715
+- menu: improve `Items with missing subtitles` menu usage and item display
+- menu: `Advanced -> Get my logs` handle custom domains without port
+- menu: correctly show country/script part of languages with such attributes (e.g. pt-BR)
+- config: rename `Scan:` settings; make them better understandable and translatable
+- config: rephrase IETF options as "languages with country attribute" (e.g. pt-BR)
+- config: separate IETF options into how to display languages with country attribute and how they should be handled when searching/scanning (e.g. pt-BR)
+- config: `Scheduler: Item age to be considered recent` now can go up to 12 weeks
+- config: `Scheduler: Periodically search for recent items with missing subtitles` added `every 2 hours`
+- submod: swe: add Ĺ to Å
+
+
+2.0.26.1715
+- core: submod: OCR fixes: swe: replace ĺ with å inside words
+- core: fix handling of non-existent PMS audio_codec info
+- core: filename matching ignored the strictness setting in certain global directory configurations (thanks @raduc)
+- core: don't fail on migration errors
+- provider titlovi: handle multiple subtitles per archive
+- provider addic7ed: reset default boost to 19 (was 21)
+- menu: add warning icon on missing permissions
+- menu: manual subtitle list sometimes listed duplicates (thanks @andreashoyer)
+- menu: don't request PMS metadata in item details menu twice
+- menu: don't fail badly on non-existent PMS metadata in item details menu
+
+
+2.0.26.1695
+## ATTENTION: THIS RELEASE RESETS YOUR CONFIGURED LANGUAGES TO DEFAULT!
+- core: fix bug that caused SZ not to work for Windows users with special characters in their username
+- core: fix issues when logging failed manual download actions
+- core: update guessit to 2.1.4
+- core: fix issue causing the background task scheduler to stop after changing preferences
+- core: fix polish encoding (try windows-1250 first, then iso 8859-2)
+- core: remove subscenter provider as it now uses captchas
+- core: add titlovi as default provider (thanks viking!)
+- core: increase default PMS API request timeout to 15 (old: 10, max: 45); add preference for that
+- core: re-add separate legacy FindMissingSubtitles task and run it on the first run to prime SZ's internal subtitle storage
+- core: add "low impact mode" for people with remote filesystems (currently enabled for List LANGUAGE subtitles in detail menu); alleviates certain plexweb timeout issues
+- menu: change naming of find missing subtitles menu item
+- legendastv: fix multi value guessit issues
+- submod: OCR: update eng and hrv OCR replace dictionaries; fix ". L am huge"
+
+
+2.0.25.1635
+- core: update memory handling, possibly reduce memory problems of 2.0
+- core: support for MPL2 subtitle format
+- core: update task handling
+- core: re-enable NVIDIA SHIELD support by fixing rarfile behaviour
+- core: add SZ_UNRAR_TOOL environment variable for custom unrar location
+- core: disable SZ when no providers are enabled
+- core: only start activity monitor if channel or agent are enabled
+- core: improve custom provider integration
+- core: update eastern european encoding detection (especially Romanian)
+- tasks: reduce provider stress by introducing wait times between searches/downloads
+- windows: correctly ship UnRAR.exe
+- windows: skip DBM checks
+- addic7ed: fix Nip/Tuck
+- subscenter: use new domain
+
+
+2.0.24.1581
+- legendastv: ship unrar.exe for Windows users (fixes unrar issues)
+- addic7ed: fix TooManyRequests error
+- submod: OCR fixes NL: add custom dictionary data for malformed characters
+- submod: OCR fixes: update hrv/NL dictionaries
+- submod: common: remove spaces before punctuation
+- podnapisi: now returns more subtitles again
+ATTENTION: Sub-Zero is still broken on PMS for SHIELD. Help needed!
+
+
 2.0.24.1565
 - core: fix searchallrecentlymissing task erroring if item not found
 - core: fix non-plex-items appearing in and crashing the recently played list
@@ -1,7 +1,6 @@
 # coding=utf-8
 import sys
 import datetime
-import os

 from subzero.sandbox import restore_builtins

@@ -24,8 +23,9 @@ sys.modules["interface"] = interface

 from subzero.constants import OS_PLEX_USERAGENT, PERSONAL_MEDIA_IDENTIFIER
 from interface.menu import *
-from support.plex_media import media_to_videos, get_media_item_ids, scan_videos
-from support.storage import save_subtitles, store_subtitle_info
+from support.plex_media import media_to_videos, get_media_item_ids
+from support.scanning import scan_videos
+from support.storage import save_subtitles, store_subtitle_info, get_subtitle_storage
 from support.items import is_ignored
 from support.config import config
 from support.lib import get_intent
@@ -114,12 +114,43 @@ def update_local_media(metadata, media, media_type="movies"):
            pass


+def agent_extract_embedded(videos):
+    try:
+        subtitle_storage = get_subtitle_storage()
+
+        for video in videos:
+            item = video["item"]
+            stored_subs = subtitle_storage.load_or_new(item)
+
+            for part in get_all_parts(item):
+                for requested_language in config.lang_list:
+                    embedded_subs = stored_subs.get_by_provider(part.id, requested_language, "embedded")
+                    current = stored_subs.get_any(part.id, requested_language)
+                    if not embedded_subs:
+                        stream_data = get_embedded_subtitle_streams(part, requested_language=requested_language,
+                                                                    get_forced=config.forced_only)
+
+                        if stream_data:
+                            stream = stream_data[0]["stream"]
+
+                            extract_embedded_sub(rating_key=item.rating_key, part_id=part.id,
+                                                 stream_index=str(stream.index),
+                                                 language=str(requested_language), with_mods=True, refresh=False,
+                                                 set_current=not current)
+                    else:
+                        Log.Debug("Skipping embedded subtitle extraction for %s, already got %r from %s",
+                                  item.rating_key, requested_language, embedded_subs[0].id)
+    except:
+        Log.Error("Something went wrong when auto-extracting subtitles, continuing: %s", traceback.format_exc())
+
+
 class SubZeroAgent(object):
    agent_type = None
    agent_type_verbose = None
    languages = [Locale.Language.English]
    primary_provider = False
    score_prefs_key = None
+    debounce = 10

    def __init__(self, *args, **kwargs):
        super(SubZeroAgent, self).__init__(*args, **kwargs)
@@ -130,7 +161,14 @@ class SubZeroAgent(object):
        Log.Debug("Sub-Zero %s, %s search" % (config.version, self.agent_type))
        results.Append(MetadataSearchResult(id='null', score=100))

+    def store_blank_subtitle_metadata(self, video_part_map):
+        store_subtitle_info(video_part_map, dict((k, []) for k in video_part_map.keys()), None, mode="a")
+
    def update(self, metadata, media, lang):
+        if not config.enable_agent:
+            Log.Debug("Skipping Sub-Zero agent(s)")
+            return
+
        Log.Debug("Sub-Zero %s, %s update called" % (config.version, self.agent_type))
        intent = get_intent()

@@ -167,36 +205,71 @@ class SubZeroAgent(object):
            set_refresh_menu_state(media, media_type=self.agent_type)

            # scanned_video_part_map = {subliminal.Video: plex_part, ...}
-            scanned_video_part_map = scan_videos(videos, kind=self.agent_type)
+            providers = config.get_providers(media_type=self.agent_type)
+            scanned_video_part_map = scan_videos(videos, providers=providers)
+
+            # auto extract embedded
+            if config.embedded_auto_extract:
+                agent_extract_embedded(videos)
+
+            # clear missing subtitles menu data
+            if not scheduler.is_task_running("MissingSubtitles"):
+                scheduler.clear_task_data("MissingSubtitles")

            downloaded_subtitles = None
-            if not config.enable_agent:
-                Log.Debug("Skipping Sub-Zero agent(s)")

-            else:
-                # downloaded_subtitles = {subliminal.Video: [subtitle, subtitle, ...]}
-                downloaded_subtitles = download_best_subtitles(scanned_video_part_map, min_score=use_score)
-                item_ids = get_media_item_ids(media, kind=self.agent_type)
+            # debounce for self.debounce seconds
+            now = datetime.datetime.now()
+            if "last_call" in Dict:
+                last_call = Dict["last_call"]
+                if last_call + datetime.timedelta(seconds=self.debounce) > now:
+                    wait = self.debounce - (now - last_call).seconds
+                    if wait >= 1:
+                        Log.Debug("Waiting %s seconds until continuing", wait)
+                        Thread.Sleep(wait)
+
+            # downloaded_subtitles = {subliminal.Video: [subtitle, subtitle, ...]}
+            try:
+                downloaded_subtitles = download_best_subtitles(scanned_video_part_map, min_score=use_score,
+                                                               throttle_time=self.debounce, providers=providers)
+            except:
+                Log.Exception("Something went wrong when downloading subtitles")
+
+            if downloaded_subtitles is not None:
+                Dict["last_call"] = datetime.datetime.now()
+
+            item_ids = get_media_item_ids(media, kind=self.agent_type)

            downloaded_any = False
            if downloaded_subtitles:
                downloaded_any = any(downloaded_subtitles.values())

            if downloaded_any:
-                save_subtitles(scanned_video_part_map, downloaded_subtitles, mods=config.default_mods)
+                save_successful = False
+                try:
+                    save_successful = save_subtitles(scanned_video_part_map, downloaded_subtitles,
+                                                     mods=config.default_mods)
+                except:
+                    Log.Exception("Something went wrong when saving subtitles")
+
                track_usage("Subtitle", "refreshed", "download", 1)

-                for video, video_subtitles in downloaded_subtitles.items():
-                    # store item(s) in history
-                    for subtitle in video_subtitles:
-                        item_title = get_title_for_video_metadata(video.plexapi_metadata, add_section_title=False)
-                        history = get_history()
-                        history.add(item_title, video.id, section_title=video.plexapi_metadata["section"],
-                                    subtitle=subtitle)
+                # store SZ meta info even if download wasn't successful
+                if not save_successful:
+                    self.store_blank_subtitle_metadata(scanned_video_part_map)
+
+                else:
+                    for video, video_subtitles in downloaded_subtitles.items():
+                        # store item(s) in history
+                        for subtitle in video_subtitles:
+                            item_title = get_title_for_video_metadata(video.plexapi_metadata, add_section_title=False)
+                            history = get_history()
+                            history.add(item_title, video.id, section_title=video.plexapi_metadata["section"],
+                                        subtitle=subtitle)
+                            history.destroy()
            else:
-                # store subtitle info even if we've downloaded none
-                store_subtitle_info(scanned_video_part_map, dict((k, []) for k in scanned_video_part_map.keys()),
-                                    None, mode="a")
+                # store SZ meta info even if we've downloaded none
+                self.store_blank_subtitle_metadata(scanned_video_part_map)

            update_local_media(metadata, media, media_type=self.agent_type)

@@ -213,6 +286,10 @@ class SubZeroAgent(object):

            Dict.Save()

+            # fsync cache
+            if config.new_style_cache:
+                config.sync_cache()
+

 class SubZeroSubtitlesAgentMovies(SubZeroAgent, Agent.Movies):
    contributes_to = ['com.plexapp.agents.imdb', 'com.plexapp.agents.xbmcnfo', 'com.plexapp.agents.themoviedb', 'com.plexapp.agents.hama']
@@ -8,7 +8,7 @@ import urlparse

 from zipfile import ZipFile, ZIP_DEFLATED

-from babelfish import Language
+from subzero.language import Language

 from subzero.lib.io import FileIO
 from subzero.constants import PREFIX, PLUGIN_IDENTIFIER
@@ -49,6 +49,10 @@ def AdvancedMenu(randomize=None, header=None, message=None):
        key=Callback(TriggerBetterSubtitles, randomize=timestamp()),
        title=pad_title("Trigger find better subtitles"),
    ))
+    oc.add(DirectoryObject(
+        key=Callback(SkipFindBetterSubtitles, randomize=timestamp()),
+        title=pad_title("Skip next find better subtitles (sets last run to now)"),
+    ))
    oc.add(DirectoryObject(
        key=Callback(TriggerStorageMaintenance, randomize=timestamp()),
        title=pad_title("Trigger subtitle storage maintenance"),
@@ -57,6 +61,10 @@ def AdvancedMenu(randomize=None, header=None, message=None):
        key=Callback(TriggerStorageMigration, randomize=timestamp()),
        title=pad_title("Trigger subtitle storage migration (expensive)"),
    ))
+    oc.add(DirectoryObject(
+        key=Callback(TriggerCacheMaintenance, randomize=timestamp()),
+        title=pad_title("Trigger cache maintenance (refiners, providers and packs/archives)"),
+    ))
    oc.add(DirectoryObject(
        key=Callback(ApplyDefaultMods, randomize=timestamp()),
        title=pad_title("Apply configured default subtitle mods to all (active) stored subtitles"),
@@ -89,6 +97,10 @@ def AdvancedMenu(randomize=None, header=None, message=None):
        key=Callback(InvalidateCache, randomize=timestamp()),
        title=pad_title("Invalidate Sub-Zero metadata caches (subliminal)"),
    ))
+    oc.add(DirectoryObject(
+        key=Callback(ResetProviderThrottle, randomize=timestamp()),
+        title=pad_title("Reset provider throttle states"),
+    ))
    return oc


@@ -158,6 +170,20 @@ def TriggerBetterSubtitles(randomize=None):
    )


+
+@route(PREFIX + '/skipbetter')
+@debounce
+def SkipFindBetterSubtitles(randomize=None):
+    task = scheduler.task("FindBetterSubtitles")
+    task.last_run = datetime.datetime.now()
+
+    return AdvancedMenu(
+        randomize=timestamp(),
+        header='Success',
+        message='FindBetterSubtitles skipped'
+    )
+
+
@route(PREFIX + '/triggermaintenance')
@debounce
 def TriggerStorageMaintenance(randomize=None):
@@ -180,6 +206,17 @@ def TriggerStorageMigration(randomize=None):
    )


+@route(PREFIX + '/triggercachemaintenance')
+@debounce
+def TriggerCacheMaintenance(randomize=None):
+    scheduler.dispatch_task("CacheMaintenance")
+    return AdvancedMenu(
+        randomize=timestamp(),
+        header='Success',
+        message='TriggerCacheMaintenance triggered'
+    )
+
+
 def apply_default_mods(reapply_current=False):
    storage = get_subtitle_storage()
    subs_applied = 0
@@ -264,7 +301,7 @@ def GetLogsLink():

    elif "Referer" in req_headers:
        parsed = urlparse.urlparse(req_headers["Referer"])
-        link_base = "%s://%s:%s" % (parsed.scheme, parsed.hostname, parsed.port)
+        link_base = "%s://%s%s" % (parsed.scheme, parsed.hostname, (":%s" % parsed.port) if parsed.port else "")
        Log.Debug("Using referer-based link_base")
        get_external_ip = False

@@ -300,7 +337,10 @@ def DownloadLogs():
@debounce
 def InvalidateCache(randomize=None):
    from subliminal.cache import region
-    region.invalidate()
+    if config.new_style_cache:
+        region.backend.clear()
+    else:
+        region.invalidate()
    return AdvancedMenu(
        randomize=timestamp(),
        header='Success',
@@ -338,3 +378,14 @@ def ClearPin(randomize=None):
    Dict["pin_correct_time"] = None
    config.locked = True
    return fatality(force_title="Menu locked", header=" ", no_history=True)
+
+
+@route(PREFIX + '/reset_throttle')
+def ResetProviderThrottle(randomize=None):
+    Dict["provider_throttle"] = {}
+    Dict.Save()
+    return AdvancedMenu(
+        randomize=timestamp(),
+        header='Success',
+        message='Provider throttles reset'
+    )
@@ -1,17 +1,21 @@
 # coding=utf-8
 import os
+import subprocess
+import traceback
+
+from subzero.language import Language

 from sub_mod import SubtitleModificationsMenu
 from menu_helpers import debounce, SubFolderObjectContainer, default_thumb, add_ignore_options, get_item_task_data, \
-    set_refresh_menu_state, route
+    set_refresh_menu_state, route, extract_embedded_sub

 from refresh_item import RefreshItem
 from subzero.constants import PREFIX
-from support.config import config
-from support.helpers import timestamp, cast_bool, df, get_language
-from support.items import get_item_kind_from_rating_key, get_item, get_current_sub
-from support.lib import Plex
-from support.plex_media import get_plex_metadata, scan_videos, PMSMediaProxy
+from support.config import config, TEXT_SUBTITLE_EXTS
+from support.helpers import timestamp, df, get_language, display_language, quote_args, get_language_from_stream
+from support.items import get_item_kind_from_rating_key, get_item, get_current_sub, get_item_title, save_stored_sub
+from support.plex_media import get_plex_metadata, get_part, get_embedded_subtitle_streams
+from support.scanning import scan_videos
 from support.scheduler import scheduler
 from support.storage import get_subtitle_storage

@@ -20,7 +24,7 @@ from support.storage import get_subtitle_storage

@route(PREFIX + '/item/{rating_key}/actions')
@debounce
-def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, randomize=None):
+def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, randomize=None, header=None):
    """
    displays the item details menu of an item that doesn't contain any deeper tree, such as a movie or an episode
    :param rating_key:
@@ -33,12 +37,22 @@ def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, ra
    from interface.main import IgnoreMenu

    title = unicode(base_title) + " > " + unicode(title) if base_title else unicode(title)
-    item = get_item(rating_key)
+    item = plex_item = get_item(rating_key)
    current_kind = get_item_kind_from_rating_key(rating_key)

    timeout = 30

-    oc = SubFolderObjectContainer(title2=title, replace_parent=True)
+    oc = SubFolderObjectContainer(title2=title, replace_parent=True, header=header)
+
+    if not item:
+        oc.add(DirectoryObject(
+            key=Callback(ItemDetailsMenu, rating_key=rating_key, title=title, base_title=base_title,
+                         item_title=item_title, randomize=timestamp()),
+            title=u"Item not found: %s!" % item_title,
+            summary="Plex didn't return any information about the item, please refresh it and come back later",
+            thumb=default_thumb
+        ))
+        return oc

    # add back to season for episode
    if current_kind == "episode":
@@ -74,9 +88,6 @@ def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, ra
    subtitle_storage = get_subtitle_storage()
    stored_subs = subtitle_storage.load_or_new(item)

-    # get the plex item
-    plex_item = get_item(rating_key)
-
    # look for subtitles for all available media parts and all of their languages
    has_multiple_parts = len(plex_item.media) > 1
    part_index = 0
@@ -89,6 +100,12 @@ def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, ra
            part_id = str(part.id)
            part_index += 1

+            part_index_addon = ""
+            part_summary_addon = ""
+            if has_multiple_parts:
+                part_index_addon = u"File %s: " % part_index
+                part_summary_addon = "%s " % filename
+
            # iterate through all configured languages
            for lang in config.lang_list:
                # get corresponding stored subtitle data for that media part (physical media item), for language
@@ -96,12 +113,6 @@ def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, ra
                current_sub_id = None
                current_sub_provider_name = None

-                part_index_addon = ""
-                part_summary_addon = ""
-                if has_multiple_parts:
-                    part_index_addon = u"File %s: " % part_index
-                    part_summary_addon = "%s " % filename
-
                summary = u"%sNo current subtitle in storage" % part_summary_addon
                current_score = None
                if current_sub:
@@ -111,45 +122,77 @@ def ItemDetailsMenu(rating_key, title=None, base_title=None, item_title=None, ra

                    summary = u"%sCurrent subtitle: %s (added: %s, %s), Language: %s, Score: %i, Storage: %s" % \
                              (part_summary_addon, current_sub.provider_name, df(current_sub.date_added),
-                               current_sub.mode_verbose, lang, current_sub.score, current_sub.storage_type)
+                               current_sub.mode_verbose, display_language(lang), current_sub.score,
+                               current_sub.storage_type)

                    oc.add(DirectoryObject(
                        key=Callback(SubtitleOptionsMenu, rating_key=rating_key, part_id=part_id, title=title,
-                                     item_title=item_title, language=lang, language_name=lang.name,
+                                     item_title=item_title, language=lang, language_name=display_language(lang),
                                     current_id=current_sub_id,
                                     item_type=plex_item.type, filename=filename, current_data=summary,
                                     randomize=timestamp(), current_provider=current_sub_provider_name,
                                     current_score=current_score),
-                        title=u"%sActions for %s subtitle" % (part_index_addon, lang.name),
+                        title=u"%sManage %s subtitle" % (part_index_addon, display_language(lang)),
                        summary=summary
                    ))
                else:
                    oc.add(DirectoryObject(
                        key=Callback(ListAvailableSubsForItemMenu, rating_key=rating_key, part_id=part_id, title=title,
-                                     item_title=item_title, language=lang, language_name=lang.name,
+                                     item_title=item_title, language=lang, language_name=display_language(lang),
                                     current_id=current_sub_id,
                                     item_type=plex_item.type, filename=filename, current_data=summary,
                                     randomize=timestamp(), current_provider=current_sub_provider_name,
                                     current_score=current_score),
-                        title=u"%sList %s subtitles" % (part_index_addon, lang.name),
+                        title=u"%sList %s subtitles" % (part_index_addon, display_language(lang)),
                        summary=summary
                    ))

-    add_ignore_options(oc, "videos", title=item_title, rating_key=rating_key, callback_menu=IgnoreMenu)
+            if config.plex_transcoder:
+                # embedded subtitles
+                embedded_count = 0
+                embedded_langs = []
+                for stream in part.streams:
+                    # subtitle stream
+                    if stream.stream_type == 3 and not stream.stream_key and stream.codec in TEXT_SUBTITLE_EXTS:
+                        lang = get_language_from_stream(stream.language_code)
+
+                        if not lang and config.treat_und_as_first:
+                            lang = list(config.lang_list)[0]
+
+                        if lang:
+                            embedded_langs.append(lang)
+                            embedded_count += 1
+
+                if embedded_count:
+                    oc.add(DirectoryObject(
+                        key=Callback(ListEmbeddedSubsForItemMenu, rating_key=rating_key, part_id=part_id, title=title,
+                                     item_type=plex_item.type, item_title=item_title, base_title=base_title,
+                                     randomize=timestamp()),
+                        title=u"%sEmbedded subtitles (%s)" % (part_index_addon, ", ".join(display_language(l) for l in
+                                                                                          set(embedded_langs))),
+                        summary=u"Extract and activate embedded subtitle streams"
+                    ))
+
+    ignore_title = item_title
+    if current_kind == "episode":
+        ignore_title = get_item_title(item)
+    add_ignore_options(oc, "videos", title=ignore_title, rating_key=rating_key, callback_menu=IgnoreMenu)
    subtitle_storage.destroy()

    return oc


-@route(PREFIX + '/item/current_sub/{rating_key}/{part_id}', force=bool)
-@debounce
+@route(PREFIX + '/item/current_sub/{rating_key}/{part_id}')
 def SubtitleOptionsMenu(**kwargs):
-    oc = SubFolderObjectContainer(title2=kwargs["title"], replace_parent=True)
+    oc = SubFolderObjectContainer(title2=unicode(kwargs["title"]), replace_parent=True, header=kwargs.get("header"),
+                                  message=kwargs.get("message"))
    rating_key = kwargs["rating_key"]
    part_id = kwargs["part_id"]
    language = kwargs["language"]
+    current_data = kwargs["current_data"]

    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    subs_count = stored_subs.count(part_id, language)
    kwargs.pop("randomize")

    oc.add(DirectoryObject(
@@ -159,22 +202,235 @@ def SubtitleOptionsMenu(**kwargs):
        summary=kwargs["current_data"],
        thumb=default_thumb
    ))
+    if subs_count:
+        oc.add(DirectoryObject(
+            key=Callback(ListStoredSubsForItemMenu, randomize=timestamp(), **kwargs),
+            title=u"Select active %s subtitle" % kwargs["language_name"],
+            summary=u"%d subtitles in storage" % subs_count
+        ))
+
    oc.add(DirectoryObject(
        key=Callback(ListAvailableSubsForItemMenu, randomize=timestamp(), **kwargs),
-        title=u"List %s subtitles" % kwargs["language_name"],
+        title=u"List available %s subtitles" % kwargs["language_name"],
        summary=kwargs["current_data"]
    ))
    if current_sub:
        oc.add(DirectoryObject(
            key=Callback(SubtitleModificationsMenu, randomize=timestamp(), **kwargs),
-            title=u"Modify %s subtitle" % kwargs["language_name"],
+            title=u"Modify current %s subtitle" % kwargs["language_name"],
            summary=u"Currently applied mods: %s" % (", ".join(current_sub.mods) if current_sub.mods else "none")
        ))

+        if current_sub.provider_name != "embedded":
+            oc.add(DirectoryObject(
+                key=Callback(BlacklistSubtitleMenu, randomize=timestamp(), **kwargs),
+                title=u"Blacklist current %s subtitle and search for a new one" % kwargs["language_name"],
+                summary=current_data
+            ))
+
+        current_bl, subs = stored_subs.get_blacklist(part_id, language)
+        if current_bl:
+            oc.add(DirectoryObject(
+                key=Callback(ManageBlacklistMenu, randomize=timestamp(), **kwargs),
+                title=u"Manage blacklist (%s contained)" % len(current_bl),
+                summary=u"Inspect currently blacklisted subtitles"
+            ))
+
    storage.destroy()
    return oc


+@route(PREFIX + '/item/list_stored_subs/{rating_key}/{part_id}')
+def ListStoredSubsForItemMenu(**kwargs):
+    """
+    lists the subtitles held in storage for one part and language, newest first; every entry links to
+    SelectStoredSubForItemMenu
+    :param kwargs: needs title, rating_key, part_id, language (IETF code) and randomize; whatever remains after
+                   randomize has been removed is passed on to the per-subtitle callbacks
+    :return: the populated SubFolderObjectContainer
+    """
+    oc = SubFolderObjectContainer(title2=unicode(kwargs["title"]), replace_parent=True)
+    rating_key = kwargs["rating_key"]
+    part_id = kwargs["part_id"]
+    language = Language.fromietf(kwargs["language"])
+
+    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    try:
+        all_subs = stored_subs.get_all(part_id, language)
+        kwargs.pop("randomize")
+
+        # the "current" entry is the key of the active subtitle, not a subtitle itself; look it up once and
+        # tolerate it being absent instead of failing with a KeyError inside the loop
+        current_key = all_subs["current"] if "current" in all_subs else None
+
+        for key, subtitle in sorted(filter(lambda x: x[0] != "current", all_subs.items()),
+                                    key=lambda x: x[1].date_added, reverse=True):
+            is_current = key == current_key
+
+            summary = u"added: %s, %s, Language: %s, Score: %i, Storage: %s" % \
+                      (df(subtitle.date_added),
+                       subtitle.mode_verbose, display_language(language), subtitle.score,
+                       subtitle.storage_type)
+
+            sub_name = subtitle.provider_name
+            if sub_name == "embedded":
+                # embedded subtitles all share one provider name; the id tells them apart
+                sub_name += " (%s)" % subtitle.id
+
+            oc.add(DirectoryObject(
+                key=Callback(SelectStoredSubForItemMenu, randomize=timestamp(), sub_key="__".join(key), **kwargs),
+                title=u"%s%s, Score: %s" % ("Current: " if is_current else "Stored: ", sub_name,
+                                            subtitle.score),
+                summary=summary
+            ))
+    finally:
+        # release the storage handle like the other menus do, even when building the list fails
+        storage.destroy()
+
+    return oc
+
+
+@route(PREFIX + '/item/set_current_sub/{rating_key}/{part_id}')
+@debounce
+def SelectStoredSubForItemMenu(**kwargs):
+    """
+    marks a stored subtitle as the current one for a part/language, saves it via save_stored_sub and returns to
+    SubtitleOptionsMenu with a success header
+    :param kwargs: needs rating_key, part_id, language (IETF code), item_type, sub_key ("__"-joined storage key)
+                   and randomize; the rest is handed to SubtitleOptionsMenu
+    :return: the result of SubtitleOptionsMenu
+    """
+    rating_key = kwargs["rating_key"]
+    part_id = kwargs["part_id"]
+    language = Language.fromietf(kwargs["language"])
+    item_type = kwargs["item_type"]
+    # the key was "__"-joined for transport; storage keys are tuples
+    sub_key = tuple(kwargs.pop("sub_key").split("__"))
+
+    plex_item = get_item(rating_key)
+    storage = get_subtitle_storage()
+    try:
+        stored_subs = storage.load(plex_item.rating_key)
+
+        subtitles = stored_subs.get_all(part_id, language)
+        subtitle = subtitles[sub_key]
+
+        subtitles["current"] = sub_key
+
+        save_stored_sub(subtitle, rating_key, part_id, language, item_type, plex_item=plex_item, storage=storage,
+                        stored_subs=stored_subs)
+    finally:
+        # don't leave the storage handle open when the key is stale or saving fails
+        storage.destroy()
+
+    kwargs.pop("randomize")
+
+    kwargs["header"] = 'Success'
+    kwargs["message"] = 'Subtitle saved to disk'
+
+    return SubtitleOptionsMenu(randomize=timestamp(), **kwargs)
+
+
+@route(PREFIX + '/item/blacklist_recent/{language}')
+@route(PREFIX + '/item/blacklist_recent')
+def BlacklistRecentSubtitleMenu(**kwargs):
+    """
+    blacklists the current subtitles of the first entry of Dict["last_played_items"] by delegating to
+    BlacklistAllPartsSubtitleMenu
+    :param kwargs: passed through; rating_key is set here
+    :return: None if there are no last played items, otherwise the result of BlacklistAllPartsSubtitleMenu
+    """
+    has_played_items = "last_played_items" in Dict and Dict["last_played_items"]
+    if not has_played_items:
+        return
+
+    kwargs["rating_key"] = Dict["last_played_items"][0]
+    return BlacklistAllPartsSubtitleMenu(**kwargs)
+
+
+@route(PREFIX + '/item/blacklist_all/{rating_key}/{language}')
+@route(PREFIX + '/item/blacklist_all/{rating_key}')
+def BlacklistAllPartsSubtitleMenu(**kwargs):
+    """
+    blacklists the current subtitle of every part of an item, optionally restricted to one language, then
+    triggers a forced refresh of the item
+    :param kwargs: rating_key of the item; optional language (IETF code) to restrict the blacklisting to
+    :return: None if the item can't be found, otherwise the result of RefreshItem
+    """
+    rating_key = kwargs.get("rating_key")
+    language = kwargs.get("language")
+    if language:
+        language = Language.fromietf(language)
+
+    item = get_item(rating_key)
+
+    if not item:
+        return
+
+    item_title = get_item_title(item)
+
+    # computed once and kept apart from the loop variables below: the per-language loop used to rebind
+    # `language`, which turned "all languages" into "last language of the previous part" from part two on
+    wanted_key = str(language) if language else None
+
+    subtitle_storage = get_subtitle_storage()
+    try:
+        stored_subs = subtitle_storage.load_or_new(item)
+        for part_id, languages in stored_subs.parts.iteritems():
+            if wanted_key is None:
+                sub_dict = languages
+            elif wanted_key in languages:
+                sub_dict = {wanted_key: languages[wanted_key]}
+            else:
+                continue
+
+            for lang_key, subs in sub_dict.iteritems():
+                if "current" in subs:
+                    stored_subs.blacklist(part_id, lang_key, subs["current"])
+                    Log.Info("Added %s to blacklist", subs["current"])
+
+        subtitle_storage.save(stored_subs)
+    finally:
+        # always release the storage handle, also when loading or saving fails
+        subtitle_storage.destroy()
+
+    return RefreshItem(rating_key=rating_key, item_title=item_title, force=True, randomize=timestamp(), timeout=30000)
+
+
+def blacklist(rating_key, part_id, language):
+    """
+    puts the current subtitle of the given part/language on the blacklist and saves the storage
+    :param rating_key: rating key of the item
+    :param part_id: id of the part
+    :param language: language, passed to get_current_sub and the blacklist as given
+    :return: True if a subtitle was blacklisted, None if there is no current subtitle
+    """
+    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    try:
+        if not current_sub:
+            return
+
+        stored_subs.blacklist(part_id, language, current_sub.key)
+        storage.save(stored_subs)
+    finally:
+        # the early return above used to skip this and leave the storage handle open
+        storage.destroy()
+
+    Log.Info("Added %s to blacklist", current_sub.key)
+
+    return True
+
+
+@route(PREFIX + '/item/blacklist/{rating_key}/{part_id}')
+@debounce
+def BlacklistSubtitleMenu(**kwargs):
+    """
+    blacklists the current subtitle of one part/language and triggers a forced refresh of the item
+    :param kwargs: needs rating_key, part_id, language, item_title and randomize
+    :return: the result of RefreshItem
+    """
+    rating_key, part_id, language, item_title = [kwargs[name] for name in
+                                                 ("rating_key", "part_id", "language", "item_title")]
+
+    blacklist(rating_key, part_id, language)
+    kwargs.pop("randomize")
+
+    return RefreshItem(
+        rating_key=rating_key,
+        item_title=item_title,
+        force=True,
+        randomize=timestamp(),
+        timeout=30000
+    )
+
+
+@route(PREFIX + '/item/manage_blacklist/{rating_key}/{part_id}', force=bool)
+@debounce
+def ManageBlacklistMenu(**kwargs):
+    """
+    shows the blacklisted subtitles of a part/language, most recently added first; selecting an entry calls this
+    menu again with remove_sub_key set, which takes that subtitle off the blacklist
+    :param kwargs: needs title, item_title, rating_key, part_id, language, current_data and randomize; optional
+                   remove_sub_key ("__"-joined blacklist key)
+    :return: the populated SubFolderObjectContainer
+    """
+    oc = SubFolderObjectContainer(title2=unicode(kwargs["title"]), replace_parent=True)
+    rating_key, part_id, language = kwargs["rating_key"], kwargs["part_id"], kwargs["language"]
+    key_to_remove = kwargs.pop("remove_sub_key", None)
+
+    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    current_bl, subs = stored_subs.get_blacklist(part_id, language)
+
+    if key_to_remove:
+        # the key was "__"-joined for transport; blacklist keys are tuples
+        key_to_remove = tuple(key_to_remove.split("__"))
+        stored_subs.blacklist(part_id, language, key_to_remove, add=False)
+        storage.save(stored_subs)
+        Log.Info("Removed %s from blacklist", key_to_remove)
+
+    kwargs.pop("randomize")
+
+    back_entry = DirectoryObject(
+        key=Callback(ItemDetailsMenu, rating_key=kwargs["rating_key"], item_title=kwargs["item_title"],
+                     title=kwargs["title"], randomize=timestamp()),
+        title=u"< Back to %s" % kwargs["title"],
+        summary=kwargs["current_data"],
+        thumb=default_thumb
+    )
+    oc.add(back_entry)
+
+    def added_on(entry):
+        # a named function, because the RestrictedModule parser messes with lambda (x, y)
+        return entry[1]["date_added"]
+
+    title_format = u"%s, %s (added: %s, %s), Language: %s, Score: %i, Storage: %s"
+
+    for bl_key, info in sorted(current_bl.iteritems(), key=added_on, reverse=True):
+        provider_name, subtitle_id = bl_key
+        entry_title = title_format % (provider_name, subtitle_id, df(info["date_added"]),
+                                      current_sub.get_mode_verbose(info["mode"]),
+                                      display_language(Language.fromietf(language)), info["score"],
+                                      info["storage_type"])
+        oc.add(DirectoryObject(
+            key=Callback(ManageBlacklistMenu, remove_sub_key="__".join(bl_key), randomize=timestamp(), **kwargs),
+            title=entry_title,
+            summary=u"Remove subtitle from blacklist"
+        ))
+
+    storage.destroy()
+
+    return oc
+
+
@route(PREFIX + '/item/search/{rating_key}/{part_id}', force=bool)
@debounce
 def ListAvailableSubsForItemMenu(rating_key=None, part_id=None, title=None, item_title=None, filename=None,
@@ -200,18 +456,22 @@ def ListAvailableSubsForItemMenu(rating_key=None, part_id=None, title=None, item
    ))

    metadata = get_plex_metadata(rating_key, part_id, item_type)
-    scanned_parts = scan_videos([metadata], kind="series" if item_type == "episode" else "movie", ignore_all=True)
+    plex_part = None
+    if not config.low_impact_mode:
+        scanned_parts = scan_videos([metadata], ignore_all=True)

-    if not scanned_parts:
-        Log.Error("Couldn't list available subtitles for %s", rating_key)
-        return oc
+        if not scanned_parts:
+            Log.Error("Couldn't list available subtitles for %s", rating_key)
+            return oc

-    video, plex_part = scanned_parts.items()[0]
+        video, plex_part = scanned_parts.items()[0]

-    video_display_data = [video.format] if video.format else []
-    if video.release_group:
-        video_display_data.append(u"by %s" % video.release_group)
-    video_display_data = " ".join(video_display_data)
+        video_display_data = [video.format] if video.format else []
+        if video.release_group:
+            video_display_data.append(u"by %s" % video.release_group)
+        video_display_data = " ".join(video_display_data)
+    else:
+        video_display_data = metadata["filename"]

    current_display = (u"Current: %s (%s) " % (current_provider, current_score) if current_provider else "")
    if not running:
@@ -243,7 +503,8 @@ def ListAvailableSubsForItemMenu(rating_key=None, part_id=None, title=None, item
                         part_id=part_id, title=title, current_id=current_id, item_type=item_type,
                         current_provider=current_provider, current_score=current_score,
                         randomize=timestamp()),
-            title=u"Searching for %s subs (%s), refresh here ..." % (get_language(language).name, video_display_data),
+            title=u"Searching for %s subs (%s), refresh here ..." % (display_language(get_language(language)),
+                                                                     video_display_data),
            summary=u"%sFilename: %s" % (current_display, filename),
            thumb=default_thumb
        ))
@@ -251,25 +512,35 @@ def ListAvailableSubsForItemMenu(rating_key=None, part_id=None, title=None, item
    if not search_results or search_results == "found_none":
        return oc

+    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    current_bl, subs = stored_subs.get_blacklist(part_id, language)
+
    seen = []
    for subtitle in search_results:
        if subtitle.id in seen:
            continue

+        bl_addon = ""
+        if (str(subtitle.provider_name), str(subtitle.id)) in current_bl:
+            bl_addon = "Blacklisted "
+
        wrong_fps_addon = ""
        if subtitle.wrong_fps:
-            wrong_fps_addon = " (wrong FPS, sub: %s, media: %s)" % (subtitle.fps, plex_part.fps)
+            if plex_part:
+                wrong_fps_addon = " (wrong FPS, sub: %s, media: %s)" % (subtitle.fps, plex_part.fps)
+            else:
+                wrong_fps_addon = " (wrong FPS, sub: %s, media: unknown, low impact mode)" % subtitle.fps

        oc.add(DirectoryObject(
            key=Callback(TriggerDownloadSubtitle, rating_key=rating_key, randomize=timestamp(), item_title=item_title,
                         subtitle_id=str(subtitle.id), language=language),
-            title=u"%s: %s, score: %s%s" % ("Available" if current_id != subtitle.id else "Current",
-                                            subtitle.provider_name, subtitle.score, wrong_fps_addon),
+            title=u"%s%s: %s, score: %s%s" % (bl_addon, "Available" if current_id != subtitle.id else "Current",
+                                              subtitle.provider_name, subtitle.score, wrong_fps_addon),
            summary=u"Release: %s, Matches: %s" % (subtitle.release_info, ", ".join(subtitle.matches)),
            thumb=default_thumb
        ))

-        seen.append(current_id)
+        seen.append(subtitle.id)

    return oc

@@ -296,3 +567,74 @@ def TriggerDownloadSubtitle(rating_key=None, subtitle_id=None, item_title=None,
    scheduler.clear_task_data("AvailableSubsForItem")

    return fatality(randomize=timestamp(), header=" ", replace_parent=True)
+
+
+@route(PREFIX + '/item/embedded/{rating_key}/{part_id}')
+def ListEmbeddedSubsForItemMenu(**kwargs):
+    """
+    lists the embedded subtitle streams of a part that have a language; each stream gets two entries, one
+    extracting it with default mods and one extracting it without
+    :param kwargs: needs rating_key, part_id, title, item_title, base_title and randomize; whatever remains after
+                   randomize has been removed is passed on to the extraction callbacks
+    :return: the populated SubFolderObjectContainer
+    """
+    rating_key = kwargs["rating_key"]
+    part_id = kwargs["part_id"]
+    title = kwargs["title"]
+    kwargs.pop("randomize")
+
+    oc = SubFolderObjectContainer(title2=title, replace_parent=True)
+
+    back_entry = DirectoryObject(
+        key=Callback(ItemDetailsMenu, rating_key=kwargs["rating_key"], item_title=kwargs["item_title"],
+                     base_title=kwargs["base_title"], title=kwargs["item_title"], randomize=timestamp()),
+        title=u"< Back to %s" % kwargs["title"],
+        thumb=default_thumb
+    )
+    oc.add(back_entry)
+
+    part = get_part(get_item(rating_key), part_id)
+    if not part:
+        return oc
+
+    for stream_data in get_embedded_subtitle_streams(part, skip_duplicate_unknown=False):
+        language = stream_data["language"]
+        is_unknown = stream_data["is_unknown"]
+        stream = stream_data["stream"]
+
+        # streams without a language get no entries
+        if not language:
+            continue
+
+        # both entries share this description; only the first one gets the "with default mods" suffix
+        label = u"Extract stream %s, %s%s%s%s" % (stream.index, display_language(language),
+                                                  " (unknown)" if is_unknown else "",
+                                                  " (forced)" if stream.forced else "",
+                                                  " (\"%s\")" % stream.title if stream.title else "")
+
+        oc.add(DirectoryObject(
+            key=Callback(TriggerExtractEmbeddedSubForItemMenu, randomize=timestamp(),
+                         stream_index=str(stream.index), language=language, with_mods=True, **kwargs),
+            title=label + u" with default mods",
+        ))
+        oc.add(DirectoryObject(
+            key=Callback(TriggerExtractEmbeddedSubForItemMenu, randomize=timestamp(),
+                         stream_index=str(stream.index), language=language, **kwargs),
+            title=label,
+        ))
+
+    return oc
+
+
+@route(PREFIX + '/item/extract_embedded/{rating_key}/{part_id}/{stream_index}')
+@debounce
+def TriggerExtractEmbeddedSubForItemMenu(**kwargs):
+    """
+    starts extract_embedded_sub in a thread with the given kwargs and returns to ItemDetailsMenu with a header
+    announcing the extraction
+    :param kwargs: needs rating_key, item_title, item_type, language and randomize; part_id and stream_index are
+                   read for the header; with_mods is optional
+    :return: the result of ItemDetailsMenu
+    """
+    rating_key = kwargs["rating_key"]
+    part_id = kwargs.get("part_id")
+    stream_index = kwargs.get("stream_index")
+
+    # the worker receives the complete kwargs; the removals below only affect what ItemDetailsMenu gets
+    Thread.Create(extract_embedded_sub, **kwargs)
+    notice = u"Extracting of embedded subtitle %s of part %s:%s triggered" % (stream_index, rating_key, part_id)
+
+    for consumed in ("randomize", "item_type", "stream_index", "part_id"):
+        kwargs.pop(consumed)
+    # with_mods is only present on the "with default mods" entries
+    kwargs.pop("with_mods", False)
+    kwargs.pop("language")
+
+    kwargs["title"] = kwargs["item_title"]
+    kwargs["header"] = notice
+
+    return ItemDetailsMenu(randomize=timestamp(), **kwargs)
+
+
@@ -2,11 +2,10 @@

 from subzero.constants import PREFIX, TITLE, ART
 from support.config import config
-from support.helpers import pad_title, timestamp, df, get_plex_item_display_title
+from support.helpers import pad_title, timestamp, df, display_language
 from support.scheduler import scheduler
 from support.ignore import ignore_list
-from support.items import get_item_thumb, get_on_deck_items, get_all_items, get_items_info, get_item, \
-    get_item_kind_from_item
+from support.items import get_item_thumb, get_on_deck_items, get_all_items, get_items_info, get_item, get_item_title
 from menu_helpers import main_icon, debounce, SubFolderObjectContainer, default_thumb, dig_tree, add_ignore_options, \
    ObjectContainer, route, handler
 from item_details import ItemDetailsMenu
@@ -92,10 +91,9 @@ def fatality(randomize=None, force_title=None, header=None, message=None, only_r
        ))
        oc.add(DirectoryObject(
            key=Callback(RecentMissingSubtitlesMenu, randomize=timestamp()),
-            title="Items with missing subtitles",
-            summary="Shows the items honoring the configured 'Item age to be considered recent'-setting (%s)"
-                    " and allowing you to individually (force-) refresh their metadata/subtitles. " %
-                    Prefs["scheduler.item_is_recent_age"],
+            title="Show recently added items with missing subtitles",
+            summary="Lists items with missing subtitles. Click on \"Find recent items with missing subs\" "
+                    "to update list",
            thumb=R("icon-missing.jpg")
        ))
        oc.add(DirectoryObject(
@@ -112,9 +110,11 @@ def fatality(randomize=None, force_title=None, header=None, message=None, only_r
        if task.ready_for_display:
            task_state = "Running: %s/%s (%s%%)" % (task.items_done, task.items_searching, task.percentage)
        else:
-            task_state = "Last scheduler run: %s; Next scheduled run: %s; Last runtime: %s" % (
-                df(scheduler.last_run(task_name)) or "never",
-                df(scheduler.next_run(task_name)) or "never",
+            lr = scheduler.last_run(task_name)
+            nr = scheduler.next_run(task_name)
+            task_state = "Last run: %s; Next scheduled run: %s; Last runtime: %s" % (
+                df(scheduler.last_run(task_name)) if lr else "never",
+                df(scheduler.next_run(task_name)) if nr else "never",
                str(task.last_run_time).split(".")[0])

        oc.add(DirectoryObject(
@@ -158,6 +158,19 @@ def fatality(randomize=None, force_title=None, header=None, message=None, only_r
        ))

    if not only_refresh:
+        if "provider_throttle" in Dict and Dict["provider_throttle"].keys():
+            summary_data = []
+            for provider, data in Dict["provider_throttle"].iteritems():
+                reason, until, desc = data
+                summary_data.append("%s until %s (%s)" % (provider, until.strftime("%y/%m/%d %H:%M"), reason))
+
+            oc.add(DirectoryObject(
+                key=Callback(fatality, force_title=" ", randomize=timestamp()),
+                title=pad_title("Throttled providers: %s" % ", ".join(Dict["provider_throttle"].keys())),
+                summary=", ".join(summary_data),
+                thumb=R("icon-throttled.jpg")
+            ))
+
        oc.add(DirectoryObject(
            key=Callback(AdvancedMenu),
            title=pad_title("Advanced functions"),
@@ -187,15 +200,10 @@ def RecentlyPlayedMenu():
        if not item:
            continue

-        kind = get_item_kind_from_item(item)
-        if kind not in ("episode", "movie"):
+        if getattr(getattr(item, "__class__"), "__name__") not in ("Episode", "Movie"):
            continue

-        if kind == "episode":
-            item_title = get_plex_item_display_title(item, "show", parent=item.season, section_title=None,
-                                                     parent_title=item.show.title)
-        else:
-            item_title = get_plex_item_display_title(item, kind, section_title=None)
+        item_title = get_item_title(item)

        oc.add(DirectoryObject(
            title=item_title,
@@ -233,7 +241,7 @@ def RecentMissingSubtitlesMenu(force=False, randomize=None):
    if not running:
        oc.add(DirectoryObject(
            key=Callback(RecentMissingSubtitlesMenu, force=True, randomize=timestamp()),
-            title=u"Get items with missing subtitles",
+            title=u"Find recent items with missing subtitles",
            thumb=default_thumb
        ))
    else:
@@ -249,7 +257,7 @@ def RecentMissingSubtitlesMenu(force=False, randomize=None):
                key=Callback(ItemDetailsMenu, title=title + " > " + item_title, item_title=item_title,
                             rating_key=item_id),
                title=item_title,
-                summary="Missing: %s" % ", ".join(l.name for l in missing_languages),
+                summary="Missing: %s" % ", ".join(display_language(l) for l in missing_languages),
                thumb=get_item_thumb(item) or default_thumb
            ))

@@ -2,22 +2,28 @@
 import locale
 import logging
 import os
+import platform
+import traceback

 import logger
+import copy

+from requests import HTTPError
 from item_details import ItemDetailsMenu
 from refresh_item import RefreshItem
 from menu_helpers import add_ignore_options, dig_tree, set_refresh_menu_state, \
-    should_display_ignore, default_thumb, debounce, ObjectContainer, SubFolderObjectContainer, route
+    default_thumb, debounce, ObjectContainer, SubFolderObjectContainer, route, \
+    extract_embedded_sub
 from main import fatality, IgnoreMenu
 from advanced import DispatchRestart
 from subzero.constants import ART, PREFIX, DEPENDENCY_MODULE_NAMES
+from support.plex_media import get_all_parts, get_embedded_subtitle_streams
 from support.scheduler import scheduler
 from support.config import config
-from support.helpers import timestamp, df
+from support.helpers import timestamp, df, display_language
 from support.ignore import ignore_list
-from support.items import get_all_items, get_items_info, \
-    get_item_kind_from_rating_key, get_item
+from support.items import get_all_items, get_items_info, get_item_kind_from_rating_key, get_item, MI_KEY, get_item_title
+from support.storage import get_subtitle_storage

 # init GUI
 ObjectContainer.art = R(ART)
@@ -25,6 +31,7 @@ ObjectContainer.no_cache = True

 # default thumb for DirectoryObjects
 DirectoryObject.thumb = default_thumb
+Plugin.AddViewGroup("full_details", viewMode="InfoList", mediaType="items", type="list", summary=2)


@route(PREFIX + '/section/firstLetter/key', deeper=bool)
@@ -51,7 +58,7 @@ def FirstLetterMetadataMenu(rating_key, key, title=None, base_title=None, displa

@route(PREFIX + '/section/contents', display_items=bool)
 def MetadataMenu(rating_key, title=None, base_title=None, display_items=False, previous_item_type=None,
-                 previous_rating_key=None, randomize=None):
+                 previous_rating_key=None, header=None, randomize=None):
    """
    displays the contents of a section based on whether it has a deeper tree or not (movies->movie (item) list; series->series list)
    :param rating_key:
@@ -65,16 +72,18 @@ def MetadataMenu(rating_key, title=None, base_title=None, display_items=False, p
    title = unicode(title)
    item_title = title
    title = base_title + " > " + title
-    oc = SubFolderObjectContainer(title2=title, no_cache=True, no_history=True)
+    oc = SubFolderObjectContainer(title2=title, no_cache=True, no_history=True, header=header,
+                                  view_group="full_details")

    current_kind = get_item_kind_from_rating_key(rating_key)

    if display_items:
        timeout = 30
+        show = None

        # add back to series for season
        if current_kind == "season":
-            timeout = 360
+            timeout = 720

            show = get_item(previous_rating_key)
            oc.add(DirectoryObject(
@@ -84,16 +93,43 @@ def MetadataMenu(rating_key, title=None, base_title=None, display_items=False, p
                thumb=show.thumb or default_thumb
            ))
        elif current_kind == "series":
-            timeout = 1800
+            # it shouldn't take more than 6 minutes to scan all of a series' files and determine the force refresh
+            timeout = 3600

        items = get_all_items(key="children", value=rating_key, base="library/metadata")
        kind, deeper = get_items_info(items)
        dig_tree(oc, items, MetadataMenu,
                 pass_kwargs={"base_title": title, "display_items": deeper, "previous_item_type": kind,
                              "previous_rating_key": rating_key})
+
        # we don't know exactly where we are here, only add ignore option to series
-        if should_display_ignore(items, previous=previous_item_type):
-            add_ignore_options(oc, "series", title=item_title, rating_key=rating_key, callback_menu=IgnoreMenu)
+        if current_kind in ("series", "season"):
+            item = get_item(rating_key)
+            sub_title = get_item_title(item)
+            add_ignore_options(oc, current_kind, title=sub_title, rating_key=rating_key, callback_menu=IgnoreMenu)
+
+        # mass-extract embedded
+        if current_kind == "season" and config.plex_transcoder:
+            for lang in config.lang_list:
+                oc.add(DirectoryObject(
+                    key=Callback(SeasonExtractEmbedded, rating_key=rating_key, language=lang,
+                                 base_title=show.section.title, display_items=display_items, item_title=item_title,
+                                 title=title,
+                                 previous_item_type=previous_item_type, with_mods=True,
+                                 previous_rating_key=previous_rating_key, randomize=timestamp()),
+                    title=u"Extract missing %s embedded subtitles with default mods" % display_language(lang),
+                    summary="Extracts the not yet extracted embedded subtitles of all episodes for the current season "
+                            "with all configured default modifications"
+                ))
+                oc.add(DirectoryObject(
+                    key=Callback(SeasonExtractEmbedded, rating_key=rating_key, language=lang,
+                                 base_title=show.section.title, display_items=display_items, item_title=item_title,
+                                 title=title,
+                                 previous_item_type=previous_item_type, with_mods=False,
+                                 previous_rating_key=previous_rating_key, randomize=timestamp()),
+                    title=u"Extract missing %s embedded subtitles" % display_language(lang),
+                    summary="Extracts the not yet extracted embedded subtitles of all episodes for the current season"
+                ))

        # add refresh
        oc.add(DirectoryObject(
@@ -115,6 +151,48 @@ def MetadataMenu(rating_key, title=None, base_title=None, display_items=False, p
    return oc


+@route(PREFIX + '/season/extract_embedded/{rating_key}/{language}')
+def SeasonExtractEmbedded(**kwargs):
+    """
+    starts season_extract_embedded in a thread for the given season and language, then re-displays the season
+    via MetadataMenu with a header announcing the extraction
+    :param kwargs: needs rating_key, language, with_mods, item_title, title and randomize; the rest is passed to
+                   MetadataMenu
+    :return: the result of MetadataMenu
+    """
+    rating_key = kwargs.get("rating_key")
+    requested_language = kwargs.pop("language")
+    # NOTE(review): the route declares no type for with_mods, so it presumably arrives as a string when called
+    # through the URL - confirm that "False" isn't treated as truthy downstream
+    with_mods = kwargs.pop("with_mods")
+    item_title = kwargs.pop("item_title")
+    title = kwargs.pop("title")
+
+    Thread.Create(season_extract_embedded, **{"rating_key": rating_key, "requested_language": requested_language,
+                                              "with_mods": with_mods})
+
+    # MetadataMenu has no message parameter and only hands header to its container, so the notification text
+    # has to travel in header; passing message would either be rejected or never be shown
+    kwargs.pop("message", None)
+    kwargs["header"] = u"Extracting of embedded subtitles for %s triggered" % title
+
+    kwargs.pop("randomize")
+    return MetadataMenu(randomize=timestamp(), title=item_title, **kwargs)
+
+
+def season_extract_embedded(rating_key, requested_language, with_mods=False):
+    """
+    walks the children of the given rating key and, for every part that has no stored "embedded" subtitle in the
+    requested language, extracts the first matching embedded stream
+    :param rating_key: rating key whose children are processed
+    :param requested_language: language to look up and extract
+    :param with_mods: passed to extract_embedded_sub
+    """
+    subtitle_storage = get_subtitle_storage()
+
+    try:
+        for child in get_all_items(key="children", value=rating_key, base="library/metadata"):
+            item = get_item(child[MI_KEY])
+            if not item:
+                continue
+
+            stored_subs = subtitle_storage.load_or_new(item)
+            for part in get_all_parts(item):
+                # parts that already have an embedded subtitle stored for this language are left alone
+                if stored_subs.get_by_provider(part.id, requested_language, "embedded"):
+                    continue
+
+                streams = get_embedded_subtitle_streams(part, requested_language=requested_language,
+                                                        get_forced=config.forced_only)
+                if not streams:
+                    continue
+
+                first_stream = streams[0]["stream"]
+                extract_embedded_sub(rating_key=item.rating_key, part_id=part.id,
+                                     stream_index=str(first_stream.index),
+                                     language=requested_language, with_mods=with_mods)
+    finally:
+        subtitle_storage.destroy()
+
+
@route(PREFIX + '/ignore_list')
 def IgnoreListMenu():
    oc = SubFolderObjectContainer(title2="Ignore list", replace_parent=True)
@@ -132,15 +210,20 @@ def HistoryMenu():
    history = get_history()
    oc = SubFolderObjectContainer(title2="History", replace_parent=True)

-    for item in history.history_items:
+    for item in history.items:
+        possible_language = item.language
+        language_display = item.lang_name if not possible_language else display_language(possible_language)
+
        oc.add(DirectoryObject(
            key=Callback(ItemDetailsMenu, title=item.title, item_title=item.item_title,
                         rating_key=item.rating_key),
            title=u"%s (%s)" % (item.item_title, item.mode_verbose),
-            summary=u"%s in %s (%s, score: %s), %s" % (item.lang_name, item.section_title,
+            summary=u"%s in %s (%s, score: %s), %s" % (language_display, item.section_title,
                                                       item.provider_name, item.score, df(item.time))
        ))

+    history.destroy()
+
    return oc


@@ -152,6 +235,15 @@ def RefreshMissing(randomize=None):
    return fatality(header=header, replace_parent=True)


+def replace_item(obj, key, replace_value):
+    """
+    sets every occurrence of key to replace_value in obj and in all dicts nested in it (in place)
+    :param obj: dict to process; only values that are dicts themselves are descended into
+    :param key: key to look for on every level
+    :param replace_value: value stored under key wherever it is found
+    :return: obj itself
+    """
+    nested = [(name, child) for name, child in obj.items() if isinstance(child, dict)]
+    for name, child in nested:
+        obj[name] = replace_item(child, key, replace_value)
+
+    if key in obj:
+        obj[key] = replace_value
+
+    return obj
+
+
@route(PREFIX + '/ValidatePrefs', enforce_route=True)
 def ValidatePrefs():
    Core.log.setLevel(logging.DEBUG)
@@ -196,8 +288,8 @@ def ValidatePrefs():
        DispatchRestart()
        return

-    scheduler.stop()
    scheduler.setup_tasks()
+    scheduler.clear_task_data("MissingSubtitles")
    set_refresh_menu_state(None)

    Log.Debug("Validate Prefs called.")
@@ -205,21 +297,31 @@ def ValidatePrefs():
    # SZ config debug
    Log.Debug("--- SZ Config-Debug ---")
    for attr in [
-            "app_support_path", "data_path", "data_items_path", "enable_agent",
+            "version", "app_support_path", "data_path", "data_items_path", "enable_agent",
            "enable_channel", "permissions_ok", "missing_permissions", "fs_encoding",
-            "subtitle_destination_folder", "dbm_supported", "lang_list", "providers"]:
-        Log.Debug("config.%s: %s", attr, getattr(config, attr))
+            "subtitle_destination_folder", "new_style_cache", "dbm_supported", "lang_list", "providers",
+            "plex_transcoder", "refiner_settings"]:
+
+        value = getattr(config, attr)
+        if isinstance(value, dict):
+            d = replace_item(copy.deepcopy(value), "api_key", "xxxxxxxxxxxxxxxxxxxxxxxxx")
+            Log.Debug("config.%s: %s", attr, d)
+            continue
+
+        Log.Debug("config.%s: %s", attr, value)

    for attr in ["plugin_log_path", "server_log_path"]:
        value = getattr(config, attr)
-        access = os.access(value, os.R_OK)
-        if Core.runtime.os == "Windows":
-            try:
-                f = open(value, "r")
-                f.read(1)
-                f.close()
-            except:
-                access = False
+
+        if value:
+            access = os.access(value, os.R_OK)
+            if Core.runtime.os == "Windows":
+                try:
+                    f = open(value, "r")
+                    f.read(1)
+                    f.close()
+                except:
+                    access = False

        Log.Debug("config.%s: %s (accessible: %s)", attr, value, access)

@@ -227,10 +329,33 @@ def ValidatePrefs():
            "subtitles.save.filesystem", ]:
        Log.Debug("Pref.%s: %s", attr, Prefs[attr])

+    # debug drone
+    if "sonarr" in config.refiner_settings or "radarr" in config.refiner_settings:
+        Log.Debug("----- Connections -----")
+        from subliminal_patch.refiners.drone import SonarrClient, RadarrClient
+        for key, cls in [("sonarr", SonarrClient), ("radarr", RadarrClient)]:
+            if key in config.refiner_settings:
+                cname = key.capitalize()
+                try:
+                    status = cls(**config.refiner_settings[key]).status()
+                except HTTPError, e:
+                    if e.response.status_code == 401:
+                        Log.Debug("%s: NOT WORKING - BAD API KEY", cname)
+                    else:
+                        Log.Debug("%s: NOT WORKING - %s", cname, traceback.format_exc())
+                except:
+                    Log.Debug("%s: NOT WORKING - %s", cname, traceback.format_exc())
+                else:
+                    if status["version"]:
+                        Log.Debug("%s: OK - %s", cname, status["version"])
+                    else:
+                        Log.Debug("%s: NOT WORKING - %s", cname)
+
    # fixme: check existance of and os access of logs
+    Log.Debug("----- Environment -----")
    Log.Debug("Platform: %s", Core.runtime.platform)
    Log.Debug("OS: %s", Core.runtime.os)
-    Log.Debug("----- Environment -----")
+    Log.Debug("Python: %s", platform.python_version())
    for key, value in os.environ.iteritems():
        if key.startswith("PLEX") or key.startswith("SZ_"):
            if "TOKEN" in key:
@@ -1,15 +1,24 @@
 # coding=utf-8
+import traceback
 import types
 import datetime
+import subprocess
+import os

 from func import enable_channel_wrapper
-from support.items import get_kind, get_item_thumb
-from support.helpers import get_video_display_title
+from subzero.language import Language
+from support.items import get_kind, get_item_thumb, get_item, get_item_kind_from_item, refresh_item
+from support.helpers import get_video_display_title, pad_title, display_language, quote_args
 from support.ignore import ignore_list
 from support.lib import get_intent
 from support.config import config
 from subzero.constants import ICON_SUB, ICON
+from support.plex_media import get_part, get_plex_metadata
 from support.scheduler import scheduler
+from support.scanning import scan_videos
+from support.storage import save_subtitles
+
+from subliminal_patch.subtitle import ModifiedSubtitle

 default_thumb = R(ICON_SUB)
 main_icon = ICON if not config.is_development else "icon-dev.jpg"
@@ -20,14 +29,6 @@ route = enable_channel_wrapper(route)
 handler = enable_channel_wrapper(handler)


-def should_display_ignore(items, previous=None):
-    kind = get_kind(items)
-    return items and (
-        (kind in ("show", "season")) or
-        (kind == "episode" and previous != "season")
-    )
-
-
 def add_ignore_options(oc, kind, callback_menu=None, title=None, rating_key=None, add_kind=True):
    """

@@ -72,7 +73,7 @@ def dig_tree(oc, items, menu_callback, menu_determination_callback=None, force_r
        oc.add(DirectoryObject(
            key=Callback(menu_callback or menu_determination_callback(kind, item, pass_kwargs=pass_kwargs), title=title,
                         rating_key=force_rating_key or key, **add_kwargs),
-            title=title, thumb=thumb, summary=summary
+            title=pad_title(title) if kind in ("show", "season") else title, thumb=thumb, summary=summary
        ))
    return oc

@@ -150,6 +151,57 @@ def debounce(func):
    return wrap


+def extract_embedded_sub(**kwargs):
+    """
+    Extract an embedded subtitle stream of a media part as SRT using the Plex Transcoder and store the result.
+
+    :param kwargs: rating_key, part_id, stream_index and language (IETF code) are required;
+                   with_mods (default False), refresh (default True) and set_current (default True) are optional
+    :return: None
+    """
+    rating_key = kwargs["rating_key"]
+    part_id = kwargs.pop("part_id")
+    stream_index = kwargs.pop("stream_index")
+    with_mods = kwargs.pop("with_mods", False)
+    language = Language.fromietf(kwargs.pop("language"))
+    refresh = kwargs.pop("refresh", True)
+    set_current = kwargs.pop("set_current", True)
+
+    plex_item = get_item(rating_key)
+    item_type = get_item_kind_from_item(plex_item)
+    part = get_part(plex_item, part_id)
+
+    if not part:
+        return
+
+    metadata = get_plex_metadata(rating_key, part_id, item_type, plex_item=plex_item)
+    scanned_parts = scan_videos([metadata], ignore_all=True, skip_hashing=True)
+    for stream in part.streams:
+        # only handle the requested subtitle stream
+        if str(stream.index) != stream_index:
+            continue
+
+        bn = os.path.basename(part.file)
+
+        set_refresh_menu_state(u"Extracting subtitle %s of %s" % (stream_index, bn))
+        Log.Info(u"Extracting stream %s (%s) of %s", stream_index, display_language(language), bn)
+
+        args = [
+            config.plex_transcoder, "-i", part.file, "-map", "0:%s" % stream_index, "-f", "srt", "-"
+        ]
+        save_successful = False
+        try:
+            output = None
+            try:
+                # NOTE(review): the command is run through a shell; this relies on quote_args escaping part.file
+                output = subprocess.check_output(quote_args(args), stderr=subprocess.PIPE, shell=True)
+            except Exception:
+                Log.Error("Extraction failed: %s", traceback.format_exc())
+
+            if output:
+                subtitle = ModifiedSubtitle(language, mods=config.default_mods if with_mods else None)
+                subtitle.content = output
+                subtitle.provider_name = "embedded"
+                subtitle.id = "stream_%s" % stream_index
+                subtitle.score = 0
+                subtitle.set_encoding("utf-8")
+
+                # fixme: speedup video; only video.name is needed
+                save_successful = save_subtitles(scanned_parts, {scanned_parts.keys()[0]: [subtitle]}, mode="m",
+                                                 set_current=set_current)
+        finally:
+            # always clear the menu state, also when nothing was extracted or saving raised
+            set_refresh_menu_state(None)
+
+        if save_successful and refresh:
+            refresh_item(rating_key)
+
+
 class SZObjectContainer(ObjectContainer):
    def __init__(self, *args, **kwargs):
        skip_pin_lock = kwargs.pop("skip_pin_lock", False)
@@ -6,7 +6,8 @@ from support.items import refresh_item
 from support.helpers import timestamp


-@route(PREFIX + '/item/{rating_key}')
+@route(PREFIX + '/item/refresh/{rating_key}/force', force=True)
+@route(PREFIX + '/item/refresh/{rating_key}')
@debounce
 def RefreshItem(rating_key=None, came_from="/recent", item_title=None, force=False, refresh_kind=None,
                previous_rating_key=None, timeout=8000, randomize=None, trigger=True):
@@ -3,12 +3,13 @@
 import traceback
 import types

-from babelfish import Language
+from subzero.language import Language

 from menu_helpers import debounce, SubFolderObjectContainer, default_thumb, route
 from subzero.modification import registry as mod_registry, SubtitleModifications
 from subzero.constants import PREFIX
-from support.plex_media import get_plex_metadata, scan_videos
+from support.plex_media import get_plex_metadata
+from support.scanning import scan_videos
 from support.helpers import timestamp, pad_title
 from support.items import get_current_sub, set_mods_for_part

@@ -75,6 +76,11 @@ def SubtitleModificationsMenu(**kwargs):
            title=pad_title("Manage applied mods"),
            summary=u"Currently applied mods: %s" % (", ".join(current_mods))
        ))
+        oc.add(DirectoryObject(
+            key=Callback(SubtitleReapplyMods, randomize=timestamp(), **kwargs),
+            title=pad_title("Reapply applied mods"),
+            summary=u"Currently applied mods: %s" % (", ".join(current_mods) if current_mods else "none")
+        ))

    oc.add(DirectoryObject(
        key=Callback(SubtitleSetMods, mods=None, mode="clear", randomize=timestamp(), **kwargs),
@@ -103,12 +109,12 @@ def SubtitleFPSModMenu(**kwargs):
    ))

    metadata = get_plex_metadata(rating_key, part_id, item_type)
-    scanned_parts = scan_videos([metadata], kind="series" if item_type == "episode" else "movie", ignore_all=True)
+    scanned_parts = scan_videos([metadata], ignore_all=True, skip_hashing=True)
    video, plex_part = scanned_parts.items()[0]

    target_fps = plex_part.fps

-    for fps in ["23.976", "24.000", "25.000", "29.970", "30.000", "50.000", "59.940", "60.000"]:
+    for fps in ["23.980", "23.976", "24.000", "25.000", "29.970", "30.000", "50.000", "59.940", "60.000"]:
        if float(fps) == float(target_fps):
            continue

@@ -227,6 +233,22 @@ def SubtitleSetMods(mods=None, mode=None, **kwargs):
    return SubtitleModificationsMenu(randomize=timestamp(), **kwargs)


+@route(PREFIX + '/item/sub_reapply_mods/{rating_key}/{part_id}', force=bool)
+@debounce
+def SubtitleReapplyMods(**kwargs):
+    """
+    Call set_mods_for_part with an empty mod list in "add" mode for the given subtitle, then return the
+    modifications menu (presumably this re-applies the mods already set; verify against set_mods_for_part).
+
+    :param kwargs: needs rating_key, part_id, language (IETF code), item_type and randomize
+    :return: result of SubtitleModificationsMenu
+    """
+    key, part, lang_code, kind = kwargs["rating_key"], kwargs["part_id"], kwargs["language"], kwargs["item_type"]
+
+    set_mods_for_part(key, part, Language.fromietf(lang_code), kind, [], mode="add")
+
+    # swap the incoming randomize value for a fresh timestamp
+    del kwargs["randomize"]
+    return SubtitleModificationsMenu(randomize=timestamp(), **kwargs)
+
+
@route(PREFIX + '/item/sub_list_mods/{rating_key}/{part_id}', force=bool)
@debounce
 def SubtitleListMods(**kwargs):
@@ -28,22 +28,25 @@ import items

 sys.modules["support.items"] = items

-import missing_subtitles
-
-sys.modules["support.missing_subtitles"] = missing_subtitles
-
 import scheduler

 sys.modules["support.scheduler"] = scheduler

-import tasks
-
-sys.modules["support.tasks"] = tasks
-
 import storage

 sys.modules["support.storage"] = storage

+import scanning
+sys.modules["support.scanning"] = scanning
+
+import missing_subtitles
+
+sys.modules["support.missing_subtitles"] = missing_subtitles
+
+import tasks
+
+sys.modules["support.tasks"] = tasks
+
 import ignore

 sys.modules["support.ignore"] = ignore
@@ -60,4 +63,4 @@ import activities
 sys.modules["support.activities"] = activities

 import download
-sys.modules["support.download"] = download
+sys.modules["support.download"] = download
@@ -3,14 +3,20 @@ from wraptor.decorators import throttle
 from config import config
 from items import get_item, get_item_kind_from_item, refresh_item

-from plex_activity import Activity
-from plex_activity.sources.s_logging.main import Logging as Activity_Logging
+Activity = None
+try:
+    from plex_activity import Activity
+except ImportError:
+    pass


 class PlexActivityManager(object):
    def start(self):
        activity_sources_enabled = None

+        if not Activity:
+            return
+
        if config.plex_token:
            from plex import Plex
            Plex.configuration.defaults.authentication(config.plex_token)
@@ -1,27 +1,41 @@
 # coding=utf-8
-
+import copy
 import os
 import re
 import inspect
 import sys
 import rarfile
-
+import jstyleson
 import datetime

 import subliminal
 import subliminal_patch
+import subzero.constants
+import lib
+from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded

+from subliminal_patch.core import is_windows_special_path
 from whichdb import whichdb
-from babelfish import Language
+
+from subliminal_patch.exceptions import TooManyRequests
+from subzero.language import Language
 from subliminal.cli import MutexLock
 from subzero.lib.io import FileIO, get_viable_encoding
+from subzero.lib.dict import Dicked
 from subzero.util import get_root_path
 from subzero.constants import PLUGIN_NAME, PLUGIN_IDENTIFIER, MOVIE, SHOW, MEDIA_TYPE_TO_STRING
+from dogpile.cache.region import register_backend as register_cache_backend
 from lib import Plex
-from helpers import check_write_permissions, cast_bool
+from helpers import check_write_permissions, cast_bool, cast_int, mswindows

-SUBTITLE_EXTS = ['utf', 'utf8', 'utf-8', 'srt', 'smi', 'rt', 'ssa', 'aqt', 'jss', 'ass', 'idx', 'sub', 'txt', 'psb',
-                 'vtt']
+# make the SZFileBackend class of subzero.cache_backends.file known to dogpile.cache as "subzero.cache.file"
+register_cache_backend(
+    "subzero.cache.file", "subzero.cache_backends.file", "SZFileBackend")
+
+# subtitle file extensions without "txt"
+SUBTITLE_EXTS_BASE = ['utf', 'utf8', 'utf-8', 'srt', 'smi', 'rt', 'ssa', 'aqt', 'jss', 'ass', 'idx', 'sub', 'psb',
+                      'vtt']
+# the base extensions plus "txt"
+SUBTITLE_EXTS = SUBTITLE_EXTS_BASE + ["txt"]
+
+# NOTE(review): presumably the text based subtitle formats; confirm against the users of this constant
+TEXT_SUBTITLE_EXTS = ("srt", "ass", "ssa", "vtt")
 VIDEO_EXTS = ['3g2', '3gp', 'asf', 'asx', 'avc', 'avi', 'avs', 'bivx', 'bup', 'divx', 'dv', 'dvr-ms', 'evo', 'fli',
              'flv',
              'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'nsv', 'nuv', 'ogm', 'ogv', 'tp',
@@ -42,6 +56,24 @@ def int_or_default(s, default):
        return default


+# exception types for which a throttle period may be defined
+VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable)
+
+# provider name -> {exception type: (throttle duration, human readable description of that duration)};
+# the "default" entry is used for providers without an own entry and for exception types a provider's own
+# entry doesn't list
+PROVIDER_THROTTLE_MAP = {
+    "default": {
+        TooManyRequests: (datetime.timedelta(hours=1), "1 hour"),
+        DownloadLimitExceeded: (datetime.timedelta(hours=3), "3 hours"),
+        ServiceUnavailable: (datetime.timedelta(minutes=20), "20 minutes"),
+    },
+    "opensubtitles": {
+        TooManyRequests: (datetime.timedelta(hours=3), "3 hours"),
+        DownloadLimitExceeded: (datetime.timedelta(hours=6), "6 hours"),
+    },
+    "addic7ed": {
+        DownloadLimitExceeded: (datetime.timedelta(hours=24), "24 hours"),
+    }
+}
+
+
 class Config(object):
    libraries_root = None
    plugin_info = ""
@@ -56,6 +88,11 @@ class Config(object):
    plex_token = None
    is_development = False
    dbm_supported = False
+    pms_request_timeout = 15
+    low_impact_mode = False
+    new_style_cache = False
+    pack_cache_dir = None
+    advanced = None

    enable_channel = True
    enable_agent = True
@@ -64,11 +101,8 @@ class Config(object):
    lock_advanced_menu = False
    locked = False
    pin_valid_minutes = 10
-    lang_list = None
    subtitle_destination_folder = None
    subtitle_formats = None
-    providers = None
-    provider_settings = None
    max_recent_items_per_library = 200
    permissions_ok = False
    missing_permissions = None
@@ -79,6 +113,7 @@ class Config(object):
    sections = None
    enabled_sections = None
    remove_hi = False
+    remove_tags = False
    fix_ocr = False
    fix_common = False
    colors = ""
@@ -86,15 +121,21 @@ class Config(object):
    forced_only = False
    exotic_ext = False
    treat_und_as_first = False
+    subtitle_sub_dir = None, None
    ext_match_strictness = False
    default_mods = None
    debug_mods = False
    react_to_activities = False
    activity_mode = None
-    subtitles_save_to = None
    no_refresh = False
+    plex_transcoder = None
+    refiner_settings = None
+    exact_filenames = False
+    only_one = False
+    embedded_auto_extract = False
+    ietf_as_alpha3 = False

-    store_recently_played_amount = 20
+    store_recently_played_amount = 40

    initialized = False

@@ -102,6 +143,10 @@ class Config(object):
        self.libraries_root = os.path.abspath(os.path.join(get_root_path(), ".."))
        self.init_libraries()

+        if is_windows_special_path:
+            Log.Warn("The Plex metadata folder is residing inside a folder with special characters. "
+                     "Multithreading and playback activities will be disabled.")
+
        self.fs_encoding = get_viable_encoding()
        self.plugin_info = self.get_plugin_info()
        self.is_development = self.get_dev_mode()
@@ -113,20 +158,24 @@ class Config(object):
        self.data_items_path = os.path.join(self.data_path, "DataItems")
        self.universal_plex_token = self.get_universal_plex_token()
        self.plex_token = os.environ.get("PLEXTOKEN", self.universal_plex_token)
+        subzero.constants.DEFAULT_TIMEOUT = lib.DEFAULT_TIMEOUT = self.pms_request_timeout = \
+            min(cast_int(Prefs['pms_request_timeout'], 15), 45)
+        self.low_impact_mode = cast_bool(Prefs['low_impact_mode'])
+        self.new_style_cache = cast_bool(Prefs['new_style_cache'])
+        self.pack_cache_dir = self.get_pack_cache_dir()
+        self.advanced = self.get_advanced_config()

        os.environ["SZ_USER_AGENT"] = self.get_user_agent()

-        self.providers = self.get_providers()
-
+        self.setup_proxies()
        self.set_plugin_mode()
        self.set_plugin_lock()
        self.set_activity_modes()
+        self.parse_rename_mode()

-        self.lang_list = self.get_lang_list()
        self.subtitle_destination_folder = self.get_subtitle_destination_folder()
        self.subtitle_formats = self.get_subtitle_formats()
        self.forced_only = cast_bool(Prefs["subtitles.only_foreign"])
-        self.provider_settings = self.get_provider_settings()
        self.max_recent_items_per_library = int_or_default(Prefs["scheduler.max_recent_items_per_library"], 2000)
        self.sections = list(Plex["library"].sections())
        self.missing_permissions = []
@@ -136,17 +185,22 @@ class Config(object):
        self.permissions_ok = self.check_permissions()
        self.notify_executable = self.check_notify_executable()
        self.remove_hi = cast_bool(Prefs['subtitles.remove_hi'])
+        self.remove_tags = cast_bool(Prefs['subtitles.remove_tags'])
        self.fix_ocr = cast_bool(Prefs['subtitles.fix_ocr'])
        self.fix_common = cast_bool(Prefs['subtitles.fix_common'])
        self.colors = Prefs['subtitles.colors'] if Prefs['subtitles.colors'] != "don't change" else None
        self.chmod = self.check_chmod()
        self.exotic_ext = cast_bool(Prefs["subtitles.scan.exotic_ext"])
        self.treat_und_as_first = cast_bool(Prefs["subtitles.language.treat_und_as_first"])
+        self.subtitle_sub_dir = self.get_subtitle_sub_dir()
        self.ext_match_strictness = self.determine_ext_sub_strictness()
        self.default_mods = self.get_default_mods()
        self.debug_mods = cast_bool(Prefs['log_debug_mods'])
-        self.subtitles_save_to = Prefs['subtitles.save.filesystem']
        self.no_refresh = os.environ.get("SZ_NO_REFRESH", False)
+        self.plex_transcoder = self.get_plex_transcoder()
+        self.only_one = cast_bool(Prefs['subtitles.only_one'])
+        self.embedded_auto_extract = cast_bool(Prefs["subtitles.embedded.autoextract"])
+        self.ietf_as_alpha3 = cast_bool(Prefs["subtitles.language.ietf_normalize"])
        self.initialized = True

    def init_libraries(self):
@@ -162,6 +216,13 @@ class Config(object):
            Log.Info("Using UnRAR from: %s", custom_unrar)

    def init_cache(self):
+        if self.new_style_cache:
+            subliminal.region.configure('subzero.cache.file', expiration_time=datetime.timedelta(days=30),
+                                        arguments={'appname': "sz_cache",
+                                                   'app_cache_dir': self.data_path})
+            Log.Info("Using new style file based cache!")
+            return
+
        names = ['dbhash', 'gdbm', 'dbm']
        dbfn = None
        self.dbm_supported = False
@@ -207,12 +268,37 @@ class Config(object):
        Log.Warn("Not using file based cache!")
        subliminal.region.configure('dogpile.cache.memory')

+    def sync_cache(self):
+        """
+        Call sync() on the backend of the subliminal cache region; does nothing unless new_style_cache is set.
+        """
+        if self.new_style_cache:
+            Log.Debug("Syncing cache")
+            subliminal.region.backend.sync()
+
+    def get_pack_cache_dir(self):
+        """
+        Return the path of the "pack_cache" folder inside the data path, creating the folder if it is missing.
+
+        :return: path of the pack cache directory
+        """
+        # use this instance's data path instead of the global config name
+        pack_cache_dir = os.path.join(self.data_path, "pack_cache")
+        if not os.path.isdir(pack_cache_dir):
+            os.makedirs(pack_cache_dir)
+
+        return pack_cache_dir
+
+    def get_advanced_config(self):
+        """
+        Load "advanced_settings.json" from the data path, if that file exists.
+
+        :return: Dicked built from the parsed file content, or an empty Dicked if there is no such file
+        """
+        # use this instance's data path instead of the global config name
+        path = os.path.join(self.data_path, "advanced_settings.json")
+        if os.path.isfile(path):
+            data = FileIO.read(path, "r")
+
+            return Dicked(**jstyleson.loads(data))
+
+        return Dicked()
+
    def set_log_paths(self):
        # find log handler
        for handler in Core.log.handlers:
-            if getattr(getattr(handler, "__class__"), "__name__") in (
-                    'FileHandler', 'RotatingFileHandler', 'TimedRotatingFileHandler'):
+            cls_name = getattr(getattr(handler, "__class__"), "__name__")
+            if cls_name in ('FileHandler', 'RotatingFileHandler', 'TimedRotatingFileHandler'):
                plugin_log_file = handler.baseFilename
+                if cls_name in ("RotatingFileHandler", "TimedRotatingFileHandler"):
+                    handler.backupCount = int_or_default(Prefs['log_rotate_keep'], 5)
+
                if os.path.isfile(os.path.realpath(plugin_log_file)):
                    self.plugin_log_path = plugin_log_file

@@ -284,7 +370,7 @@ class Config(object):
        self.permissions_ok = self.check_permissions()

    def check_permissions(self):
-        if not Prefs["subtitles.save.filesystem"] or not Prefs["check_permissions"]:
+        if not cast_bool(Prefs["subtitles.save.filesystem"]) or not cast_bool(Prefs["check_permissions"]):
            return True

        self.missing_permissions = []
@@ -300,6 +386,9 @@ class Config(object):
                if isinstance(path_str, unicode):
                    path_str = path_str.encode(self.fs_encoding)

+                if not os.path.exists(path_str):
+                    continue
+
                if use_ignore_fs:
                    # check whether we've got an ignore file inside the section path
                    if self.is_physically_ignored(path_str):
@@ -405,18 +494,44 @@ class Config(object):
        return enabled_sections

    # Prepare a list of languages we want subs for
-    def get_lang_list(self):
-        l = {Language.fromietf(Prefs["langPref1"])}
+    def get_lang_list(self, provider=None):
+        # advanced settings
+        if provider and self.advanced.providers and provider in self.advanced.providers:
+            adv_languages = self.advanced.providers[provider].get("languages", None)
+            if adv_languages:
+                adv_out = set()
+                for adv_lang in adv_languages:
+                    adv_lang = adv_lang.strip()
+                    try:
+                        real_lang = Language.fromietf(adv_lang)
+                    except:
+                        try:
+                            real_lang = Language.fromname(adv_lang)
+                        except:
+                            continue
+                    adv_out.update({real_lang})
+
+                # fallback to default languages if no valid language was found in advanced settings
+                if adv_out:
+                    return adv_out
+
+        l = {Language.fromietf(Prefs["langPref1a"])}
        lang_custom = Prefs["langPrefCustom"].strip()

        if Prefs['subtitles.only_one']:
            return l

-        if Prefs["langPref2"] != "None":
-            l.update({Language.fromietf(Prefs["langPref2"])})
+        if Prefs["langPref2a"] != "None":
+            try:
+                l.update({Language.fromietf(Prefs["langPref2a"])})
+            except:
+                pass

-        if Prefs["langPref3"] != "None":
-            l.update({Language.fromietf(Prefs["langPref3"])})
+        if Prefs["langPref3a"] != "None":
+            try:
+                l.update({Language.fromietf(Prefs["langPref3a"])})
+            except:
+                pass

        if len(lang_custom) and lang_custom != "None":
            for lang in lang_custom.split(u","):
@@ -432,6 +547,8 @@ class Config(object):

        return l

+    lang_list = property(get_lang_list)
+
    def get_subtitle_destination_folder(self):
        if not Prefs["subtitles.save.filesystem"]:
            return
@@ -450,18 +567,26 @@ class Config(object):
            out.append("vtt")
        return out

-    def get_providers(self):
+    def get_providers(self, media_type="series"):
        providers = {'opensubtitles': cast_bool(Prefs['provider.opensubtitles.enabled']),
                     # 'thesubdb': Prefs['provider.thesubdb.enabled'],
                     'podnapisi': cast_bool(Prefs['provider.podnapisi.enabled']),
+                     'titlovi': cast_bool(Prefs['provider.titlovi.enabled']),
                     'addic7ed': cast_bool(Prefs['provider.addic7ed.enabled']),
                     'tvsubtitles': cast_bool(Prefs['provider.tvsubtitles.enabled']),
                     'legendastv': cast_bool(Prefs['provider.legendastv.enabled']),
                     'napiprojekt': cast_bool(Prefs['provider.napiprojekt.enabled']),
-                     'shooter': cast_bool(Prefs['provider.shooter.enabled']),
-                     'subscenter': cast_bool(Prefs['provider.subscenter.enabled']),
+                     'shooter': False,
+                     'subscene': cast_bool(Prefs['provider.subscene.enabled']),
+                     'subscenter': False,
                     }

+        providers_by_prefs = copy.deepcopy(providers)
+
+        # disable subscene for movies by default
+        if media_type == "movies":
+            providers["subscene"] = False
+
        # ditch non-forced-subtitles-reporting providers
        if self.forced_only:
            providers["addic7ed"] = False
@@ -469,10 +594,41 @@ class Config(object):
            providers["legendastv"] = False
            providers["napiprojekt"] = False
            providers["shooter"] = False
-            providers["subscenter"] = False
+            providers["titlovi"] = False
+
+        # advanced settings
+        if media_type and self.advanced.providers:
+            for provider, data in self.advanced.providers.iteritems():
+                if provider not in providers or not providers_by_prefs[provider]:
+                    continue
+
+                if data["enabled_for"] is not None:
+                    providers[provider] = media_type in data["enabled_for"]
+
+        if "provider_throttle" not in Dict:
+            Dict["provider_throttle"] = {}
+
+        changed = False
+        for provider, enabled in dict(providers).iteritems():
+            reason, until, throttle_desc = Dict["provider_throttle"].get(provider, (None, None, None))
+            if reason:
+                now = datetime.datetime.now()
+                if now < until:
+                    Log.Info("Not using %s until %s, because of: %s", provider,
+                             until.strftime("%y/%m/%d %H:%M"), reason)
+                    providers[provider] = False
+                else:
+                    Log.Info("Using %s again after %s, (disabled because: %s)", provider, throttle_desc, reason)
+                    del Dict["provider_throttle"][provider]
+                    changed = True
+
+        if changed:
+            Dict.Save()

        return filter(lambda prov: providers[prov], providers)

+    providers = property(get_providers)
+
    def get_provider_settings(self):
        provider_settings = {'addic7ed': {'username': Prefs['provider.addic7ed.username'],
                                          'password': Prefs['provider.addic7ed.password'],
@@ -480,22 +636,54 @@ class Config(object):
                                          },
                             'opensubtitles': {'username': Prefs['provider.opensubtitles.username'],
                                               'password': Prefs['provider.opensubtitles.password'],
-                                               'use_tag_search': cast_bool(Prefs['provider.opensubtitles.use_tags']),
-                                               'only_foreign': cast_bool(Prefs['subtitles.only_foreign'])
+                                               'use_tag_search': self.exact_filenames,
+                                               'only_foreign': self.forced_only,
+                                               'is_vip': cast_bool(Prefs['provider.opensubtitles.is_vip'])
                                               },
                             'podnapisi': {
-                                 'only_foreign': cast_bool(Prefs['subtitles.only_foreign'])
+                                 'only_foreign': self.forced_only,
                             },
                             'legendastv': {'username': Prefs['provider.legendastv.username'],
                                            'password': Prefs['provider.legendastv.password'],
-                                            },
-                             'subscenter': {'username': Prefs['provider.subscenter.username'],
-                                            'password': Prefs['provider.subscenter.password'],
-                                            },
+                                            }
                             }

        return provider_settings

+    provider_settings = property(get_provider_settings)
+
+    def provider_throttle(self, name, exception):
+        """
+        Throttle the provider :name: for a period that depends on the type of :exception:.
+
+        The period is looked up in PROVIDER_THROTTLE_MAP (the provider's own entry first, then "default") and
+        the result is stored in Dict["provider_throttle"]. Nothing happens if no period is defined for the
+        exception type.
+        :param name: provider name
+        :param exception: exception instance raised while using the provider
+        :return: None
+        """
+        cls = getattr(exception, "__class__")
+        cls_name = getattr(cls, "__name__")
+        if cls not in VALID_THROTTLE_EXCEPTIONS:
+            # map subclasses of the known exceptions onto their base class; cls is a class, not an instance,
+            # so this has to be an issubclass() check
+            for valid_cls in VALID_THROTTLE_EXCEPTIONS:
+                if issubclass(cls, valid_cls):
+                    cls = valid_cls
+                    break
+
+        throttle_data = PROVIDER_THROTTLE_MAP.get(name, PROVIDER_THROTTLE_MAP["default"]).get(cls, None) or \
+            PROVIDER_THROTTLE_MAP["default"].get(cls, None)
+
+        if not throttle_data:
+            return
+
+        throttle_delta, throttle_description = throttle_data
+
+        if "provider_throttle" not in Dict:
+            Dict["provider_throttle"] = {}
+
+        throttle_until = datetime.datetime.now() + throttle_delta
+        # the name of the exception that was actually raised is stored, not the one of the mapped base class
+        Dict["provider_throttle"][name] = (cls_name, throttle_until, throttle_description)
+
+        Log.Info("Throttling %s for %s, until %s, because of: %s", name, throttle_description,
+                 throttle_until.strftime("%y/%m/%d %H:%M"), cls_name)
+        Dict.Save()
+
    @property
    def provider_pool(self):
        if cast_bool(Prefs['providers.multithreading']):
@@ -519,6 +707,22 @@ class Config(object):
        if wrong_chmod:
            Log.Warn("Chmod setting ignored, please use only 4-digit integers with leading 0 (e.g.: 775)")

+    def get_subtitle_sub_dir(self):
+        """
+        Read the subtitle sub folder from the preferences; a custom folder wins over the predefined choice.
+
+        :return: tuple of (folder, is_absolute); (None, None) if saving to the filesystem is disabled
+        """
+        if not cast_bool(Prefs['subtitles.save.filesystem']):
+            return None, None
+
+        custom_folder = Prefs["subtitles.save.subFolder.Custom"]
+        if custom_folder:
+            return custom_folder, os.path.isabs(custom_folder)
+
+        folder = Prefs["subtitles.save.subFolder"]
+        return (".", False) if folder == "current folder" else (folder, False)
+
    def determine_ext_sub_strictness(self):
        val = Prefs["subtitles.scan.filename_strictness"]
        if val == "any":
@@ -531,6 +735,8 @@ class Config(object):
        mods = []
        if self.remove_hi:
            mods.append("remove_HI")
+        if self.remove_tags:
+            mods.append("remove_tags")
        if self.fix_ocr:
            mods.append("OCR_fixes")
        if self.fix_common:
@@ -540,6 +746,12 @@ class Config(object):

        return mods

+    def setup_proxies(self):
+        """
+        Export the configured HTTP proxy via the SZ_HTTP_PROXY environment variable.
+
+        Does nothing if the "proxy" preference is empty or consists of whitespace only.
+        """
+        # strip before testing, so a whitespace-only value doesn't export an empty proxy
+        proxy = (Prefs["proxy"] or "").strip()
+        if not proxy:
+            return
+
+        os.environ["SZ_HTTP_PROXY"] = proxy
+        Log.Debug("Using HTTP Proxy: %s", proxy)
+
    def set_activity_modes(self):
        val = Prefs["activity.on_playback"]
        if val == "never":
@@ -556,6 +768,65 @@ class Config(object):
        else:
            self.activity_mode = "next_episode"

+    def get_plex_transcoder(self):
+        """
+        Locate the Plex Transcoder binary.
+
+        The base path is taken from the PLEX_MEDIA_SERVER_HOME environment variable; if that is unset, it is
+        derived from PLEXBUNDLEDPLUGINSPATH (two directories up).
+
+        :return: path to the transcoder binary, or None if no base path is known or the binary isn't a file
+        """
+        base_path = os.environ.get("PLEX_MEDIA_SERVER_HOME", None)
+        if not base_path:
+            # fall back to bundled plugins path
+            bundle_path = os.environ.get("PLEXBUNDLEDPLUGINSPATH", None)
+            if bundle_path:
+                base_path = os.path.normpath(os.path.join(bundle_path, "..", ".."))
+
+        if not base_path:
+            # neither environment variable is set; os.path.join would fail on None
+            return None
+
+        if sys.platform == "darwin":
+            fn = os.path.join(base_path, "MacOS", "Plex Transcoder")
+        elif mswindows:
+            fn = os.path.join(base_path, "plextranscoder.exe")
+        else:
+            fn = os.path.join(base_path, "Plex Transcoder")
+
+        if os.path.isfile(fn):
+            return fn
+
+        return None
+
+    def parse_rename_mode(self):
+        """
+        Translate the media rename preferences into refiner settings.
+
+        Resets self.refiner_settings and fills it according to the "media_rename1" and "use_file_info_file"
+        preferences; sets self.exact_filenames to True whenever a selected option calls for it.
+        """
+        # fixme: exact_filenames should be determined via callback combined with info about the current video
+        # (original_name)
+
+        rename_mode = str(Prefs["media_rename1"])
+        self.refiner_settings = {}
+
+        if cast_bool(Prefs['use_file_info_file']):
+            self.refiner_settings["file_info_file"] = True
+            self.exact_filenames = True
+
+        # modes which are fully handled without any of the external tools
+        if rename_mode == "none of the above":
+            return
+
+        if rename_mode == "Symlink to original file":
+            self.refiner_settings["symlinks"] = True
+            self.exact_filenames = True
+            return
+
+        if rename_mode == "I keep the original filenames":
+            self.exact_filenames = True
+            return
+
+        if rename_mode in ("Filebot", "Sonarr/Radarr/Filebot"):
+            self.refiner_settings["filebot"] = True
+
+        if rename_mode not in ("Sonarr/Radarr (fill api info below)", "Sonarr/Radarr/Filebot"):
+            return
+
+        # sonarr/radarr are only usable if both their URL and API key are configured
+        drone_prefs = (
+            ("sonarr", "drone_api.sonarr.url", "drone_api.sonarr.api_key"),
+            ("radarr", "drone_api.radarr.url", "drone_api.radarr.api_key"),
+        )
+        for drone, url_pref, api_key_pref in drone_prefs:
+            if Prefs[url_pref] and Prefs[api_key_pref]:
+                self.refiner_settings[drone] = {
+                    "base_url": Prefs[url_pref],
+                    "api_key": Prefs[api_key_pref]
+                }
+                self.exact_filenames = True
+
    def init_subliminal_patches(self):
        # configure custom subtitle destination folders for scanning pre-existing subs
        Log.Debug("Patching subliminal ...")
@@ -564,7 +835,7 @@ class Config(object):
        subliminal_patch.core.INCLUDE_EXOTIC_SUBS = self.exotic_ext

        subliminal_patch.core.DOWNLOAD_TRIES = int(Prefs['subtitles.try_downloads'])
-        subliminal.score.episode_scores["addic7ed_boost"] = int(Prefs['provider.addic7ed.boost_by1'])
+        subliminal.score.episode_scores["addic7ed_boost"] = int(Prefs['provider.addic7ed.boost_by2'])


 config = Config()
@@ -1,4 +1,5 @@
 # coding=utf-8
+import traceback


 def dispatch_migrate():
@@ -6,6 +7,8 @@ def dispatch_migrate():
        migrate()
    except:
        Log.Error("Migration failed: %s" % traceback.format_exc())
+        del Dict["subs"]
+        Dict.Save()


 def migrate():
@@ -25,6 +28,7 @@ def migrate():
                        time=item.time)

        del Dict["history"]
+        history.destroy()
        Dict.Save()

    # migrate subtitle storage from Dict to Data
@@ -1,46 +1,120 @@
 # coding=utf-8
+import os
+
+from subzero.language import Language

 import subliminal_patch as subliminal

 from support.config import config
+from support.helpers import cast_bool
 from subtitlehelpers import get_subtitles_from_metadata
 from subliminal_patch import compute_score
+from support.plex_media import get_blacklist_from_part_map
+from subzero.video import refine_video
+from support.storage import get_pack_data, store_pack_data


-def download_best_subtitles(video_part_map, min_score=0):
+def get_missing_languages(video, part):
+    """
+    Determine which of the configured languages have no subtitle yet for the given video.
+
+    When subtitles aren't saved to the filesystem, languages of existing metadata subtitles of the part are
+    added to video.subtitle_languages first (the video object is modified).
+
+    :param video: video object; its subtitle_languages are read and possibly extended
+    :param part: part handed to get_subtitles_from_metadata
+    :return: False if nothing is missing (or one existing subtitle is enough because of
+             Prefs['subtitles.only_one']), otherwise a set of the missing languages as strings
+    """
+    languages = set([Language.fromietf(str(l)) for l in config.lang_list])
+
+    # should we treat IETF as alpha3? (ditch the country part)
+    # alpha3_map remembers the removed country per alpha3 code, so it can be restored below
+    alpha3_map = {}
+    if config.ietf_as_alpha3:
+        for language in languages:
+            if language.country:
+                alpha3_map[language.alpha3] = language.country
+                language.country = None
+
+    if not Prefs['subtitles.save.filesystem']:
+        # scan for existing metadata subtitles
+        meta_subs = get_subtitles_from_metadata(part)
+        for language, subList in meta_subs.iteritems():
+            if subList:
+                video.subtitle_languages.add(language)
+                Log.Debug("Found metadata subtitle %s for %s", language, video)
+
+    # NOTE(review): copy() presumably is a shallow set copy, so resetting the country below would also change
+    # the language objects held by video.subtitle_languages - confirm this is intended
+    have_languages = video.subtitle_languages.copy()
+    if config.ietf_as_alpha3:
+        for language in have_languages:
+            if language.country:
+                alpha3_map[language.alpha3] = language.country
+                language.country = None
+
+    # compare by string representation
+    missing_languages = (set(str(l) for l in languages) - set(str(l) for l in have_languages))
+
+    # all languages are found if we either really have subs for all languages or we only want to have exactly one language
+    # and we've only found one (the case for a selected language, Prefs['subtitles.only_one'] (one found sub matches any language))
+    found_one_which_is_enough = len(video.subtitle_languages) >= 1 and Prefs['subtitles.only_one']
+    if not missing_languages or found_one_which_is_enough:
+        if found_one_which_is_enough:
+            Log.Debug('Only one language was requested, and we\'ve got a subtitle for %s', video)
+        else:
+            Log.Debug('All languages %r exist for %s', languages, video)
+        return False
+
+    # re-add country codes to the missing languages, in case we've removed them above
+    # NOTE(review): this restores the country on the local `languages` objects only; the returned
+    # missing_languages strings have already been computed above and are not affected
+    if config.ietf_as_alpha3:
+        for language in languages:
+            language.country = alpha3_map.get(language.alpha3, None)
+
+    return missing_languages
+
+
+def pre_download_hook(subtitle):
+    """
+    Attach cached pack data to a pack subtitle, if there is any.
+
+    :param subtitle: subtitle about to be downloaded; non-pack subtitles are left untouched
+    """
+    if not subtitle.is_pack:
+        return
+
+    # try retrieving the subtitle from a cached pack archive
+    cached_pack = get_pack_data(subtitle)
+    if cached_pack:
+        subtitle.pack_data = cached_pack
+
+
+def post_download_hook(subtitle):
+    """
+    Store freshly downloaded pack data in the cache and clear it from the subtitle afterwards.
+
+    :param subtitle: subtitle which has just been downloaded
+    """
+    # if a new pack was downloaded, store it in the cache; providers' download method is responsible for
+    # setting subtitle.pack_data to None in case the cached pack data we provided was successfully used
+    has_new_pack = subtitle.is_pack and subtitle.pack_data
+    if has_new_pack:
+        # store pack data in cache
+        store_pack_data(subtitle, subtitle.pack_data)
+
+    # may be redundant
+    subtitle.pack_data = None
+
+
+def language_hook(provider):
+    """
+    Return the language list to use for the given provider, as determined by config.get_lang_list.
+
+    :param provider: provider handed to config.get_lang_list
+    """
+    provider_languages = config.get_lang_list(provider=provider)
+    return provider_languages
+
+
+def download_best_subtitles(video_part_map, min_score=0, throttle_time=None, providers=None):
+    """
+    Download the best subtitles for all videos of the given map which still miss languages.
+
+    Videos flagged with ignore_all are treated as missing every configured language; for all others the
+    missing languages are determined via get_missing_languages. Videos with missing languages are refined
+    before being passed on to subliminal.
+
+    :param video_part_map: mapping of video to part
+    :param min_score: minimum score, passed on to subliminal
+    :param throttle_time: passed on to subliminal
+    :param providers: providers to use; falls back to config.providers
+    :return: result of subliminal.download_best_subtitles, or None if no languages are configured or no video
+             misses any language
+    """
    hearing_impaired = Prefs['subtitles.search.hearingImpaired']
-    languages = config.lang_list
+    languages = set([Language.fromietf(str(l)) for l in config.lang_list])
    if not languages:
        return

-    missing_languages = False
+    use_videos = []
    for video, part in video_part_map.iteritems():
-        if not Prefs['subtitles.save.filesystem']:
-            # scan for existing metadata subtitles
-            meta_subs = get_subtitles_from_metadata(part)
-            for language, subList in meta_subs.iteritems():
-                if subList:
-                    video.subtitle_languages.add(language)
-                    Log.Debug("Found metadata subtitle %s for %s", language, video)
+        if not video.ignore_all:
+            missing_languages = get_missing_languages(video, part)
+        else:
+            missing_languages = languages

-        missing_subs = (languages - video.subtitle_languages)
+        if missing_languages:
+            Log.Info(u"%s has missing languages: %s", os.path.basename(video.name), missing_languages)
+            refine_video(video, refiner_settings=config.refiner_settings)
+            use_videos.append(video)

-        # all languages are found if we either really have subs for all languages or we only want to have exactly one language
-        # and we've only found one (the case for a selected language, Prefs['subtitles.only_one'] (one found sub matches any language))
-        found_one_which_is_enough = len(video.subtitle_languages) >= 1 and Prefs['subtitles.only_one']
-        if not missing_subs or found_one_which_is_enough:
-            if found_one_which_is_enough:
-                Log.Debug('Only one language was requested, and we\'ve got a subtitle for %s', video)
-            else:
-                Log.Debug('All languages %r exist for %s', languages, video)
-            continue
-        missing_languages = True
-        break
+    # prepare blacklist
+    blacklist = get_blacklist_from_part_map(video_part_map, languages)

-    if missing_languages:
-        Log.Debug("Download best subtitles using settings: min_score: %s, hearing_impaired: %s" % (min_score, hearing_impaired))
+    if use_videos:
+        Log.Debug("Download best subtitles using settings: min_score: %s, hearing_impaired: %s, languages: %s" %
+                  (min_score, hearing_impaired, languages))

-        return subliminal.download_best_subtitles(video_part_map.keys(), languages, min_score, hearing_impaired, providers=config.providers,
-                                                  provider_configs=config.provider_settings, pool_class=config.provider_pool,
-                                                  compute_score=compute_score)
+        return subliminal.download_best_subtitles(set(use_videos), languages, min_score, hearing_impaired,
+                                                  providers=providers or config.providers,
+                                                  provider_configs=config.provider_settings,
+                                                  pool_class=config.provider_pool,
+                                                  compute_score=compute_score, throttle_time=throttle_time,
+                                                  blacklist=blacklist, throttle_callback=config.provider_throttle,
+                                                  pre_download_hook=pre_download_hook,
+                                                  post_download_hook=post_download_hook,
+                                                  language_hook=language_hook)
    Log.Debug("All languages for all requested videos exist. Doing nothing.")
@@ -15,7 +15,7 @@ from collections import OrderedDict
 import chardet

 from bs4 import UnicodeDammit
-from babelfish import Language
+from subzero.language import Language
 from subzero.analytics import track_event

 mswindows = (sys.platform == "win32")
@@ -44,6 +44,13 @@ def cast_bool(value):
    return str(value).strip() in ("true", "True")


+def cast_int(value, default=None):
+    """
+    Convert value to an int, falling back to default if that isn't possible.
+
+    :param value: value to convert
+    :param default: returned if value can't be converted
+    :return: int(value) or default
+    """
+    try:
+        return int(value)
+    except (ValueError, TypeError):
+        # ValueError: malformed string; TypeError: unsupported type, such as None (e.g. an unset value)
+        return default
+
+
 # A platform independent way to split paths which might come in with different separators.
 def split_path(str):
    if str.find('\\') != -1:
@@ -151,10 +158,11 @@ def get_video_display_title(kind, title, section_title=None, parent_title=None,
    if add_section_title:
        section_add = ("%s: " % section_title) if section_title else ""

-    if kind == "show" and parent_title:
+    if kind in ("season", "show") and parent_title:
        if season and episode:
            return '%s%s S%02dE%02d%s' % (section_add, parent_title, season or 0, episode or 0,
                                          (", %s" % title if title else ""))
+
        return '%s%s%s' % (section_add, parent_title, (", %s" % title if title else ""))
    return "%s%s" % (section_add, title)

@@ -202,7 +210,7 @@ def decode_message(s):


 def timestamp():
+    """Return the current time.time() value in milliseconds, truncated to an int."""
-    return int(time.time())
+    return int(time.time()*1000)


 def df(d):
@@ -329,9 +337,12 @@ def track_usage(category=None, action=None, label=None, value=None):
            except:
                pass

-    Thread.Create(dispatch_track_usage, category, action, label, value,
-                  identifier=Dict["anon_id"], first_use=Dict["first_use"],
-                  add=Network.PublicAddress)
+    try:
+        Thread.Create(dispatch_track_usage, category, action, label, value,
+                      identifier=Dict["anon_id"], first_use=Dict["first_use"],
+                      add=Network.PublicAddress)
+    except:
+        Log.Debug("Something went wrong when reporting anonymous user statistics: %s", traceback.format_exc())


 def dispatch_track_usage(*args, **kwargs):
@@ -344,9 +355,27 @@ def dispatch_track_usage(*args, **kwargs):
        Log.Debug("Something went wrong when reporting anonymous user statistics: %s", traceback.format_exc())


+def get_language_from_stream(lang_code):
+    """
+    Turn the language code of a media stream into a Language.
+
+    :param lang_code: language code of the stream; may be empty
+    :return: Language, or None if the code is empty or Locale.Language.Match yields nothing usable ("xx")
+    """
+    if not lang_code:
+        return None
+
+    matched = Locale.Language.Match(lang_code)
+    if not matched or matched == "xx":
+        return None
+
+    # Log.Debug("Found language: %r", lang)
+    return Language.fromietf(matched)
+
+
 def get_language(lang_short):
+    """Return the Language for the given IETF language code (via Language.fromietf)."""
    return Language.fromietf(lang_short)


+def display_language(l):
+    """
+    Build a display name for a language, with its country and script appended in parentheses, if set.
+
+    :param l: language object providing name, country and script
+    :return: "<name>" or "<name> (<country alpha2>, <script code>)"
+    """
+    qualifiers = []
+    if l.country:
+        qualifiers.append(l.country.alpha2)
+    if l.script:
+        qualifiers.append(l.script.code)
+
+    if qualifiers:
+        return "%s (%s)" % (l.name, ", ".join(qualifiers))
+    return l.name
+
+
 class PartUnknownException(Exception):
+    # NOTE(review): callers catch this around get_plex_metadata(); the raising site isn't visible here - presumably
+    # it signals that the requested media part couldn't be found
    pass
@@ -1,4 +1,4 @@
 # coding=utf-8
 from subzero.history_storage import SubtitleHistory

-get_history = lambda: SubtitleHistory(Data, int(Prefs["history_size"]))
+# factory: builds a new SubtitleHistory from Data, Thread and the "history_size" preference on every call
+get_history = lambda: SubtitleHistory(Data, Thread, int(Prefs["history_size"]))
@@ -11,7 +11,8 @@ class IgnoreDict(DictProxy):
        "section": "sections",
        "show": "series",
        "movie": "videos",
-        "episode": "videos"
+        "episode": "videos",
+        "season": "seasons",
    }

    # getItems types mapped to their verbose names
@@ -19,9 +20,10 @@ class IgnoreDict(DictProxy):
        "sections": "Section",
        "series": "Series",
        "videos": "Item",
+        "seasons": "Season",
    }

-    key_order = ("sections", "series", "videos")
+    key_order = ("sections", "series", "videos", "seasons")

    def __len__(self):
        try:
@@ -35,7 +37,7 @@ class IgnoreDict(DictProxy):
        return self.translate_keys.get(name)

    def verbose(self, name):
+        """
+        Return the verbose name for name, or None if there is none.
+
+        name is passed through translate_key first; if that yields nothing, name itself is looked up in
+        keys_verbose.
+        """
-        return self.keys_verbose.get(name)
+        return self.keys_verbose.get(self.translate_key(name) or name)

    def get_title_key(self, kind, key):
+        """Return the combined key "<kind>_<key>"."""
        return "%s_%s" % (kind, key)
@@ -57,6 +59,7 @@ class IgnoreDict(DictProxy):
        Dict.Save()

    def setup_defaults(self):
+        """Return the default structure: an empty list per key of key_order and an empty "titles" dict."""
-        return {"sections": [], "series": [], "videos": [], "titles": {}}
+        return {"sections": [], "series": [], "videos": [], "titles": {}, "seasons": []}
+

 ignore_list = IgnoreDict(Dict)
@@ -5,6 +5,11 @@ import re
 import traceback
 import types
 import os
+
+import time
+
+import datetime
+
 from ignore import ignore_list
 from helpers import is_recent, get_plex_item_display_title, query_plex, PartUnknownException
 from lib import Plex, get_intent
@@ -54,6 +59,21 @@ def get_item_kind_from_item(item):
    return PLEX_API_TYPE_MAP.get(get_item_kind(item))


+def get_item_title(item):
+    """
+    Build the display title for an episode, season, movie or series item.
+
+    :param item: item to build the title for
+    :return: display title, or None for any other kind of item
+    """
+    kind = get_item_kind_from_item(item)
+
+    if kind == "episode":
+        return get_plex_item_display_title(item, "show", parent=item.season, section_title=None,
+                                           parent_title=item.show.title)
+
+    if kind == "season":
+        return get_plex_item_display_title(item, "season", parent=item.show, section_title="Season",
+                                           parent_title=item.show.title)
+
+    if kind in ("movie", "series"):
+        return get_plex_item_display_title(item, kind, section_title=None)
+
+    return None
+
+
 def get_item_thumb(item):
    kind = get_item_kind(item)
    if kind == "Episode":
@@ -240,7 +260,7 @@ def is_ignored(rating_key, item=None):
    :return:
    """
    # item in soft ignore list
-    if rating_key in ignore_list["videos"]:
+    if ignore_list["videos"] and rating_key in ignore_list["videos"]:
        Log.Debug("Item %s is in the soft ignore list" % rating_key)
        return True

@@ -248,12 +268,17 @@ def is_ignored(rating_key, item=None):
    kind = get_item_kind(item)

    # show in soft ignore list
-    if kind == "Episode" and item.show.rating_key in ignore_list["series"]:
+    if kind == "Episode" and ignore_list["series"] and item.show.rating_key in ignore_list["series"]:
        Log.Debug("Item %s's show is in the soft ignore list" % rating_key)
        return True

+    # season in soft ignore list
+    if kind == "Episode" and ignore_list["seasons"] and item.season.rating_key in ignore_list["seasons"]:
+        Log.Debug("Item %s's season is in the soft ignore list" % rating_key)
+        return True
+
    # section in soft ignore list
-    if item.section.key in ignore_list["sections"]:
+    if ignore_list["sections"] and item.section.key in ignore_list["sections"]:
        Log.Debug("Item %s's section is in the soft ignore list" % rating_key)
        return True

@@ -303,26 +328,87 @@ def refresh_item(rating_key, force=False, timeout=8000, refresh_kind=None, paren
        # season refresh, needs explicit per-episode refresh
        refresh = [item.rating_key for item in list(Plex["library/metadata"].children(int(rating_key)))]

+    multiple = len(refresh) > 1
    for key in refresh:
        Log.Info("%s item %s", "Refreshing" if not force else "Forced-refreshing", key)
        Plex["library/metadata"].refresh(key)
+        if multiple:
+            Thread.Sleep(10.0)


-def get_current_sub(rating_key, part_id, language):
+def get_current_sub(rating_key, part_id, language, plex_item=None):
+    """
+    Load the stored subtitles of an item and pick the current subtitle for the given part and language.
+
+    The subtitle storage is returned without being destroyed here; set_mods_for_part destroys it when done.
+
+    :param rating_key: rating key of the item; used to look the item up if plex_item isn't given
+    :param part_id: id of the media part
+    :param language: language of the wanted subtitle
+    :param plex_item: optional, already fetched item
+    :return: tuple of (current_sub, stored_subs, subtitle_storage)
+    """
    from support.storage import get_subtitle_storage

-    item = get_item(rating_key)
+    item = plex_item or get_item(rating_key)
    subtitle_storage = get_subtitle_storage()
    stored_subs = subtitle_storage.load_or_new(item)
    current_sub = stored_subs.get_any(part_id, language)
    return current_sub, stored_subs, subtitle_storage


-def set_mods_for_part(rating_key, part_id, language, item_type, mods, mode="add"):
-    from support.plex_media import get_plex_metadata, scan_videos
-    from support.storage import save_subtitles
+def save_stored_sub(stored_subtitle, rating_key, part_id, language, item_type, plex_item=None, storage=None,
+                    stored_subs=None):
+    """
+    Write a stored subtitle, including its current modifications, back out for the given part.
+
+    :param stored_subtitle: stored subtitle entry to save; its mods, content, encoding and id are used
+    :param rating_key: rating key of the item the subtitle belongs to
+    :param part_id: id of the media part
+    :param language: language of the subtitle
+    :param item_type: item type, passed on to get_plex_metadata
+    :param plex_item: optional, already fetched item; looked up via rating_key if omitted
+    :param storage: optional subtitle storage; if omitted, one is created here and destroyed before returning
+    :param stored_subs: optional, already loaded stored subtitles of the item; loaded from storage if omitted
+    :return: None
+    """
+    from support.plex_media import get_plex_metadata
+    from support.scanning import scan_videos
+    from support.storage import save_subtitles, get_subtitle_storage

-    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language)
+    plex_item = plex_item or get_item(rating_key)
+    if not plex_item:
+        # nothing to work on; bail out before touching plex_item.rating_key
+        return
+
+    # decide about ownership *before* defaulting the storage, otherwise cleanup could never be True
+    cleanup = not storage
+    storage = storage or get_subtitle_storage()
+
+    try:
+        stored_subs = stored_subs or storage.load(plex_item.rating_key)
+        if not stored_subs:
+            return
+
+        try:
+            metadata = get_plex_metadata(rating_key, part_id, item_type, plex_item=plex_item)
+        except PartUnknownException:
+            return
+
+        scanned_parts = scan_videos([metadata], ignore_all=True, skip_hashing=True)
+        video, plex_part = scanned_parts.items()[0]
+
+        subtitle = ModifiedSubtitle(language, mods=stored_subtitle.mods)
+        subtitle.content = stored_subtitle.content
+        if stored_subtitle.encoding:
+            # thanks plex
+            setattr(subtitle, "_guessed_encoding", stored_subtitle.encoding)
+
+            if stored_subtitle.encoding != "utf-8":
+                # persist the normalized content, so this conversion only has to happen once
+                subtitle.normalize()
+                stored_subtitle.content = subtitle.content
+                stored_subtitle.encoding = "utf-8"
+                storage.save(stored_subs)
+
+        subtitle.plex_media_fps = plex_part.fps
+        subtitle.page_link = stored_subtitle.id
+        subtitle.language = language
+        subtitle.id = stored_subtitle.id
+
+        try:
+            save_subtitles(scanned_parts, {video: [subtitle]}, mode="m", bare_save=True)
+            Log.Debug("Modified %s subtitle for: %s:%s with: %s", language.name, rating_key, part_id,
+                      ", ".join(stored_subtitle.mods) if stored_subtitle.mods else "none")
+        except:
+            # best effort: a failed save is logged, not raised
+            Log.Error("Something went wrong when modifying subtitle: %s", traceback.format_exc())
+
+        if subtitle.storage_path:
+            stored_subtitle.last_mod = datetime.datetime.fromtimestamp(os.path.getmtime(subtitle.storage_path))
+            storage.save(stored_subs)
+    finally:
+        # only destroy the storage if it was created here; a passed-in storage belongs to the caller
+        if cleanup:
+            storage.destroy()
+
+
+def set_mods_for_part(rating_key, part_id, language, item_type, mods, mode="add"):
+    plex_item = get_item(rating_key)
+
+    if not plex_item:
+        return
+
+    current_sub, stored_subs, storage = get_current_sub(rating_key, part_id, language, plex_item=plex_item)
    if mode == "add":
        for mod in mods:
            identifier, args = SubtitleModifications.parse_identifier(mod)
@@ -352,37 +438,7 @@ def set_mods_for_part(rating_key, part_id, language, item_type, mods, mode="add"
        raise NotImplementedError("Wrong mode given")
    storage.save(stored_subs)

-    try:
-        metadata = get_plex_metadata(rating_key, part_id, item_type)
-    except PartUnknownException:
-        return
-
-    scanned_parts = scan_videos([metadata], kind="series" if item_type == "episode" else "movie", ignore_all=True,
-                                no_refining=True)
-    video, plex_part = scanned_parts.items()[0]
-
-    subtitle = ModifiedSubtitle(language, mods=current_sub.mods)
-    subtitle.content = current_sub.content
-    if current_sub.encoding:
-        # thanks plex
-        setattr(subtitle, "_guessed_encoding", current_sub.encoding)
-
-        if current_sub.encoding != "utf-8":
-            subtitle.set_encoding("utf-8")
-            current_sub.content = subtitle.content
-            current_sub.encoding = "utf-8"
-            storage.save(stored_subs)
+    save_stored_sub(current_sub, rating_key, part_id, language, item_type, plex_item=plex_item, storage=storage,
+                    stored_subs=stored_subs)

    storage.destroy()
-
-    subtitle.plex_media_fps = plex_part.fps
-    subtitle.page_link = "modify subtitles with: %s" % (", ".join(current_sub.mods) if current_sub.mods else "none")
-    subtitle.language = language
-    subtitle.id = current_sub.id
-
-    try:
-        save_subtitles(scanned_parts, {video: [subtitle]}, mode="m", bare_save=True)
-        Log.Debug("Modified %s subtitle for: %s:%s with: %s", language.name, rating_key, part_id,
-                  ", ".join(current_sub.mods) if current_sub.mods else "none")
-    except:
-        Log.Error("Something went wrong when modifying subtitle: %s", traceback.format_exc())
@@ -9,29 +9,33 @@ import subtitlehelpers
 from config import config as sz_config


+SECONDARY_TAGS = ['forced', 'normal', 'default', 'embedded', 'embedded-forced', 'custom', 'hi', 'cc', 'sdh']
+
+
 def find_subtitles(part):
    lang_sub_map = {}
    part_filename = helpers.unicodize(part.file)
    part_basename = os.path.splitext(os.path.basename(part_filename))[0]
    use_filesystem = helpers.cast_bool(Prefs["subtitles.save.filesystem"])
-    paths = [os.path.dirname(part_filename)] if use_filesystem else []
+    sub_dir_custom = Prefs["subtitles.save.subFolder.Custom"].strip() \
+        if Prefs["subtitles.save.subFolder.Custom"] else None

-    global_subtitle_folder = None
+    use_sub_subfolder = Prefs["subtitles.save.subFolder"] != "current folder" and not sub_dir_custom
+    sub_subfolder = None
+    paths = [os.path.dirname(part_filename)] if use_filesystem else []

    global_folders = []

    if use_filesystem:
        # Check for local subtitles subdirectory
        sub_dir_base = paths[0]
-
        sub_dir_list = []

-        if Prefs["subtitles.save.subFolder"] != "current folder":
+        if use_sub_subfolder:
            # got selected subfolder
-            sub_dir_list.append(os.path.join(sub_dir_base, Prefs["subtitles.save.subFolder"]))
-
-        sub_dir_custom = Prefs["subtitles.save.subFolder.Custom"].strip() \
-            if Prefs["subtitles.save.subFolder.Custom"] else None
+            sub_subfolder = os.path.join(sub_dir_base, Prefs["subtitles.save.subFolder"])
+            sub_dir_list.append(sub_subfolder)
+            sub_subfolder = os.path.normpath(helpers.unicodize(sub_subfolder))

        if sub_dir_custom:
            # got custom subfolder
@@ -84,8 +88,12 @@ def find_subtitles(part):
                media_files.append(root)

    # cleanup any leftover subtitle if no associated media file was found
-    if helpers.cast_bool(Prefs["subtitles.autoclean"]):
+    if use_filesystem and helpers.cast_bool(Prefs["subtitles.autoclean"]):
        for path in paths:
+            # only housekeep in sub_subfolder if sub_subfolder is used
+            if use_sub_subfolder and path != sub_subfolder and not sz_config.advanced.thorough_cleaning:
+                continue
+
            # we can't housekeep the global subtitle folders as we don't know about *all* media files
            # in a library; skip them
            skip_path = False
@@ -105,11 +113,10 @@ def find_subtitles(part):
                if os.path.isfile(enc_fn):
                    (root, ext) = os.path.splitext(file_path_listing)
                    # it's a subtitle file
-                    if ext.lower()[1:] in config.SUBTITLE_EXTS:
+                    if ext.lower()[1:] in config.SUBTITLE_EXTS_BASE:
                        # get fn without forced/default/normal tag
                        split_tag = root.rsplit(".", 1)
-                        if len(split_tag) > 1 and split_tag[1].lower() in ['forced', 'normal', 'default', 'embedded',
-                                                                           'custom']:
+                        if len(split_tag) > 1 and split_tag[1].lower() in SECONDARY_TAGS:
                            root = split_tag[0]

                        # get associated media file name without language
@@ -135,7 +142,7 @@ def find_subtitles(part):
        # get fn without forced/default/normal tag
        split_tag = local_basename.rsplit(".", 1)
        has_additional_tag = False
-        if len(split_tag) > 1 and split_tag[1].lower() in ['forced', 'normal', 'default', 'embedded', 'custom']:
+        if len(split_tag) > 1 and split_tag[1].lower() in SECONDARY_TAGS:
            local_basename = split_tag[0]
            has_additional_tag = True

@@ -159,7 +166,7 @@ def find_subtitles(part):
                continue

        # determine whether to pick up the subtitle based on our match strictness
-        elif not filename_matches_part:
+        if not filename_matches_part:
            if sz_config.ext_match_strictness == "strict" or (
                            sz_config.ext_match_strictness == "loose" and not filename_contains_part):
                # Log.Debug("%s doesn't match %s, skipping" % (helpers.unicodize(local_filename),
@@ -2,10 +2,17 @@
 import traceback
 import time

-from support.config import config
-from support.helpers import get_plex_item_display_title, cast_bool
+import os
+
+from babelfish import LanguageReverseError
+
+from support.config import config, TEXT_SUBTITLE_EXTS
+from support.helpers import get_plex_item_display_title, cast_bool, get_language_from_stream
 from support.items import get_item
 from support.lib import Plex
+from support.storage import get_subtitle_storage
+from subzero.video import has_external_subtitle
+from subzero.language import Language


 def item_discover_missing_subs(rating_key, kind="show", added_at=None, section_title=None, internal=False, external=True, languages=()):
@@ -17,11 +24,59 @@ def item_discover_missing_subs(rating_key, kind="show", added_at=None, section_t
    else:
        item_title = get_plex_item_display_title(item, kind, section_title=section_title)

+    subtitle_storage = get_subtitle_storage()
+    stored_subs = subtitle_storage.load(rating_key)
+    subtitle_storage.destroy()
+
+    subtitle_target_dir, tdir_is_absolute = config.subtitle_sub_dir
+
    missing = set()
-    languages_set = set(languages)
+    languages_set = set([Language.fromietf(str(l)) for l in languages])
    for media in item.media:
-        existing_subs = {"internal": [], "external": [], "count": 0}
+        existing_subs = {"internal": [], "external": [], "own_external": [], "count": 0}
        for part in media.parts:
+
+            # did we already download an external subtitle before?
+            if subtitle_target_dir and stored_subs:
+                for language in languages_set:
+                    if has_external_subtitle(part.id, stored_subs, language):
+                        # check the existence of the actual subtitle file
+
+                        # get media filename without extension
+                        part_basename = os.path.splitext(os.path.basename(part.file))[0]
+
+                        # compute target directory for subtitle
+                        # fixme: move to central location
+                        if tdir_is_absolute:
+                            possible_subtitle_path_base = subtitle_target_dir
+                        else:
+                            possible_subtitle_path_base = os.path.join(os.path.dirname(part.file), subtitle_target_dir)
+
+                        possible_subtitle_path_base = os.path.realpath(possible_subtitle_path_base)
+
+                        # folder actually exists?
+                        if not os.path.isdir(possible_subtitle_path_base):
+                            continue
+
+                        found_any = False
+                        for ext in config.subtitle_formats:
+                            if cast_bool(Prefs['subtitles.only_one']):
+                                possible_subtitle_path = os.path.join(possible_subtitle_path_base,
+                                                                      u"%s.%s" % (part_basename, ext))
+                            else:
+                                possible_subtitle_path = os.path.join(possible_subtitle_path_base,
+                                                                      u"%s.%s.%s" % (part_basename, language, ext))
+
+                            # check for subtitle existence
+                            if os.path.isfile(possible_subtitle_path):
+                                found_any = True
+                                Log.Debug(u"Found: %s", possible_subtitle_path)
+                                break
+
+                        if found_any:
+                            existing_subs["own_external"].append(language)
+                            existing_subs["count"] = existing_subs["count"] + 1
+
            for stream in part.streams:
                if stream.stream_type == 3:
                    if stream.index:
@@ -29,18 +84,72 @@ def item_discover_missing_subs(rating_key, kind="show", added_at=None, section_t
                    else:
                        key = "external"

-                    existing_subs[key].append(Locale.Language.Match(stream.language_code or ""))
-                    existing_subs["count"] = existing_subs["count"] + 1
+                    if not config.exotic_ext and stream.codec.lower() not in TEXT_SUBTITLE_EXTS:
+                        continue

-        missing_from_part = set(languages_set)
+                    # treat unknown language as lang1?
+                    if not stream.language_code and config.treat_und_as_first:
+                        lang = Language.fromietf(str(list(config.lang_list)[0]))
+
+                    # we can't parse empty language codes
+                    elif not stream.language_code or not stream.codec:
+                        continue
+
+                    else:
+                        # parse with internal language parser first
+                        try:
+                            lang = get_language_from_stream(stream.language_code)
+                            if not lang:
+                                if config.treat_und_as_first:
+                                    lang = Language.fromietf(str(list(config.lang_list)[0]))
+                                else:
+                                    continue
+
+                        except (ValueError, LanguageReverseError):
+                            continue
+
+                    if lang:
+                        # Log.Debug("Found babelfish language: %r", lang)
+                        existing_subs[key].append(lang)
+                        existing_subs["count"] = existing_subs["count"] + 1
+
+        missing_from_part = set([Language.fromietf(str(l)) for l in languages])
        if existing_subs["count"]:
-            existing_flat = set((existing_subs["internal"] if internal else []) + (existing_subs["external"] if external else []))
-            if languages_set.issubset(existing_flat) or (len(existing_flat) >= 1 and Prefs['subtitles.only_one']):
+
+            # fixme: this is actually somewhat broken with IETF, as Plex doesn't store the country portion
+            # (pt instead of pt-BR) inside the database. So it might actually download pt-BR if there's a local pt-BR
+            # subtitle but not our own.
+            existing_flat = set((existing_subs["internal"] if internal else [])
+                                + (existing_subs["external"] if external else [])
+                                + existing_subs["own_external"])
+
+            check_languages = set([Language.fromietf(str(l)) for l in languages])
+            alpha3_map = {}
+            if config.ietf_as_alpha3:
+                for language in existing_flat:
+                    if language.country:
+                        alpha3_map[language.alpha3] = language.country
+                        language.country = None
+
+                for language in check_languages:
+                    if language.country:
+                        alpha3_map[language.alpha3] = language.country
+                        language.country = None
+
+            # compare sets of strings, not sets of different Language instances
+            check_languages_str = set(str(l) for l in check_languages)
+            existing_flat_str = set(str(l) for l in existing_flat)
+
+            if check_languages_str.issubset(existing_flat_str) or \
+                    (len(existing_flat) >= 1 and Prefs['subtitles.only_one']):
                # all subs found
                #Log.Info(u"All subtitles exist for '%s'", item_title)
                continue

-            missing_from_part = languages_set - existing_flat
+            missing_from_part = set(Language.fromietf(l) for l in check_languages_str - existing_flat_str)
+            if config.ietf_as_alpha3:
+                for language in missing_from_part:
+                    language.country = alpha3_map.get(language.alpha3, None)

        if missing_from_part:
            Log.Info(u"Subs still missing for '%s' (%s: %s): %s", item_title, rating_key, media.id,
@@ -48,6 +157,8 @@ def item_discover_missing_subs(rating_key, kind="show", added_at=None, section_t
            missing.update(missing_from_part)

    if missing:
+        # deduplicate
+        missing = set(Language.fromietf(la) for la in set(str(l) for l in missing))
        return added_at, item_id, item_title, item, missing


@@ -60,7 +171,7 @@ def items_get_all_missing_subs(items, sleep_after_request=False):
                kind=kind,
                added_at=added_at,
                section_title=section_title,
-                languages=config.lang_list,
+                languages=config.lang_list.copy(),
                internal=cast_bool(Prefs["subtitles.scan.embedded"]),
                external=cast_bool(Prefs["subtitles.scan.external"])
            )
@@ -1,13 +1,12 @@
 # coding=utf-8

 import os
-from urllib2 import URLError

 import helpers
-from config import config
 from items import get_item
-from lib import get_intent, Plex
-from subzero.video import parse_video
+from lib import Plex
+from support.config import TEXT_SUBTITLE_EXTS, config
+

 def get_metadata_dict(item, part, add):
    data = {
@@ -45,10 +44,11 @@ def get_plexapi_stream_info(plex_item, part_id=None):
        return d

    data["video_codec"] = current_media.video_codec
-    data["audio_codec"] = current_media.audio_codec.upper()
+    if current_media.audio_codec:
+        data["audio_codec"] = current_media.audio_codec.upper()

-    if data["audio_codec"] == "DCA":
-        data["audio_codec"] = "DTS"
+        if data["audio_codec"] == "DCA":
+            data["audio_codec"] = "DTS"

    if current_media.audio_channels == 8:
        data["audio_channels"] = "7.1"
@@ -153,10 +153,9 @@ def get_stream_fps(streams):


 def get_media_item_ids(media, kind="series"):
-    ids = []
-    if kind == "movies":
-        ids.append(media.id)
-    else:
+    # fixme: does this work correctly for full series force-refreshes and its intents?
+    ids = [media.id]
+    if kind == "series":
        for season in media.seasons:
            for episode in media.seasons[season].episodes:
                ids.append(media.seasons[season].episodes[episode].id)
@@ -164,98 +163,51 @@ def get_media_item_ids(media, kind="series"):
    return ids


-def scan_video(pms_video_info, ignore_all=False, hints=None, rating_key=None, no_refining=False):
-    """
-    returnes a subliminal/guessit-refined parsed video
-    :param pms_video_info: 
-    :param ignore_all: 
-    :param hints: 
-    :param rating_key: 
-    :return: 
-    """
-    embedded_subtitles = not ignore_all and Prefs['subtitles.scan.embedded']
-    external_subtitles = not ignore_all and Prefs['subtitles.scan.external']
-
-    plex_part = pms_video_info["plex_part"]
-
-    if ignore_all:
-        Log.Debug("Force refresh intended.")
-
-    Log.Debug("Scanning video: %s, external_subtitles=%s, embedded_subtitles=%s" % (
-        plex_part.file, external_subtitles, embedded_subtitles))
-
-    known_embedded = []
+def get_all_parts(plex_item):
    parts = []
-    for media in list(Plex["library"].metadata(rating_key))[0].media:
+    for media in plex_item.media:
        parts += media.parts

-    plexpy_part = None
-    for part in parts:
-        if int(part.id) == int(plex_part.id):
-            plexpy_part = part
-
-    # embedded subtitles
-    if plexpy_part:
-        for stream in plexpy_part.streams:
-            # subtitle stream
-            if stream.stream_type == 3:
-                if (config.forced_only and getattr(stream, "forced")) or \
-                        (not config.forced_only and not getattr(stream, "forced")):
-
-                    # embedded subtitle
-                    if not stream.stream_key:
-                        if config.exotic_ext or stream.codec in ("srt", "ass", "ssa"):
-                            lang_code = stream.language_code
-
-                            # treat unknown language as lang1?
-                            if not lang_code and config.treat_und_as_first:
-                                lang_code = list(config.lang_list)[0].alpha3
-                            known_embedded.append(lang_code)
-    else:
-        Log.Warn("Part %s missing of %s, not able to scan internal streams", plex_part.id, rating_key)
-
-    try:
-        # get basic video info scan (filename)
-        video = parse_video(plex_part.file, pms_video_info, hints, external_subtitles=external_subtitles,
-                            embedded_subtitles=embedded_subtitles, known_embedded=known_embedded,
-                            forced_only=config.forced_only, no_refining=no_refining)
-
-        # add video fps info
-        video.fps = plex_part.fps
-        return video
-
-    except ValueError:
-        Log.Warn("File could not be guessed by subliminal: %s" % plex_part.file)
+    return parts


-def scan_videos(videos, kind="series", ignore_all=False, no_refining=False):
-    """
-    receives a list of videos containing dictionaries returned by media_to_videos
-    :param videos:
-    :param kind: series or movies
-    :return: dictionary of subliminal.video.scan_video, key=subliminal scanned video, value=plex file part
-    """
-    ret = {}
-    for video in videos:
-        intent = get_intent()
-        force_refresh = intent.get("force", video["id"], video["series_id"], video["season_id"])
-        Log.Debug("Determining force-refresh (video: %s, series: %s, season: %s), result: %s"
-                  % (video["id"], video["series_id"], video["season_id"], force_refresh))
+def get_embedded_subtitle_streams(part, requested_language=None, skip_duplicate_unknown=True, get_forced=None):
+    """
+    Collect the embedded text subtitle streams of a part.
+
+    Only streams of stream_type 3 without a stream_key and with a codec listed in TEXT_SUBTITLE_EXTS
+    are considered. The codec is compared case-insensitively, like the other stream scans do.
+
+    :param part: part whose streams get inspected
+    :param requested_language: if given, only the first stream of exactly that language is returned
+    :param skip_duplicate_unknown: with config.treat_und_as_first, only consider the first stream
+                                   without a detectable language
+    :param get_forced: None: don't filter; True: forced streams only; False: non-forced streams only
+    :return: list of dicts with the keys "stream", "is_unknown" and "language"
+    """
+    streams = []
+    has_unknown = False
+    for stream in part.streams:
+        # subtitle stream
+        if stream.stream_type == 3 and not stream.stream_key and stream.codec \
+                and stream.codec.lower() in TEXT_SUBTITLE_EXTS:
+            language = helpers.get_language_from_stream(stream.language_code)
+            is_unknown = False
+            found_requested_language = requested_language and requested_language == language

-        hints = helpers.get_item_hints(video)
-        video["plex_part"].fps = get_stream_fps(video["plex_part"].streams)
-        scanned_video = scan_video(video, ignore_all=force_refresh or ignore_all, hints=hints,
-                                   rating_key=video["id"], no_refining=no_refining)
+            if get_forced is not None:
+                if (get_forced and not stream.forced) or (not get_forced and stream.forced):
+                    continue

-        if not scanned_video:
-            continue
+            if not language and config.treat_und_as_first:
+                # only consider first unknown subtitle stream
+                if has_unknown and skip_duplicate_unknown:
+                    continue

-        scanned_video.id = video["id"]
-        part_metadata = video.copy()
-        del part_metadata["plex_part"]
-        scanned_video.plexapi_metadata = part_metadata
-        ret[scanned_video] = video["plex_part"]
-    return ret
+                # treat the unknown language as the first configured language
+                language = list(config.lang_list)[0]
+                is_unknown = True
+                has_unknown = True
+
+            if not requested_language or found_requested_language:
+                streams.append({"stream": stream, "is_unknown": is_unknown, "language": language})
+
+                # a requested language only needs its first match
+                if found_requested_language:
+                    break
+
+    return streams
+
+
+def get_part(plex_item, part_id):
+    """
+    Look up a part of a plex item by its id.
+
+    Ids are compared by their string representation, so an int id matches its str form.
+
+    :param plex_item: item whose media (and their parts) get searched
+    :param part_id: id of the wanted part
+    :return: the first matching part, None if no part matches
+    """
+    all_parts = (candidate for media in plex_item.media for candidate in media.parts)
+    return next((candidate for candidate in all_parts if str(candidate.id) == str(part_id)), None)


 def get_plex_metadata(rating_key, part_id, item_type, plex_item=None):
@@ -275,11 +227,7 @@ def get_plex_metadata(rating_key, part_id, item_type, plex_item=None):
        return

    # find current part
-    current_part = None
-    for media in plex_item.media:
-        for part in media.parts:
-            if str(part.id) == str(part_id):
-                current_part = part
+    current_part = get_part(plex_item, part_id)

    if not current_part:
        raise helpers.PartUnknownException("Part unknown")
@@ -334,6 +282,24 @@ def get_plex_metadata(rating_key, part_id, item_type, plex_item=None):
    return metadata


+def get_blacklist_from_part_map(video_part_map, languages):
+    """
+    Collect the blacklist entries stored for all given parts and languages.
+
+    :param video_part_map: dict mapping scanned videos (carrying plexapi_metadata) to their plex parts
+    :param languages: languages to look up blacklists for
+    :return: flat list of 2-tuples of strings, one per key of every non-empty stored blacklist
+    """
+    from support.storage import get_subtitle_storage
+    subtitle_storage = get_subtitle_storage()
+    blacklist = []
+    try:
+        for video, part in video_part_map.iteritems():
+            stored_subs = subtitle_storage.load_or_new(video.plexapi_metadata["item"])
+            for language in languages:
+                # only the blacklist itself is needed; the second return value is unused here
+                current_bl, _subs = stored_subs.get_blacklist(part.id, language)
+                if not current_bl:
+                    continue
+
+                # extend in place instead of rebuilding the whole list for every hit
+                blacklist.extend((str(a), str(b)) for a, b in current_bl.keys())
+    finally:
+        # always call destroy(), even if loading or reading a blacklist raised
+        subtitle_storage.destroy()
+
+    return blacklist
+
+
 class PMSMediaProxy(object):
    """
    Proxy object for getting data from a mediatree items "internally" via the PMS
@@ -0,0 +1,124 @@
+# coding=utf-8
+import traceback
+import helpers
+
+from support.lib import Plex, get_intent
+from support.plex_media import get_stream_fps
+from support.storage import get_subtitle_storage
+from support.config import config, TEXT_SUBTITLE_EXTS
+
+from subzero.video import parse_video, set_existing_languages
+
+
+def scan_video(pms_video_info, ignore_all=False, hints=None, rating_key=None, providers=None, skip_hashing=False):
+    """
+    Parse the file of a plex part into a video and attach what is known about its existing subtitles.
+
+    :param pms_video_info: video dictionary; its "plex_part" entry is the part to scan
+    :param ignore_all: force refresh; existing languages are not set on the video
+    :param hints: passed on to parse_video
+    :param rating_key: key used to fetch the item from the library and to load the stored subtitles
+    :param providers: passed on to parse_video
+    :param skip_hashing: passed on to parse_video; config.low_impact_mode enables it as well
+    :return: the parsed video; None if a ValueError was raised while parsing
+    """
+    scan_embedded = not ignore_all and Prefs['subtitles.scan.embedded']
+    scan_external = not ignore_all and Prefs['subtitles.scan.external']
+
+    plex_part = pms_video_info["plex_part"]
+
+    if ignore_all:
+        Log.Debug("Force refresh intended.")
+
+    Log.Debug("Scanning video: %s, external_subtitles=%s, embedded_subtitles=%s" % (
+        plex_part.file, scan_external, scan_embedded))
+
+    # gather the parts of every media of the first library entry for the rating key
+    library_item = list(Plex["library"].metadata(rating_key))[0]
+    all_parts = []
+    for media in library_item.media:
+        all_parts += media.parts
+
+    # if several parts share the id, the last one wins
+    same_id_parts = [candidate for candidate in all_parts if int(candidate.id) == int(plex_part.id)]
+    plexpy_part = same_id_parts[-1] if same_id_parts else None
+
+    # embedded subtitles
+    known_embedded = []
+    if not plexpy_part:
+        Log.Warn("Part %s missing of %s, not able to scan internal streams", plex_part.id, rating_key)
+    else:
+        for stream in plexpy_part.streams:
+            # subtitle streams only
+            if stream.stream_type != 3:
+                continue
+
+            # the stream's forced flag has to match the forced_only setting
+            if bool(config.forced_only) != bool(stream.forced):
+                continue
+
+            # embedded streams only (no stream key), and only those with a codec
+            # fixme: tap into external subtitles here instead of scanning for ourselves later?
+            if stream.stream_key or not stream.codec:
+                continue
+
+            if not (config.exotic_ext or stream.codec.lower() in TEXT_SUBTITLE_EXTS):
+                continue
+
+            stream_language = helpers.get_language_from_stream(stream.language_code)
+
+            # treat unknown language as lang1?
+            if not stream_language and config.treat_und_as_first:
+                stream_language = list(config.lang_list)[0]
+
+            if stream_language:
+                known_embedded.append(stream_language.alpha3)
+
+    storage = get_subtitle_storage()
+    stored_subs = storage.load(rating_key)
+    storage.destroy()
+
+    try:
+        # basic video info scan (filename); existing languages get attached separately below
+        video = parse_video(plex_part.file, hints, skip_hashing=config.low_impact_mode or skip_hashing,
+                            providers=providers)
+
+        if not ignore_all:
+            set_existing_languages(video, pms_video_info, external_subtitles=scan_external,
+                                   embedded_subtitles=scan_embedded, known_embedded=known_embedded,
+                                   forced_only=config.forced_only, stored_subs=stored_subs, languages=config.lang_list,
+                                   only_one=config.only_one)
+
+        # add video fps info
+        video.fps = plex_part.fps
+        return video
+
+    except ValueError:
+        Log.Warn("File could not be guessed: %s: %s", plex_part.file, traceback.format_exc())
+
+
+def scan_videos(videos, ignore_all=False, providers=None, skip_hashing=False):
+    """
+    Scan a list of video dictionaries and map every scanned video to its plex part.
+
+    :param videos: list of video dictionaries as returned by media_to_videos
+    :param ignore_all: scan every video as if it was force-refreshed
+    :param providers: providers handed to scan_video; if empty, config.get_providers is asked
+                      for the video's media type
+    :param skip_hashing: passed on to scan_video
+    :return: dictionary, key=scanned video, value=plex file part; videos that couldn't be scanned are left out
+    """
+    scanned = {}
+    for info in videos:
+        force_refresh = get_intent().get("force", info["id"], info["series_id"], info["season_id"])
+        Log.Debug("Determining force-refresh (video: %s, series: %s, season: %s), result: %s"
+                  % (info["id"], info["series_id"], info["season_id"], force_refresh))
+
+        hints = helpers.get_item_hints(info)
+        plex_part = info["plex_part"]
+        plex_part.fps = get_stream_fps(plex_part.streams)
+
+        if providers:
+            active_providers = providers
+        else:
+            media_type = "series" if info["type"] == "episode" else "movies"
+            active_providers = config.get_providers(media_type=media_type)
+
+        scanned_video = scan_video(info, ignore_all=force_refresh or ignore_all, hints=hints,
+                                   rating_key=info["id"], providers=active_providers,
+                                   skip_hashing=skip_hashing)
+        if not scanned_video:
+            continue
+
+        # the metadata kept on the video is the video dictionary minus the part itself
+        part_metadata = info.copy()
+        part_metadata.pop("plex_part")
+
+        scanned_video.id = info["id"]
+        scanned_video.plexapi_metadata = part_metadata
+        scanned_video.ignore_all = force_refresh
+        scanned[scanned_video] = info["plex_part"]
+    return scanned
@@ -4,21 +4,24 @@ import datetime
 import logging
 import traceback

+from config import config

 def parse_frequency(s):
+    """
+    Split a frequency setting into its amount and unit.
+
+    :param s: "never", None, or a string of exactly three whitespace-separated tokens
+    :return: (None, None) for "never"/None, otherwise (second token as int, third token);
+             the first token is not used
+    """
-    if s == "never" or s == None:
+    if s == "never" or s is None:
        return None, None
    kind, num, unit = s.split()
    return int(num), unit


 class DefaultScheduler(object):
-    thread = None
+    queue_thread = None
+    scheduler_thread = None
    running = False
    registry = None

    def __init__(self):
-        self.thread = None
+        self.queue_thread = None
+        self.scheduler_thread = None
        self.running = False
        self.registry = []

@@ -47,6 +50,7 @@ class DefaultScheduler(object):
            if Dict["tasks"]:
                for task_name in Dict["tasks"].keys():
                    if task_name == "queue":
+                        Dict["tasks"][task_name] = []
                        continue

                    Dict["tasks"][task_name]["data"] = {}
@@ -58,6 +62,7 @@ class DefaultScheduler(object):
            raise NotImplementedError("Task missing! %s" % name)

        Dict["tasks"][name]["data"] = {}
+        Dict["tasks"][name]["running"] = False
        Dict.Save()
        Log.Debug("Task data cleared: %s", name)

@@ -78,7 +83,8 @@ class DefaultScheduler(object):

    def run(self):
        self.running = True
-        self.thread = Thread.Create(self.worker)
+        self.scheduler_thread = Thread.Create(self.scheduler_worker)
+        self.queue_thread = Thread.Create(self.queue_worker)

    def stop(self):
        self.running = False
@@ -113,6 +119,7 @@ class DefaultScheduler(object):

    def run_task(self, name, *args, **kwargs):
        task = self.tasks[name]["task"]
+
        if task.running:
            Log.Debug("Scheduler: Not running %s, as it's currently running.", name)
            return False
@@ -124,8 +131,12 @@ class DefaultScheduler(object):
        except Exception, e:
            Log.Error("Scheduler: Something went wrong when running %s: %s", name, traceback.format_exc())
        finally:
-            task.post_run(Dict["tasks"][name]["data"])
+            try:
+                task.post_run(Dict["tasks"][name]["data"])
+            except:
+                Log.Error("Scheduler: task.post_run failed for %s: %s", name, traceback.format_exc())
            Dict.Save()
+            config.sync_cache()

    def dispatch_task(self, *args, **kwargs):
        if "queue" not in Dict["tasks"]:
@@ -157,7 +168,7 @@ class DefaultScheduler(object):
                continue
            Log.Debug("Scheduler: Not sending signal %s to task %s, because: not running", name, task_name)

-    def worker(self):
+    def queue_worker(self):
        Thread.Sleep(10.0)
        while 1:
            if not self.running:
@@ -170,10 +181,18 @@ class DefaultScheduler(object):
                Dict["tasks"]["queue"] = []
                Dict.Save()
                for args, kwargs in queue:
-                    Log.Debug("Dispatching single task: %s, %s", args, kwargs)
+                    Log.Debug("Queue: Dispatching single task: %s, %s", args, kwargs)
                    Thread.Create(self.run_task, True, *args, **kwargs)
                    Thread.Sleep(5.0)

+            Thread.Sleep(1)
+
+    def scheduler_worker(self):
+        Thread.Sleep(10.0)
+        while 1:
+            if not self.running:
+                break
+
            # scheduled tasks
            for name in self.tasks.keys():
                now = datetime.datetime.now()
@@ -193,6 +212,14 @@ class DefaultScheduler(object):
                if not frequency_num:
                    continue

+                # run legacy SARAM once
+                if name == "SearchAllRecentlyAddedMissing" and ("hasRunLSARAM" not in Dict or not Dict["hasRunLSARAM"]):
+                    task = self.tasks["LegacySearchAllRecentlyAddedMissing"]["task"]
+                    task.last_run = None
+                    name = "LegacySearchAllRecentlyAddedMissing"
+                    Dict["hasRunLSARAM"] = True
+                    Dict.Save()
+
                if not task.last_run or (task.last_run + datetime.timedelta(**{frequency_key: frequency_num}) <= now):
                    # fixme: scheduled tasks run synchronously. is this the best idea?
                    Thread.Create(self.run_task, True, name)
@@ -4,9 +4,12 @@ import datetime
 import os
 import pprint
 import copy
+import traceback
+import types

 from subliminal_patch.core import save_subtitles as subliminal_save_subtitles
 from subzero.subtitle_storage import StoredSubtitlesManager
+from subzero.lib.io import FileIO

 from subtitlehelpers import force_utf8
 from config import config
@@ -16,13 +19,14 @@ from support.items import get_item


 def get_subtitle_storage():
-    return StoredSubtitlesManager(Data, get_item)
+    return StoredSubtitlesManager(Data, Thread, get_item)


-def store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage_type, mode="a"):
+def store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage_type, mode="a", set_current=True):
    """
    stores information about downloaded subtitles in plex's Dict()
    """
+    subtitle_storage = get_subtitle_storage()
    for video, video_subtitles in downloaded_subtitles.items():
        part = scanned_video_part_map[video]
        part_id = str(part.id)
@@ -31,15 +35,25 @@ def store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage_ty
        metadata = video.plexapi_metadata
        title = get_title_for_video_metadata(metadata)

-        subtitle_storage = get_subtitle_storage()
-        stored_subs = subtitle_storage.load_or_new(plex_item)
+        stored_subs = subtitle_storage.load(video_id)
+        is_new = False
+        if not stored_subs:
+            is_new = True
+            Log.Debug(u"Creating new subtitle storage: %s, %s", video_id, part_id)
+            stored_subs = subtitle_storage.new(plex_item)

        for subtitle in video_subtitles:
            lang = str(subtitle.language)
-            subtitle.set_encoding("utf-8")
-            Log.Debug(u"Adding subtitle to storage: %s, %s, %s, %s" % (video_id, part_id, title,
-                                                                       subtitle.guess_encoding()))
-            ret_val = stored_subs.add(part_id, lang, subtitle, storage_type, mode=mode)
+            subtitle.normalize()
+            Log.Debug(u"Adding subtitle to storage: %s, %s, %s, %s, %s" % (video_id, part_id, lang, title,
+                                                                           subtitle.guess_encoding()))
+
+            last_mod = None
+            if subtitle.storage_path:
+                last_mod = datetime.datetime.fromtimestamp(os.path.getmtime(subtitle.storage_path))
+
+            ret_val = stored_subs.add(part_id, lang, subtitle, storage_type, mode=mode, last_mod=last_mod,
+                                      set_current=set_current)

            if ret_val:
                Log.Debug("Subtitle stored")
@@ -47,9 +61,11 @@ def store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage_ty
            else:
                Log.Debug("Subtitle already existing in storage")

-        Log.Debug("Saving subtitle storage for %s" % video_id)
-        subtitle_storage.save(stored_subs)
-        subtitle_storage.destroy()
+        if is_new or video_subtitles:
+            Log.Debug("Saving subtitle storage for %s" % video_id)
+            subtitle_storage.save(stored_subs)
+
+    subtitle_storage.destroy()


 def reset_storage(key):
@@ -71,32 +87,43 @@ def log_storage(key):
        Log.Debug(pprint.pformat(Dict[key]))


-def save_subtitles_to_file(subtitles):
+def get_target_folder(file_path):
+    """
+    Determine the folder subtitles for the given file should be written to, creating it if missing.
+
+    A custom sub folder (Prefs "subtitles.save.subFolder.Custom") takes precedence over the
+    predefined one (Prefs "subtitles.save.subFolder").
+
+    :param file_path: path of the video file; relative sub folders are resolved against its directory
+    :return: the target folder, or None if no custom folder is set and the sub folder
+             preference is "current folder"
+    """
+    fld = None
    fld_custom = Prefs["subtitles.save.subFolder.Custom"].strip() \
        if Prefs["subtitles.save.subFolder.Custom"] else None

+    if fld_custom or Prefs["subtitles.save.subFolder"] != "current folder":
+        # specific subFolder requested, create it if it doesn't exist
+        fld_base = os.path.split(file_path)[0]
+        if fld_custom:
+            if fld_custom.startswith("/"):
+                # absolute folder
+                fld = fld_custom
+            else:
+                fld = os.path.join(fld_base, fld_custom)
+        else:
+            fld = os.path.join(fld_base, Prefs["subtitles.save.subFolder"])
+        # NOTE(review): force_unicode presumably converts the path to unicode - confirm against its definition
+        fld = force_unicode(fld)
+        if not os.path.exists(fld):
+            os.makedirs(fld)
+    return fld
+
+
+def save_subtitles_to_file(subtitles, tags=None, forced_tag=None):
+    forced_tag = forced_tag or config.forced_only
    for video, video_subtitles in subtitles.items():
        if not video_subtitles:
            continue

-        fld = None
-        if fld_custom or Prefs["subtitles.save.subFolder"] != "current folder":
-            # specific subFolder requested, create it if it doesn't exist
-            fld_base = os.path.split(video.name)[0]
-            if fld_custom:
-                if fld_custom.startswith("/"):
-                    # absolute folder
-                    fld = fld_custom
-                else:
-                    fld = os.path.join(fld_base, fld_custom)
-            else:
-                fld = os.path.join(fld_base, Prefs["subtitles.save.subFolder"])
-            fld = force_unicode(fld)
-            if not os.path.exists(fld):
-                os.makedirs(fld)
-        subliminal_save_subtitles(video, video_subtitles, directory=fld, single=cast_bool(Prefs['subtitles.only_one']),
-                                  chmod=config.chmod, forced_tag=config.forced_only, path_decoder=force_unicode,
-                                  debug_mods=config.debug_mods, formats=config.subtitle_formats)
+        if not isinstance(video, types.StringTypes):
+            file_path = video.name
+        else:
+            file_path = video
+
+        fld = get_target_folder(file_path)
+        subliminal_save_subtitles(file_path, video_subtitles, directory=fld, single=cast_bool(Prefs['subtitles.only_one']),
+                                  chmod=config.chmod, forced_tag=forced_tag, path_decoder=force_unicode,
+                                  debug_mods=config.debug_mods, formats=config.subtitle_formats, tags=tags)
    return True


@@ -116,10 +143,12 @@ def save_subtitles_to_metadata(videos, subtitles):
    return True


-def save_subtitles(scanned_video_part_map, downloaded_subtitles, mode="a", bare_save=False, mods=None):
+def save_subtitles(scanned_video_part_map, downloaded_subtitles, mode="a", bare_save=False, mods=None,
+                   set_current=True):
    """
     
-    :param scanned_video_part_map: 
+    :param set_current: save the subtitle as the current one
+    :param scanned_video_part_map:
    :param downloaded_subtitles: 
    :param mode: 
    :param bare_save: don't trigger anything; don't store information
@@ -129,6 +158,8 @@ def save_subtitles(scanned_video_part_map, downloaded_subtitles, mode="a", bare_
    meta_fallback = False
    save_successful = False

+    # big fixme: scanned_video_part_map isn't needed to the current extent. rewrite.
+
    if mods:
        for video, video_subtitles in downloaded_subtitles.items():
            if not video_subtitles:
@@ -140,31 +171,66 @@ def save_subtitles(scanned_video_part_map, downloaded_subtitles, mode="a", bare_
                subtitle.plex_media_fps = video.fps

    storage = "metadata"
-    if Prefs['subtitles.save.filesystem']:
+    save_to_fs = cast_bool(Prefs['subtitles.save.filesystem'])
+    if save_to_fs:
        storage = "filesystem"
-        try:
-            Log.Debug("Using filesystem as subtitle storage")
-            save_subtitles_to_file(downloaded_subtitles)
-        except OSError:
-            if Prefs["subtitles.save.metadata_fallback"]:
-                meta_fallback = True
+
+    if set_current:
+        if save_to_fs:
+            try:
+                Log.Debug("Using filesystem as subtitle storage")
+                save_subtitles_to_file(downloaded_subtitles)
+            except OSError:
+                if cast_bool(Prefs["subtitles.save.metadata_fallback"]):
+                    meta_fallback = True
+                    storage = "metadata"
+                else:
+                    raise
            else:
-                raise
-        else:
-            save_successful = True
+                save_successful = True

-    if not Prefs['subtitles.save.filesystem'] or meta_fallback:
-        if meta_fallback:
-            Log.Debug("Using metadata as subtitle storage, because filesystem storage failed")
-        else:
-            Log.Debug("Using metadata as subtitle storage")
-        save_successful = save_subtitles_to_metadata(scanned_video_part_map, downloaded_subtitles)
+        if not save_to_fs or meta_fallback:
+            if meta_fallback:
+                Log.Debug("Using metadata as subtitle storage, because filesystem storage failed")
+            else:
+                Log.Debug("Using metadata as subtitle storage")
+            save_successful = save_subtitles_to_metadata(scanned_video_part_map, downloaded_subtitles)

-    if not bare_save and save_successful and config.notify_executable:
-        notify_executable(config.notify_executable, scanned_video_part_map, downloaded_subtitles, storage)
+        if not bare_save and save_successful and config.notify_executable:
+            notify_executable(config.notify_executable, scanned_video_part_map, downloaded_subtitles, storage)

-    if not bare_save and save_successful:
-        store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage, mode=mode)
+    if not bare_save and (save_successful or not set_current):
+        store_subtitle_info(scanned_video_part_map, downloaded_subtitles, storage, mode=mode, set_current=set_current)

    return save_successful

+
+def get_pack_id(subtitle):
+    """
+    Build the pack identifier of a subtitle.
+
+    :param subtitle: object with the attributes provider_name and numeric_id
+    :return: provider name and numeric id, joined by an underscore
+    """
+    provider = subtitle.provider_name
+    numeric_id = subtitle.numeric_id
+    return "%s_%s" % (provider, numeric_id)
+
+
+def get_pack_data(subtitle):
+    """
+    Load the cached archive of a subtitle pack.
+
+    :param subtitle: subtitle the pack belongs to
+    :return: what FileIO.read returns for "<pack id>.archive" inside config.pack_cache_dir;
+             None if that file doesn't exist or couldn't be read
+    """
+    subtitle_id = get_pack_id(subtitle)
+
+    archive = os.path.join(config.pack_cache_dir, subtitle_id + ".archive")
+    if not os.path.isfile(archive):
+        return None
+
+    Log.Info("Loading archive from pack cache: %s", subtitle_id)
+    try:
+        return FileIO.read(archive, 'rb')
+    except Exception:
+        # best effort: an unreadable cache entry is logged and treated as a cache miss.
+        # Exception instead of a bare except, so SystemExit/KeyboardInterrupt aren't swallowed
+        Log.Error("Couldn't load archive from pack cache: %s: %s", subtitle_id, traceback.format_exc())
+        return None
+
+
+def store_pack_data(subtitle, data):
+    """
+    Write the archive of a subtitle pack to the pack cache.
+
+    The archive is written to "<pack id>.archive" inside config.pack_cache_dir. Failures are
+    logged and not raised.
+
+    :param subtitle: subtitle the pack belongs to
+    :param data: archive content, handed to FileIO.write in 'wb' mode
+    """
+    subtitle_id = get_pack_id(subtitle)
+
+    archive = os.path.join(config.pack_cache_dir, subtitle_id + ".archive")
+
+    Log.Info("Storing archive in pack cache: %s", subtitle_id)
+    try:
+        FileIO.write(archive, data, 'wb')
+
+    except Exception:
+        # best effort: the cache is optional, so a failed write is only logged.
+        # Exception instead of a bare except, so SystemExit/KeyboardInterrupt aren't swallowed
+        Log.Error("Couldn't store archive in pack cache: %s: %s", subtitle_id, traceback.format_exc())
@@ -1,9 +1,9 @@
 # coding=utf-8

 import re, os
-import config
 import helpers

+from config import config, SUBTITLE_EXTS, TEXT_SUBTITLE_EXTS
 from bs4 import UnicodeDammit


@@ -90,7 +90,7 @@ ENDSWITH_LANGUAGECODE_RE = re.compile("\.([^-.]{2,3})(?:-[A-Za-z]{2,})?$")


 def match_ietf_language(s):
-    language_match = re.match(".+\.([^\.]+)$" if not helpers.cast_bool(Prefs["subtitles.language.ietf"])
+    language_match = re.match(".+\.([^\.]+)$" if not helpers.cast_bool(Prefs["subtitles.language.ietf_display"])
                              else IETF_MATCH, s)
    if language_match and len(language_match.groups()) == 1:
        language = language_match.groups()[0]
@@ -102,7 +102,7 @@ class DefaultSubtitleHelper(SubtitleHelper):
    @classmethod
    def is_helper_for(cls, filename):
        (file, file_extension) = os.path.splitext(filename)
-        return file_extension.lower()[1:] in config.SUBTITLE_EXTS
+        return file_extension.lower()[1:] in SUBTITLE_EXTS

    def process_subtitles(self, part):

@@ -120,21 +120,29 @@ class DefaultSubtitleHelper(SubtitleHelper):
        forced = ''
        default = ''
        split_tag = file.rsplit('.', 1)
-        if len(split_tag) > 1 and split_tag[1].lower() in ['forced', 'normal', 'default', 'embedded', 'custom']:
+        if len(split_tag) > 1 and split_tag[1].lower() in ['forced', 'normal', 'default', 'embedded', 'embedded-forced',
+                                                           'custom']:
            file = split_tag[0]
+            sub_tag = split_tag[1].lower()
            # don't do anything with 'normal', we don't need it
-            if 'forced' == split_tag[1].lower():
+            if 'forced' in sub_tag:
                forced = '1'
-            if 'default' == split_tag[1].lower():
+            elif 'default' == sub_tag:
                default = '1'

        # Attempt to extract the language from the filename (e.g. Avatar (2009).eng)
        # IETF support thanks to
        # https://github.com/hpsbranco/LocalMedia.bundle/commit/4fad9aefedece78a1fa96401304351347f644369
-        language = Locale.Language.Match(match_ietf_language(file))
+        lang_part = match_ietf_language(file)
+        if lang_part != file:
+            language = Locale.Language.Match(lang_part)
+        elif config.only_one:
+            language = Locale.Language.Match(list(config.lang_list)[0].alpha2)
+        else:
+            language = Locale.Language.Match("xx")

        # skip non-SRT if wanted
-        if not helpers.cast_bool(Prefs["subtitles.scan.exotic_ext"]) and ext not in ["srt", "ass", "ssa", "vtt"]:
+        if not config.exotic_ext and ext not in TEXT_SUBTITLE_EXTS:
            return lang_sub_map

        codec = None
@@ -157,7 +165,8 @@ class DefaultSubtitleHelper(SubtitleHelper):
                Log("An error occurred while attempting to parse the subtitle file, skipping... : " + self.filename)
                return lang_sub_map

-        if codec is None and ext in ['ass', 'ssa', 'smi', 'srt', 'psb', 'vtt']:
+        # fixme: re-add vtt once Plex Inc. fixes this line in LocalMedia.bundle
+        if codec is None and ext in ['ass', 'ssa', 'smi', 'srt', 'psb']:
            codec = ext.replace('ass', 'ssa')

        if format is None:
@@ -1,24 +1,26 @@
 # coding=utf-8
-
+import glob
+import os
 import datetime
-import time
 import operator
 import traceback
+from urllib2 import URLError

 from subliminal_patch.score import compute_score
 from subliminal_patch.core import download_subtitles
-from subliminal import list_subtitles as list_all_subtitles
-from babelfish import Language
+from subliminal import list_subtitles as list_all_subtitles, region as subliminal_cache_region
+from subzero.language import Language
+from subzero.video import refine_video

 from missing_subtitles import items_get_all_missing_subs, refresh_item
 from scheduler import scheduler
 from storage import save_subtitles, get_subtitle_storage
 from support.config import config
-from support.items import get_recent_items, get_item, is_ignored
+from support.items import get_recent_items, get_item, is_ignored, get_item_title
 from support.helpers import track_usage, get_title_for_video_metadata, cast_bool, PartUnknownException
-from support.plex_media import scan_videos, get_plex_metadata
-from download import download_best_subtitles
-
+from support.plex_media import get_plex_metadata
+from support.scanning import scan_videos
+from download import download_best_subtitles, pre_download_hook, post_download_hook, language_hook

 PROVIDER_SLACK = 30
 DL_PROVIDER_SLACK = 30
@@ -97,7 +99,7 @@ class Task(object):

 class SubtitleListingMixin(object):
    def list_subtitles(self, rating_key, item_type, part_id, language, skip_wrong_fps=True, metadata=None,
-                       scanned_parts=None):
+                       scanned_parts=None, air_date_cutoff=None):

        if not metadata:
            metadata = get_plex_metadata(rating_key, part_id, item_type)
@@ -105,18 +107,26 @@ class SubtitleListingMixin(object):
        if not metadata:
            return

+        providers = config.get_providers(media_type="series" if item_type == "episode" else "movies")
        if not scanned_parts:
-            scanned_parts = scan_videos([metadata], kind="series" if item_type == "episode" else "movie", ignore_all=True)
+            scanned_parts = scan_videos([metadata], ignore_all=True, providers=providers)
            if not scanned_parts:
                Log.Error(u"%s: Couldn't list available subtitles for %s", self.name, rating_key)
                return

        video, plex_part = scanned_parts.items()[0]
+        refine_video(video, refiner_settings=config.refiner_settings)
+
+        if air_date_cutoff is not None and metadata["item"].year and \
+            metadata["item"].year + air_date_cutoff < datetime.date.today().year:
+            Log.Debug("Skipping searching for subtitles: %s, it aired over %s year(s) ago.", rating_key,
+                      air_date_cutoff)
+            return
+
        config.init_subliminal_patches()

-        provider_settings = config.provider_settings.copy()
+        provider_settings = config.provider_settings
        if not skip_wrong_fps:
-            provider_settings = config.provider_settings.copy()
            provider_settings["opensubtitles"]["skip_wrong_fps"] = False

        if item_type == "episode":
@@ -126,10 +136,14 @@ class SubtitleListingMixin(object):
        else:
            min_score = 60

-        available_subs = list_all_subtitles(scanned_parts, {Language.fromietf(language)},
-                                            providers=config.providers,
+        languages = {Language.fromietf(language)}
+
+        available_subs = list_all_subtitles([video], languages,
+                                            providers=providers,
                                            provider_configs=provider_settings,
-                                            pool_class=config.provider_pool)
+                                            pool_class=config.provider_pool,
+                                            throttle_callback=config.provider_throttle,
+                                            language_hook=language_hook)

        use_hearing_impaired = Prefs['subtitles.search.hearingImpaired'] in ("prefer", "force HI")

@@ -143,6 +157,11 @@ class SubtitleListingMixin(object):
                Log.Error(u"%s: Match computation failed for %s: %s", self.name, s, traceback.format_exc())
                continue

+            # skip wrong season/episodes
+            if item_type == "episode" and not {"series", "season", "episode"}.issubset(matches):
+                Log.Debug(u"%s: Skipping %s, because it doesn't match our series/episode", self.name, s)
+                continue
+
            unsorted_subtitles.append(
                (s, compute_score(matches, s, video, hearing_impaired=use_hearing_impaired), matches))
        scored_subtitles = sorted(unsorted_subtitles, key=operator.itemgetter(1), reverse=True)
@@ -168,21 +187,36 @@ class DownloadSubtitleMixin(object):
        item_type = subtitle.item_type
        part_id = subtitle.part_id
        metadata = get_plex_metadata(rating_key, part_id, item_type)
-        scanned_parts = scan_videos([metadata], kind="series" if item_type == "episode" else "movie", ignore_all=True)
+        providers = config.get_providers(media_type="series" if item_type == "episode" else "movies")
+        scanned_parts = scan_videos([metadata], ignore_all=True, providers=providers)
        video, plex_part = scanned_parts.items()[0]

+        pre_download_hook(subtitle)
+
        # downloaded_subtitles = {subliminal.Video: [subtitle, subtitle, ...]}
-        download_subtitles([subtitle], providers=config.providers, provider_configs=config.provider_settings,
-                           pool_class=config.provider_pool)
+        download_subtitles([subtitle], providers=providers,
+                           provider_configs=config.provider_settings,
+                           pool_class=config.provider_pool, throttle_callback=config.provider_throttle)
+
+        post_download_hook(subtitle)
+
+        # may be redundant
+        subtitle.pack_data = None
+
        download_successful = False

        if subtitle.content:
            try:
                save_subtitles(scanned_parts, {video: [subtitle]}, mode=mode, mods=config.default_mods)
-                Log.Debug(u"%s: Manually downloaded subtitle for: %s", self.name, rating_key)
+                if mode == "m":
+                    Log.Debug(u"%s: Manually downloaded subtitle for: %s", self.name, rating_key)
+                    track_usage("Subtitle", "manual", "download", 1)
+                elif mode == "b":
+                    Log.Debug(u"%s: Downloaded better subtitle for: %s", self.name, rating_key)
+                    track_usage("Subtitle", "better", "download", 1)
                download_successful = True
                refresh_item(rating_key)
-                track_usage("Subtitle", "manual", "download", 1)
+
            except:
                Log.Error(u"%s: Something went wrong when downloading specific subtitle: %s",
                          self.name, traceback.format_exc())
@@ -197,8 +231,13 @@ class DownloadSubtitleMixin(object):
                    history.add(item_title, video.id, section_title=video.plexapi_metadata["section"],
                                subtitle=subtitle,
                                mode=mode)
+                    history.destroy()
+
+                    # clear missing subtitles menu data
+                    if not scheduler.is_task_running("MissingSubtitles"):
+                        scheduler.clear_task_data("MissingSubtitles")
        else:
-            set_refresh_menu_state(u"%s: Subtitle download failed (%s)", self.name, rating_key)
+            set_refresh_menu_state(u"%s: Subtitle download failed (%s)" % (self.name, rating_key))
        return download_successful


@@ -223,7 +262,12 @@ class AvailableSubsForItem(SubtitleListingMixin, Task):
    def run(self):
        super(AvailableSubsForItem, self).run()
        self.running = True
-        track_usage("Subtitle", "manual", "list", 1)
+        try:
+            track_usage("Subtitle", "manual", "list", 1)
+        except:
+            Log.Error("Something went wrong with track_usage: %s", traceback.format_exc())
+
+        Log.Debug("Listing available subtitles for: %s", self.rating_key)
        subs = self.list_subtitles(self.rating_key, self.item_type, self.part_id, self.language, skip_wrong_fps=False)
        if not subs:
            self.data = "found_none"
@@ -309,6 +353,8 @@ class SearchAllRecentlyAddedMissing(Task):
        now = datetime.datetime.now()
        min_score_series = int(Prefs["subtitles.search.minimumTVScore2"].strip())
        min_score_movies = int(Prefs["subtitles.search.minimumMovieScore2"].strip())
+        series_providers = config.get_providers(media_type="series")
+        movie_providers = config.get_providers(media_type="movies")

        is_recent_str = Prefs["scheduler.item_is_recent_age"]
        num, ident = is_recent_str.split()
@@ -320,24 +366,9 @@ class SearchAllRecentlyAddedMissing(Task):
            max_search_days = int(num) * 7

        subtitle_storage = get_subtitle_storage()
-        recent_sub_fns = subtitle_storage.get_recent_files(age_days=max_search_days)
-        viable_items = {}
+        recent_files = subtitle_storage.get_recent_files(age_days=max_search_days)

-        # determine viable items
-        for fn in recent_sub_fns:
-            # added_date <= max_search_days?
-            stored_subs = subtitle_storage.load(filename=fn)
-            if not stored_subs:
-                continue
-
-            if stored_subs.added_at + datetime.timedelta(days=max_search_days) <= now:
-                continue
-
-            viable_items[fn] = stored_subs
-
-        subtitle_storage.destroy()
-
-        self.items_searching = len(viable_items)
+        self.items_searching = len(recent_files)

        download_count = 0
        videos_with_downloads = 0
@@ -346,98 +377,129 @@ class SearchAllRecentlyAddedMissing(Task):

        Log.Info(u"%s: Searching for subtitles for %s items", self.name, self.items_searching)

-        # search for subtitles in viable items
-        for fn, stored_subs in viable_items.iteritems():
-            video_id = stored_subs.video_id
-
-            if stored_subs.item_type == "episode":
-                min_score = min_score_series
-            else:
-                min_score = min_score_movies
-
-            parts = []
-            plex_item = get_item(video_id)
-
-            if not plex_item:
-                Log.Info(u"%s: Item %s unknown, skipping", self.name, video_id)
-                continue
-
-            if is_ignored(video_id, item=plex_item):
-                continue
-
-            for media in plex_item.media:
-                parts += media.parts
-
-            downloads_per_video = 0
-            hit_providers = False
-            for part in parts:
-                part_id = part.id
-
-                try:
-                    metadata = get_plex_metadata(video_id, part_id, stored_subs.item_type)
-                except PartUnknownException:
-                    Log.Info(u"%s: Part %s:%s unknown, skipping", self.name, video_id, part_id)
-                    continue
-
-                if not metadata:
-                    Log.Info(u"%s: Part %s:%s unknown, skipping", self.name, video_id, part_id)
-                    continue
-
-                Log.Debug(u"%s: Looking for missing subtitles: %s:%s", self.name, video_id, part_id)
-                scanned_parts = scan_videos([metadata], kind="series"
-                                            if stored_subs.item_type == "episode" else "movie")
-
-                downloaded_subtitles = download_best_subtitles(scanned_parts, min_score=min_score)
-                hit_providers = downloaded_subtitles is not None
-                download_successful = False
-
-                if downloaded_subtitles:
-                    downloaded_any = any(downloaded_subtitles.values())
-                    if not downloaded_any:
-                        continue
-
-                    try:
-                        save_subtitles(scanned_parts, downloaded_subtitles, mode="a", mods=config.default_mods)
-                        Log.Debug(u"%s: Downloaded subtitle for item with missing subs: %s", self.name, video_id)
-                        download_successful = True
-                        refresh_item(video_id)
-                        track_usage("Subtitle", "manual", "download", 1)
-                    except:
-                        Log.Error(u"%s: Something went wrong when downloading specific subtitle: %s", self.name,
-                                  traceback.format_exc())
-                    finally:
-                        item_title = get_title_for_video_metadata(metadata, add_section_title=False)
-                        if download_successful:
-                            # store item in history
-                            for video, video_subtitles in downloaded_subtitles.items():
-                                if not video_subtitles:
-                                    continue
-
-                                for subtitle in video_subtitles:
-                                    downloads_per_video += 1
-                                    history.add(item_title, video.id, section_title=metadata["section"],
-                                                subtitle=subtitle,
-                                                mode="a")
-
-                    Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
-                    time.sleep(PROVIDER_SLACK)
-
-            download_count += downloads_per_video
-
-            if downloads_per_video:
-                videos_with_downloads += 1
-
-            self.items_done = self.items_done + 1
+        def skip_item():
+            # a skipped item no longer counts towards the total, so the progress has to be recomputed
+            self.items_searching = self.items_searching - 1
+
+            # when the last remaining item gets skipped the total drops to 0; keep the previous percentage
+            # instead of raising ZeroDivisionError
+            if self.items_searching > 0:
+                self.percentage = int(self.items_done * 100 / self.items_searching)

-            if downloads_per_video:
-                Log.Debug(u"%s: Subtitles have been downloaded, "
-                          u"waiting %s seconds before continuing", self.name, DL_PROVIDER_SLACK)
-                time.sleep(DL_PROVIDER_SLACK)
-            else:
-                if hit_providers:
-                    Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
-                    time.sleep(PROVIDER_SLACK)
+        # search for subtitles in viable items
+        try:
+            for fn in recent_files:
+                stored_subs = subtitle_storage.load(filename=fn)
+                video_id = stored_subs.video_id
+                if not stored_subs:
+                    Log.Debug("Skipping item %s because storage is empty", video_id)
+                    skip_item()
+                    continue
+
+                # added_date <= max_search_days?
+                if stored_subs.added_at + datetime.timedelta(days=max_search_days) <= now:
+                    Log.Debug("Skipping item %s because it's too old", video_id)
+                    skip_item()
+                    continue
+
+                if stored_subs.item_type == "episode":
+                    min_score = min_score_series
+                    providers = series_providers
+                else:
+                    min_score = min_score_movies
+                    providers = movie_providers
+
+                parts = []
+                plex_item = get_item(video_id)
+
+                if not plex_item:
+                    Log.Info(u"%s: Item %s unknown, skipping", self.name, video_id)
+                    skip_item()
+                    continue
+
+                if is_ignored(video_id, item=plex_item):
+                    skip_item()
+                    continue
+
+                for media in plex_item.media:
+                    parts += media.parts
+
+                downloads_per_video = 0
+                hit_providers = False
+                for part in parts:
+                    part_id = part.id
+
+                    try:
+                        metadata = get_plex_metadata(video_id, part_id, stored_subs.item_type)
+                    except PartUnknownException:
+                        Log.Info(u"%s: Part %s:%s unknown, skipping", self.name, video_id, part_id)
+                        continue
+
+                    if not metadata:
+                        Log.Info(u"%s: Part %s:%s unknown, skipping", self.name, video_id, part_id)
+                        continue
+
+                    Log.Debug(u"%s: Looking for missing subtitles: %s", self.name, get_item_title(plex_item))
+                    scanned_parts = scan_videos([metadata], providers=providers)
+
+                    downloaded_subtitles = download_best_subtitles(scanned_parts, min_score=min_score,
+                                                                   providers=providers)
+                    hit_providers = downloaded_subtitles is not None
+                    download_successful = False
+
+                    if downloaded_subtitles:
+                        downloaded_any = any(downloaded_subtitles.values())
+                        if not downloaded_any:
+                            continue
+
+                        try:
+                            save_subtitles(scanned_parts, downloaded_subtitles, mode="a", mods=config.default_mods)
+                            Log.Debug(u"%s: Downloaded subtitle for item with missing subs: %s", self.name, video_id)
+                            download_successful = True
+                            refresh_item(video_id)
+                            track_usage("Subtitle", "manual", "download", 1)
+                        except:
+                            Log.Error(u"%s: Something went wrong when downloading specific subtitle: %s", self.name,
+                                      traceback.format_exc())
+                        finally:
+                            scanned_parts = None
+                            try:
+                                item_title = get_title_for_video_metadata(metadata, add_section_title=False)
+                                if download_successful:
+                                    # store item in history
+                                    for video, video_subtitles in downloaded_subtitles.items():
+                                        if not video_subtitles:
+                                            continue
+
+                                        for subtitle in video_subtitles:
+                                            downloads_per_video += 1
+                                            history.add(item_title, video.id, section_title=metadata["section"],
+                                                        subtitle=subtitle,
+                                                        mode="a")
+
+                                    downloaded_subtitles = None
+                            except:
+                                Log.Error(u"%s: DEBUG HIT: %s", self.name, traceback.format_exc())
+
+                        Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
+                        Thread.Sleep(PROVIDER_SLACK)
+
+                download_count += downloads_per_video
+
+                if downloads_per_video:
+                    videos_with_downloads += 1
+
+                self.items_done = self.items_done + 1
+                self.percentage = int(self.items_done * 100 / self.items_searching)
+
+                stored_subs = None
+
+                if downloads_per_video:
+                    Log.Debug(u"%s: Subtitles have been downloaded, "
+                              u"waiting %s seconds before continuing", self.name, DL_PROVIDER_SLACK)
+                    Thread.Sleep(DL_PROVIDER_SLACK)
+                else:
+                    if hit_providers:
+                        Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
+                        Thread.Sleep(PROVIDER_SLACK)
+        finally:
+            subtitle_storage.destroy()
+            history.destroy()

        if download_count:
            Log.Debug(u"%s: done. Missing subtitles found for %s/%s items (%s subs downloaded)", self.name,
@@ -453,6 +515,103 @@ class SearchAllRecentlyAddedMissing(Task):
        self.items_searching = None


+class LegacySearchAllRecentlyAddedMissing(Task):
+    """
+    Searches for missing subtitles by triggering a refresh for every recently added item with missing
+    subtitles and waiting for the item to be reported back via the "updated_metadata" signal.
+    """
+    periodic = True
+    # NOTE(review): presumably keeps the scheduler from running this task on its own - confirm
+    frequency = "never"
+    items_done = None
+    items_searching = None
+    items_searching_ids = None
+    items_failed = None
+    percentage = 0
+
+    # seconds to wait for an item to be reported as done before its refresh is triggered again
+    stall_time = 30
+
+    def __init__(self):
+        super(LegacySearchAllRecentlyAddedMissing, self).__init__()
+        self.items_done = None
+        self.items_searching = None
+        self.items_searching_ids = None
+        self.items_failed = None
+        self.percentage = 0
+
+    def signal(self, signal_name, *args, **kwargs):
+        """
+        Dispatch a signal to the matching ``signal_<signal_name>`` method.
+
+        :return: the handler's return value, or None if this task has no handler for the signal
+        """
+        # without the default an unknown signal raised AttributeError and the None fallback was unreachable
+        handler = getattr(self, "signal_%s" % signal_name, None)
+        return handler(*args, **kwargs) if handler else None
+
+    def signal_updated_metadata(self, *args, **kwargs):
+        """
+        Mark the item given as first positional argument as done, if it is one of the items being searched.
+
+        :return: True if the item was marked as done, otherwise None
+        """
+        item_id = int(args[0])
+
+        if self.items_searching_ids is not None and item_id in self.items_searching_ids:
+            self.items_done.append(item_id)
+            return True
+
+    def prepare(self, *args, **kwargs):
+        """
+        Collect the recent items with missing subtitles and reset the task's progress state.
+        """
+        self.items_done = []
+        recent_items = get_recent_items()
+        missing = items_get_all_missing_subs(recent_items, sleep_after_request=0.2)
+        # item_id instead of id: the comprehension variable must not shadow the builtin
+        ids = {item_id for added_at, item_id, title, item, missing_languages in missing
+               if not is_ignored(item_id, item=item)}
+        self.items_searching = missing
+        self.items_searching_ids = ids
+        self.items_failed = []
+        self.percentage = 0
+        self.ready_for_display = True
+
+    def run(self):
+        """
+        Trigger a refresh for every collected item and wait until it is reported as done.
+
+        An item that isn't reported within ``stall_time`` seconds gets refreshed again; on its fourth stall
+        it is added to ``items_failed`` and skipped.
+        """
+        super(LegacySearchAllRecentlyAddedMissing, self).run()
+        self.running = True
+        missing_count = len(self.items_searching)
+        items_done_count = 0
+
+        for added_at, item_id, title, item, missing_languages in self.items_searching:
+            Log.Debug(u"Task: %s, triggering refresh for %s (%s)", self.name, title, item_id)
+            try:
+                refresh_item(item_id)
+            except URLError:
+                # timeout
+                pass
+            search_started = datetime.datetime.now()
+            tries = 1
+            while True:
+                # items_done is filled by signal_updated_metadata
+                if item_id in self.items_done:
+                    items_done_count += 1
+                    self.percentage = int(items_done_count * 100 / missing_count)
+                    Log.Debug(u"Task: %s, item %s done (%s%%, %s/%s)", self.name, item_id, self.percentage,
+                              items_done_count, missing_count)
+                    break
+
+                # item considered stalled after self.stall_time seconds passed after last refresh
+                if (datetime.datetime.now() - search_started).total_seconds() > self.stall_time:
+                    if tries > 3:
+                        self.items_failed.append(item_id)
+                        Log.Debug(u"Task: %s, item stalled for %s times: %s, skipping", self.name, tries, item_id)
+                        break
+
+                    Log.Debug(u"Task: %s, item stalled for %s seconds: %s, retrying", self.name, self.stall_time,
+                              item_id)
+                    tries += 1
+                    try:
+                        refresh_item(item_id)
+                    except URLError:
+                        pass
+                    search_started = datetime.datetime.now()
+                    Thread.Sleep(1)
+                Thread.Sleep(0.1)
+            # we can't hammer the PMS, otherwise requests will be stalled
+            Thread.Sleep(5)
+
+        Log.Debug("Task: %s, done (%s%%, %s/%s). Failed items: %s", self.name, self.percentage,
+                  items_done_count, missing_count, self.items_failed)
+
+    def post_run(self, task_data):
+        """
+        Reset the progress state once the task has run.
+        """
+        super(LegacySearchAllRecentlyAddedMissing, self).post_run(task_data)
+        self.ready_for_display = False
+        self.percentage = 0
+        self.items_done = None
+        self.items_failed = None
+        self.items_searching = None
+        self.items_searching_ids = None
+
+
 class FindBetterSubtitles(DownloadSubtitleMixin, SubtitleListingMixin, Task):
    periodic = True

@@ -487,129 +646,146 @@ class FindBetterSubtitles(DownloadSubtitleMixin, SubtitleListingMixin, Task):
        overwrite_manually_selected = cast_bool(
            Prefs["scheduler.tasks.FindBetterSubtitles.overwrite_manually_selected"])

+        air_date_cutoff_pref = Prefs["scheduler.tasks.FindBetterSubtitles.air_date_cutoff"]
+        if air_date_cutoff_pref == "don't limit":
+            air_date_cutoff = None
+        else:
+            air_date_cutoff = int(air_date_cutoff_pref.split()[0])
+
        subtitle_storage = get_subtitle_storage()
-        recent_subs = subtitle_storage.load_recent_files(age_days=max_search_days)
        viable_item_count = 0

-        for fn, stored_subs in recent_subs.iteritems():
-            video_id = stored_subs.video_id
+        try:
+            for fn in subtitle_storage.get_recent_files(age_days=max_search_days):
+                stored_subs = subtitle_storage.load(filename=fn)
+                if not stored_subs:
+                    continue

-            if stored_subs.item_type == "episode":
-                cutoff = self.series_cutoff
-                min_score = min_score_series
-            else:
-                cutoff = self.movies_cutoff
-                min_score = min_score_movies
+                video_id = stored_subs.video_id

-            # don't search for better subtitles until at least 30 minutes have passed
-            if stored_subs.added_at + datetime.timedelta(minutes=30) > now:
-                Log.Debug(u"%s: Item %s too new, skipping", self.name, video_id)
-                continue
+                if stored_subs.item_type == "episode":
+                    cutoff = self.series_cutoff
+                    min_score = min_score_series
+                else:
+                    cutoff = self.movies_cutoff
+                    min_score = min_score_movies

-            # added_date <= max_search_days?
-            if stored_subs.added_at + datetime.timedelta(days=max_search_days) <= now:
-                continue
+                # don't search for better subtitles until at least 30 minutes have passed
+                if stored_subs.added_at + datetime.timedelta(minutes=30) > now:
+                    Log.Debug(u"%s: Item %s too new, skipping", self.name, video_id)
+                    continue

-            viable_item_count += 1
-            ditch_parts = []
+                # added_date <= max_search_days?
+                if stored_subs.added_at + datetime.timedelta(days=max_search_days) <= now:
+                    continue

-            # look through all stored subtitle data
-            for part_id, languages in stored_subs.parts.iteritems():
-                part_id = str(part_id)
+                viable_item_count += 1
+                ditch_parts = []

-                # all languages
-                for language, current_subs in languages.iteritems():
-                    current_key = current_subs.get("current")
-                    current = current_subs.get(current_key)
+                # look through all stored subtitle data
+                for part_id, languages in stored_subs.parts.iteritems():
+                    part_id = str(part_id)

-                    # currently got subtitle?
-                    if not current:
-                        continue
-                    current_score = current.score
-                    current_mode = current.mode
+                    # all languages
+                    for language, current_subs in languages.iteritems():
+                        current_key = current_subs.get("current")
+                        current = current_subs.get(current_key)

-                    # late cutoff met? skip
-                    if current_score >= cutoff:
-                        Log.Debug(u"%s: Skipping finding better subs, "
-                                  u"cutoff met (current: %s, cutoff: %s): %s (%s)",
-                                  self.name, current_score, cutoff, stored_subs.title, video_id)
-                        continue
+                        # currently got subtitle?
+                        # fixme: check for existence
+                        if not current:
+                            continue
+                        current_score = current.score
+                        current_mode = current.mode

-                    # got manual subtitle but don't want to touch those?
-                    if current_mode == "m" and not overwrite_manually_selected:
-                        Log.Debug(u"%s: Skipping finding better subs, "
-                                  u"had manual: %s (%s)", self.name, stored_subs.title, video_id)
-                        continue
+                        # late cutoff met? skip
+                        if current_score >= cutoff:
+                            Log.Debug(u"%s: Skipping finding better subs, "
+                                      u"cutoff met (current: %s, cutoff: %s): %s (%s)",
+                                      self.name, current_score, cutoff, stored_subs.title, video_id)
+                            continue

-                    # subtitle modifications different from default
-                    if not overwrite_manually_modified and current.mods \
-                            and set(current.mods).difference(set(config.default_mods)):
-                        Log.Debug(u"%s: Skipping finding better subs, it has manual modifications: %s (%s)",
-                                  self.name, stored_subs.title, video_id)
-                        continue
+                        # got manual subtitle but don't want to touch those?
+                        if current_mode == "m" and not overwrite_manually_selected:
+                            Log.Debug(u"%s: Skipping finding better subs, "
+                                      u"had manual: %s (%s)", self.name, stored_subs.title, video_id)
+                            continue

-                    try:
-                        subs = self.list_subtitles(video_id, stored_subs.item_type, part_id, language)
-                    except PartUnknownException:
-                        Log.Info(u"%s: Part %s unknown/gone; ditching subtitle info", self.name, part_id)
-                        ditch_parts.append(part_id)
-                        continue
+                        # subtitle modifications different from default
+                        if not overwrite_manually_modified and current.mods \
+                                and set(current.mods).difference(set(config.default_mods)):
+                            Log.Debug(u"%s: Skipping finding better subs, it has manual modifications: %s (%s)",
+                                      self.name, stored_subs.title, video_id)
+                            continue

-                    hit_providers = subs is not None
+                        try:
+                            subs = self.list_subtitles(video_id, stored_subs.item_type, part_id, language,
+                                                       air_date_cutoff=air_date_cutoff)
+                        except PartUnknownException:
+                            Log.Info(u"%s: Part %s unknown/gone; ditching subtitle info", self.name, part_id)
+                            ditch_parts.append(part_id)
+                            continue

-                    if subs:
-                        # subs are already sorted by score
-                        better_downloaded = False
-                        better_tried_download = 0
-                        better_visited = 0
-                        for sub in subs:
-                            if sub.score > current_score and sub.score > min_score:
-                                Log.Debug(u"%s: Better subtitle found for %s, downloading", self.name, video_id)
-                                better_tried_download += 1
-                                ret = self.download_subtitle(sub, video_id, mode="b")
-                                if ret:
-                                    better_found += 1
-                                    better_downloaded = True
-                                    break
-                                else:
-                                    Log.Debug(u"%s: Couldn't download/save subtitle. "
-                                              u"Continuing to the next one", self.name)
-                                    Log.Debug(u"%s: Waiting %s seconds before continuing",
-                                              self.name, DL_PROVIDER_SLACK)
-                                    time.sleep(DL_PROVIDER_SLACK)
-                            better_visited += 1
+                        hit_providers = subs is not None

-                        if better_tried_download and not better_downloaded:
-                            Log.Debug(u"%s: Tried downloading better subtitle for %s, "
-                                      u"but every try failed.", self.name, video_id)
+                        if subs:
+                            # subs are already sorted by score
+                            better_downloaded = False
+                            better_tried_download = 0
+                            better_visited = 0
+                            for sub in subs:
+                                if sub.score > current_score and sub.score > min_score:
+                                    Log.Debug(u"%s: Better subtitle found for %s, downloading", self.name, video_id)
+                                    better_tried_download += 1
+                                    ret = self.download_subtitle(sub, video_id, mode="b")
+                                    if ret:
+                                        better_found += 1
+                                        better_downloaded = True
+                                        break
+                                    else:
+                                        Log.Debug(u"%s: Couldn't download/save subtitle. "
+                                                  u"Continuing to the next one", self.name)
+                                        Log.Debug(u"%s: Waiting %s seconds before continuing",
+                                                  self.name, DL_PROVIDER_SLACK)
+                                        Thread.Sleep(DL_PROVIDER_SLACK)
+                                better_visited += 1

-                        elif better_downloaded:
-                            Log.Debug(u"%s: Better subtitle downloaded for %s", self.name, video_id)
+                            if better_tried_download and not better_downloaded:
+                                Log.Debug(u"%s: Tried downloading better subtitle for %s, "
+                                          u"but every try failed.", self.name, video_id)

-                        if better_tried_download or better_downloaded:
-                            Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, DL_PROVIDER_SLACK)
-                            time.sleep(DL_PROVIDER_SLACK)
+                            elif better_downloaded:
+                                Log.Debug(u"%s: Better subtitle downloaded for %s", self.name, video_id)

-                        elif better_visited:
+                            if better_tried_download or better_downloaded:
+                                Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, DL_PROVIDER_SLACK)
+                                Thread.Sleep(DL_PROVIDER_SLACK)
+
+                            elif better_visited:
+                                Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
+                                Thread.Sleep(PROVIDER_SLACK)
+
+                            subs = None
+
+                        elif hit_providers:
+                            # hit the providers but didn't try downloading? wait.
                            Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
-                            time.sleep(PROVIDER_SLACK)
+                            Thread.Sleep(PROVIDER_SLACK)

-                    elif hit_providers:
-                        # hit the providers but didn't try downloading? wait.
-                        Log.Debug(u"%s: Waiting %s seconds before continuing", self.name, PROVIDER_SLACK)
-                        time.sleep(PROVIDER_SLACK)
+                if ditch_parts:
+                    for part_id in ditch_parts:
+                        try:
+                            del stored_subs.parts[part_id]
+                        except KeyError:
+                            pass
+                    subtitle_storage.save(stored_subs)
+                    ditch_parts = None

-            if ditch_parts:
-                for part_id in ditch_parts:
-                    try:
-                        del stored_subs.parts[part_id]
-                    except KeyError:
-                        pass
-                subtitle_storage.save(stored_subs)
+                stored_subs = None

-            time.sleep(1)
-
-        subtitle_storage.destroy()
+                Thread.Sleep(1)
+        finally:
+            subtitle_storage.destroy()

        if better_found:
            Log.Debug(u"%s: done. Better subtitles found for %s/%s items", self.name, better_found,
@@ -674,6 +850,38 @@ class MigrateSubtitleStorage(Task):
        storage.destroy()


+class CacheMaintenance(Task):
+    """Daily task deleting expired main-cache and pack-archive cache files."""
+    periodic = True
+    frequency = "every 1 days"
+    main_cache_validity = 14  # days
+    pack_cache_validity = 4  # days
+
+    def run(self):
+        super(CacheMaintenance, self).run()
+        self.running = True
+        Log.Info(u"%s: Running cache maintenance", self.name)
+        now = datetime.datetime.now()
+
+        def remove_expired(path, expiry):  # expiry is given in days
+            try:  # stat inside the try: a file may vanish between listing and stat
+                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path))
+                if mtime + datetime.timedelta(days=expiry) < now:
+                    os.remove(path)
+            except (IOError, OSError):
+                Log.Debug("Couldn't remove cache file: %s", os.path.basename(path))
+
+        # main cache
+        if config.new_style_cache:
+            for fn in subliminal_cache_region.backend.all_filenames:
+                remove_expired(fn, self.main_cache_validity)
+
+        # archive cache
+        for fn in glob.iglob(os.path.join(config.pack_cache_dir, "*.archive")):
+            remove_expired(fn, self.pack_cache_validity)
+
+
+scheduler.register(LegacySearchAllRecentlyAddedMissing)
 scheduler.register(SearchAllRecentlyAddedMissing)
 scheduler.register(AvailableSubsForItem)
 scheduler.register(DownloadSubtitleForItem)
@@ -682,3 +890,4 @@ scheduler.register(FindBetterSubtitles)
 scheduler.register(SubtitleStorageMaintenance)
 scheduler.register(MigrateSubtitleStorage)
 scheduler.register(MenuHistoryMaintenance)
+scheduler.register(CacheMaintenance)
@@ -1,6 +1,6 @@
 [
  {
-    "id": "langPref1",
+    "id": "langPref1a",
    "label": "Subtitle Language (1)",
    "type": "enum",
    "values": [
@@ -55,7 +55,7 @@
    "default": "en"
  },
  {
-    "id": "langPref2",
+    "id": "langPref2a",
    "label": "Subtitle Language (2)",
    "type": "enum",
    "values": [
@@ -111,7 +111,7 @@
    "default": "None"
  },
  {
-    "id": "langPref3",
+    "id": "langPref3a",
    "label": "Subtitle Language (3)",
    "type": "enum",
    "values": [
@@ -179,11 +179,17 @@
    "default": "false"
  },
  {
-    "id": "subtitles.language.ietf",
-    "label": "Treat IETF language tags as ISO 639-1 (e.g. pt-BR = pt)",
+    "id": "subtitles.language.ietf_display",
+    "label": "Display languages with country attribute as ISO 639-1 (e.g. pt-BR = pt)",
    "type": "bool",
    "default": "true"
  },
+  {
+    "id": "subtitles.language.ietf_normalize",
+    "label": "Treat languages with country attribute as ISO 639-1 (e.g. don't download pt-BR if pt subtitle exists)",
+    "type": "bool",
+    "default": "false"
+  },
  {
    "id": "subtitles.only_one",
    "label": "Restrict to one language (skips adding \".lang.\" to the subtitle filename; only uses \"Subtitle Language (1)\")",
@@ -196,6 +202,50 @@
    "type": "bool",
    "default": "true"
  },
+  {
+    "id": "media_rename1",
+    "label": "I rename my files using",
+    "type": "enum",
+    "values": [
+      "Sonarr/Radarr (fill api info below)",
+      "Filebot",
+      "Sonarr/Radarr/Filebot",
+      "Symlink to original file",
+      "I keep the original filenames",
+      "none of the above"
+    ],
+    "default": "I keep the original filenames"
+  },
+  {
+    "id": "use_file_info_file",
+    "label": "Retrieve original filename from .file_info/file_info index files (see wiki)",
+    "type": "bool",
+    "default": "false"
+  },
+  {
+    "id": "drone_api.sonarr.url",
+    "label": "Sonarr URL (add URL base if configured)",
+    "type": "text",
+    "default": "http://127.0.0.1:8989"
+  },
+  {
+    "id": "drone_api.sonarr.api_key",
+    "label": "Sonarr API key",
+    "type": "text",
+    "default": ""
+  },
+  {
+    "id": "drone_api.radarr.url",
+    "label": "Radarr URL (add URL base if configured, min. version: 0.2.0.897)",
+    "type": "text",
+    "default": "http://127.0.0.1:7878"
+  },
+  {
+    "id": "drone_api.radarr.api_key",
+    "label": "Radarr API key",
+    "type": "text",
+    "default": ""
+  },
  {
    "id": "provider.opensubtitles.enabled",
    "label": "Provider: Enable OpenSubtitles",
@@ -204,7 +254,7 @@
  },
  {
    "id": "provider.opensubtitles.username",
-    "label": "Opensubtitles Username (VIP)",
+    "label": "Opensubtitles Username",
    "type": "text",
    "default": ""
  },
@@ -216,12 +266,24 @@
    "default": "",
    "secure": "true"
  },
+  {
+    "id": "provider.opensubtitles.is_vip",
+    "label": "OpenSubtitles VIP? (ad-free subs, 1000 subs/day, no-cache VIP server: http://v.ht/osvip)",
+    "type": "bool",
+    "default": "false"
+  },
  {
    "id": "provider.podnapisi.enabled",
    "label": "Provider: Enable Podnapisi.NET",
    "type": "bool",
    "default": "true"
  },
+  {
+    "id": "provider.titlovi.enabled",
+    "label": "Provider: Enable Titlovi.com",
+    "type": "bool",
+    "default": "true"
+  },
  {
    "id": "provider.addic7ed.enabled",
    "label": "Provider: Enable Addic7ed",
@@ -243,7 +305,7 @@
    "secure": "true"
  },
  {
-    "id": "provider.addic7ed.boost_by1",
+    "id": "provider.addic7ed.boost_by2",
    "label": "Addic7ed: boost score (if requirements met)",
    "type": "enum",
    "values": [
@@ -266,12 +328,13 @@
      "25",
      "21",
      "20",
+      "19",
      "15",
      "10",
      "5",
      "0"
    ],
-    "default": "21"
+    "default": "19"
  },
  {
    "id": "provider.addic7ed.use_random_agents",
@@ -312,31 +375,11 @@
    "default": "false"
  },
  {
-    "id": "provider.shooter.enabled",
-    "label": "Provider: Enable Shooter.cn (Chinese)",
+    "id": "provider.subscene.enabled",
+    "label": "Provider: Enable SubScene",
    "type": "bool",
    "default": "false"
  },
-  {
-    "id": "provider.subscenter.enabled",
-    "label": "Provider: Enable SubsCenter (Hebrew)",
-    "type": "bool",
-    "default": "false"
-  },
-  {
-    "id": "provider.subscenter.username",
-    "label": "SubsCenter Username",
-    "type": "text",
-    "default": ""
-  },
-  {
-    "id": "provider.subscenter.password",
-    "label": "SubsCenter Password",
-    "type": "text",
-    "option": "hidden",
-    "default": "",
-    "secure": "true"
-  },
  {
    "id": "providers.multithreading",
    "label": "Search enabled providers simultaneously (multithreading)",
@@ -344,32 +387,26 @@
    "default": "true"
  },
  {
-    "id": "provider.opensubtitles.use_tags",
-    "label": "I keep the exact (release-) filename of my media files",
+    "id": "subtitles.embedded.autoextract",
+    "label": "Automatically extract and use embedded subtitles upon media addition (with configured default mods)",
    "type": "bool",
-    "default": "true"
+    "default": "false"
  },
  {
    "id": "subtitles.scan.embedded",
-    "label": "Scan: include embedded subtitles (in the media file (MKV/MP4), don't download if existing)",
+    "label": "Don't search for subtitles of a language if there are embedded subtitles inside the media file (MKV/MP4)?",
    "type": "bool",
    "default": "false"
  },
  {
    "id": "subtitles.scan.external",
-    "label": "Scan: include external subtitles (metadata/filesystem, don't download if existing)",
+    "label": "Don't search for subtitles of a language if they already exist on the filesystem (metadata/filesystem)?",
    "type": "bool",
    "default": "true"
  },
-  {
-    "id": "subtitles.scan.exotic_ext",
-    "label": "Scan: include \"exotic\" subtitle formats (anything else than .srt/.ssa/.ass/.vtt; embedded or external)",
-    "type": "bool",
-    "default": "false"
-  },
  {
    "id": "subtitles.scan.filename_strictness",
-    "label": "Scan: which external subtitles should be picked up?",
+    "label": "How strictly should subtitles existing on the filesystem be detected?",
    "type": "enum",
    "values": [
      "exact: media filename match",
@@ -378,6 +415,12 @@
    ],
    "default": "loose: filename contains media filename"
  },
+  {
+    "id": "subtitles.scan.exotic_ext",
+    "label": "Include non-text subtitle formats (anything other than .srt/.ssa/.ass/.vtt; embedded or external) in the above?",
+    "type": "bool",
+    "default": "false"
+  },
  {
    "id": "subtitles.search.minimumTVScore2",
    "label": "Minimum score for TV (min: 240, def/sane: 337, min-ideal: 352; see http://v.ht/szscores)",
@@ -408,6 +451,12 @@
    "type": "bool",
    "default": "false"
  },
+  {
+    "id": "subtitles.remove_tags",
+    "label": "Remove style tags from downloaded subtitles (bold, italic, underline, colors, ...)",
+    "type": "bool",
+    "default": "false"
+  },
  {
    "id": "subtitles.fix_common",
    "label": "Fix common whitespace/punctuation issues in subtitles",
@@ -518,8 +567,6 @@
    "type": "enum",
    "values": [
      "never",
-      "every 1 hours",
-      "every 3 hours",
      "every 6 hours",
      "every 12 hours",
      "every 24 hours"
@@ -540,7 +587,8 @@
      "3 weeks",
      "4 weeks",
      "5 weeks",
-      "6 weeks"
+      "6 weeks",
+      "12 weeks"
    ],
    "default": "2 weeks"
  },
@@ -568,11 +616,30 @@
    "type": "text",
    "default": "7"
  },
+  {
+    "id": "scheduler.tasks.FindBetterSubtitles.air_date_cutoff",
+    "label": "Scheduler: Don't search for better subtitles if the item's air date is older than",
+    "type": "enum",
+    "values": [
+      "don't limit",
+      "1 year",
+      "2 years",
+      "3 years",
+      "4 years",
+      "5 years",
+      "6 years",
+      "7 years",
+      "8 years",
+      "9 years",
+      "10 years"
+    ],
+    "default": "1 year"
+  },
  {
    "id": "scheduler.tasks.FindBetterSubtitles.overwrite_manually_selected",
    "label": "Scheduler: Overwrite manually selected subtitles when better found",
    "type": "bool",
-    "default": "false"
+    "default": "true"
  },
  {
    "id": "scheduler.tasks.FindBetterSubtitles.overwrite_manually_modified",
@@ -665,6 +732,30 @@
    "type": "bool",
    "default": "true"
  },
+  {
+    "id": "new_style_cache",
+    "label": "Use new style caching (for subliminal)",
+    "type": "bool",
+    "default": "true"
+  },
+  {
+    "id": "low_impact_mode",
+    "label": "Low impact mode (for remote filesystems)",
+    "type": "bool",
+    "default": "false"
+  },
+  {
+    "id": "pms_request_timeout",
+    "label": "Timeout for API requests sent to the PMS",
+    "type": "text",
+    "default": "15"
+  },
+  {
+    "id": "proxy",
+    "label": "HTTP proxy to use for providers (supports credentials)",
+    "type": "text",
+    "default": ""
+  },
  {
    "id": "log_level",
    "label": "How verbose should the logging be?",
@@ -678,6 +769,12 @@
    ],
    "default": "WARNING"
  },
+  {
+    "id": "log_rotate_keep",
+    "label": "How many log backups to keep?",
+    "type": "text",
+    "default": "5"
+  },
  {
    "id": "log_debug_mods",
    "label": "Log subtitle modification (debug)",
@@ -9,11 +9,11 @@
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundleShortVersionString</key>
-        <string>2.0.25</string>
+        <string>2.5.0</string>
        <key>CFBundleSignature</key>
        <string>????</string>
        <key>CFBundleVersion</key>
-        <string>2.0.25.1635</string>
+        <string>2.5.0.2241</string>
        <key>PlexFrameworkVersion</key>
        <string>2</string>
        <key>PlexPluginClass</key>
@@ -32,7 +32,7 @@

 &lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;

-Version 2.0.25.1635
+Version 2.5.0.2241

 Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;

@@ -4,7 +4,6 @@
 # Use of this source code is governed by the 3-clause BSD license
 # that can be found in the LICENSE file.
 #
-from __future__ import unicode_literals
 from collections import namedtuple
 from functools import partial
 from pkg_resources import resource_stream  # @UnresolvedImport
@@ -4,7 +4,6 @@
 # Use of this source code is governed by the 3-clause BSD license
 # that can be found in the LICENSE file.
 #
-from __future__ import unicode_literals
 from collections import namedtuple
 from functools import partial
 from pkg_resources import resource_stream  # @UnresolvedImport
@@ -4,7 +4,6 @@
 # Use of this source code is governed by the 3-clause BSD license
 # that can be found in the LICENSE file.
 #
-from __future__ import unicode_literals
 from collections import namedtuple
 from pkg_resources import resource_stream  # @UnresolvedImport
 from . import basestr
@@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/

 Beautiful Soup uses a pluggable XML or HTML parser to parse a
 (possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.

-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
 """

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.4.1"
-__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__version__ = "4.6.0"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
+import traceback
 import warnings

 from .builder import builder_registry, ParserRejectedMarkup
@@ -77,7 +82,7 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
@@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
@@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
+
+                caller = traceback.extract_stack()[0]
+                filename = caller[0]
+                line_number = caller[1]
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    filename=filename,
+                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
+        self.known_xml = self.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
-        elif len(markup) <= 256:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
@@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
                if isinstance(markup, unicode):
                    markup = markup.encode("utf8")
                warnings.warn(
-                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-            if markup[:5] == "http:" or markup[:6] == "https:":
-                # TODO: This is ugly but I couldn't get it to work in
-                # Python 3 otherwise.
-                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
-                    if isinstance(markup, unicode):
-                        markup = markup.encode("utf8")
-                    warnings.warn(
-                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                    '"%s" looks like a filename, not markup. You should'
+                    ' probably open this file and pass the filehandle into'
+                    ' Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
@@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
        self.builder.soup = None

    def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy

    def __getstate__(self):
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
-            del d['builder']
+            d['builder'] = None
        return d

+    @staticmethod
+    def _check_markup_is_url(markup):
+        """ 
+        Check if markup looks like it's actually a URL and issue a warning
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()
@@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
-            index = parent.contents.index(o)
+            index = len(parent.contents)-1
+            while index >= 0:
+                if parent.contents[index] is o:
+                    break
+                index -= 1
+            else:
+                raise ValueError(
+                    "Error building tree: supposedly %r was inserted "
+                    "into %r after the fact, but I don't see it!" % (
+                        o, parent
+                    )
+                )
            if index == 0:
                previous_element = parent
                previous_sibling = None
@@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
+        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
@@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
+    HTMLAwareEntitySubstitution,
    whitespace_re
    )

@@ -227,9 +231,14 @@ class HTMLTreeBuilder(TreeBuilder):
    Such as which tags are empty-element tags.
    """

-    preserve_whitespace_tags = set(['pre', 'textarea'])
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                              'spacer', 'link', 'frame', 'base'])
+    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+        # These are from HTML4, removed in HTML5.
+        'spacer', 'frame'
+    ])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
@@ -1,9 +1,12 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
    'HTML5TreeBuilder',
    ]

-from pdb import set_trace
 import warnings
+import re
 from bs4.builder import (
    PERMISSIVE,
    HTML,
@@ -15,7 +18,10 @@ from bs4.element import (
    whitespace_re,
 )
 import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+    )
 from bs4.element import (
    Comment,
    Doctype,
@@ -23,6 +29,15 @@ from bs4.element import (
    Tag,
    )

+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
 class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

@@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
@@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
@@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        return u'<html><head></head><body>%s</body></html>' % fragment


-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
+    def __init__(self, namespaceHTMLElements, soup=None):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
@@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

@@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
        return self.soup

    def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in element.attrs.items():
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)

 class AttrList(object):
    def __init__(self, element):
@@ -137,9 +220,9 @@ class AttrList(object):
        return name in list(self.attrs.keys())


-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
    def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace
@@ -158,8 +241,10 @@ class Element(html5lib.treebuilders._base.Node):
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
+            node.parent = self
        else:
            child = node.element
+            node.parent = self

        if not isinstance(child, basestring) and child.parent is not None:
            node.element.extract()
@@ -197,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node):
                most_recent_element=most_recent_element)

    def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
@@ -224,11 +311,11 @@ class Element(html5lib.treebuilders._base.Node):
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
-            self.insertBefore(data, insertBefore)
+            self.insertBefore(text, insertBefore)
        else:
-            self.appendChild(data)
+            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
@@ -250,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node):
        # print "MOVE", self.element.contents
        # print "FROM", self.element
        # print "TO", new_parent.element
+
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@@ -268,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
-        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
@@ -285,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node):
            if new_parents_last_child:
                new_parents_last_child.next_sibling = first_child

-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element:
-                new_parents_last_descendant_next_element.previous_element = last_child
-            last_child.next_sibling = None
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
@@ -324,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node):

 class TextNode(Element):
    def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

@@ -1,5 +1,8 @@
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
    'HTMLParserTreeBuilder',
    ]
@@ -49,7 +52,31 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'

 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+
+    def __init__(self, *args, **kwargs):
+        HTMLParser.__init__(self, *args, **kwargs)
+
+        # Keep a list of empty-element tags that were encountered
+        # without an explicit closing tag. If we encounter a closing tag
+        # of this type, we'll associate it with one of those entries.
+        #
+        # This isn't a stack because we don't care about the
+        # order. It's a list of closing tags we've already handled and
+        # will ignore, assuming they ever show up.
+        self.already_closed_empty_element = []
+    
+    def handle_startendtag(self, name, attrs):
+        # This is only called when the markup looks like
+        # <tag/>.
+
+        # handle_empty_element=False tells handle_starttag not to close the tag
+        # just because its name matches a known empty-element tag. We
+        # know that this is an empty-element tag and we want to call
+        # handle_endtag ourselves.
+        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+        self.handle_endtag(name)
+        
+    def handle_starttag(self, name, attrs, handle_empty_element=True):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
@@ -59,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
                value = ''
            attr_dict[key] = value
            attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
+        #print "START", name
+        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        if tag and tag.is_empty_element and handle_empty_element:
+            # Unlike other parsers, html.parser doesn't send separate end tag
+            # events for empty-element tags. (It's handled in
+            # handle_startendtag, but only if the original markup looked like
+            # <tag/>.)
+            #
+            # So we need to call handle_endtag() ourselves. Since we
+            # know the start event is identical to the end event, we
+            # don't want handle_endtag() to cross off any previous end
+            # events for tags of this name.
+            self.handle_endtag(name, check_already_closed=False)

-    def handle_endtag(self, name):
-        self.soup.handle_endtag(name)
+            # But we might encounter an explicit closing tag for this tag
+            # later on. If so, we want to ignore it.
+            self.already_closed_empty_element.append(name)
+            
+    def handle_endtag(self, name, check_already_closed=True):
+        #print "END", name
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            # print "ALREADY CLOSED", name
+            self.already_closed_empty_element.remove(name)
+        else:
+            self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)
@@ -166,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
+        parser.already_closed_empty_element = []

 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
@@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
@@ -12,6 +14,7 @@ from bs4.element import (
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
    FAST,
@@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
+    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]
@@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):

        Each 4-tuple represents a strategy for parsing the document.
        """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
@@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
@@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    def pi(self, target, data):
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
-        self.soup.endData(ProcessingInstruction)
+        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        self.soup.handle_data(content)
@@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
+    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        return etree.HTMLParser
@@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

-from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re
@@ -309,7 +310,7 @@ class EncodingDetector:
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))
-            
+
        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
@@ -346,7 +347,7 @@ class UnicodeDammit:
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
-
+        self.log = logging.getLogger(__name__)
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

@@ -376,9 +377,10 @@ class UnicodeDammit:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
-                    logging.warning(
+                    self.log.warning(
                            "Some characters could not be decoded, and were "
-                            "replaced with REPLACEMENT CHARACTER.")
+                            "replaced with REPLACEMENT CHARACTER."
+                    )
                    self.contains_replacement_characters = True
                    break

@@ -734,7 +736,7 @@ class UnicodeDammit:
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
-        0xe1 : b'\xa1',     # á
+        0xe1 : b'\xa1',         # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
@@ -1,5 +1,7 @@
 """Diagnostic functions, mainly for use when doing tech support."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

 import cProfile
@@ -56,7 +58,8 @@ def diagnose(data):
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
-        data = open(data).read()
+        with open(data) as fp:
+            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

-from pdb import set_trace
 import collections
 import re
+import shlex
 import sys
 import warnings
 from bs4.dammit import EntitySubstitution
@@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):

    preformatted_tags = set(["pre"])

+    preserve_whitespace_tags = set(['pre', 'textarea'])
+
    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        if (isinstance(ns, NavigableString)
@@ -127,8 +131,8 @@ class PageElement(object):
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output.
-    # "minimal" - Bare ampersands and angle brackets are converted to
+    #   are converted to those entities on output.
+    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
@@ -169,11 +173,19 @@ class PageElement(object):

        This is used when mapping a formatter name ("minimal") to an
        appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It's
+        the contents of <script> and <style> tags, or not). It can be
        inefficient, but it should be called very rarely.
        """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
        if self.parent is None:
-            # This is the top-level object. It should have .is_xml set
+            # This is the top-level object. It should have .known_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
@@ -523,9 +535,16 @@ class PageElement(object):
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix.
+                    prefix, name = name.split(':', 1)
+                else:
+                    prefix = None
                result = (element for element in generator
                          if isinstance(element, Tag)
-                            and element.name == name)
+                            and element.name == name
+                          and (prefix is None or element.prefix == prefix)
+                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        while True:
@@ -637,7 +656,7 @@ class PageElement(object):
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
-            # string represenation of `attribute` ends with `value`
+            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
@@ -677,6 +696,11 @@ class NavigableString(unicode, PageElement):
    PREFIX = ''
    SUFFIX = ''

+    # We can't tell just by looking at a string whether it's contained
+    # in an XML document or an HTML document.
+
+    known_xml = None
+
    def __new__(cls, value):
        """Create a new NavigableString.

@@ -743,10 +767,16 @@ class CData(PreformattedString):
    SUFFIX = u']]>'

 class ProcessingInstruction(PreformattedString):
+    """An SGML processing instruction."""

    PREFIX = u'<?'
    SUFFIX = u'>'

+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 class Comment(PreformattedString):

    PREFIX = u'<!--'
@@ -781,7 +811,8 @@ class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, parser=None, builder=None, name=None, namespace=None,
-                 prefix=None, attrs=None, parent=None, previous=None):
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None):
        "Basic constructor."

        if parser is None:
@@ -795,6 +826,14 @@ class Tag(PageElement):
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
+        if builder is not None:
+            preserve_whitespace_tags = builder.preserve_whitespace_tags
+        else:
+            if is_xml:
+                preserve_whitespace_tags = []
+            else:
+                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+        self.preserve_whitespace_tags = preserve_whitespace_tags
        if attrs is None:
            attrs = {}
        elif attrs:
@@ -805,6 +844,13 @@ class Tag(PageElement):
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)
+
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+        if builder:
+            self.known_xml = builder.is_xml
+        else:
+            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
@@ -824,7 +870,7 @@ class Tag(PageElement):
        Its contents are a copy of the old Tag's contents.
        """
        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs)
+                           self.prefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@@ -946,6 +992,13 @@ class Tag(PageElement):
        attribute."""
        return self.attrs.get(key, default)

+    def get_attribute_list(self, key, default=None):
+        """The same as get(), but always returns a list."""
+        value = self.get(key, default)
+        if not isinstance(value, list):
+            value = [value]
+        return value
+    
    def has_attr(self, key):
        return key in self.attrs

@@ -997,7 +1050,7 @@ class Tag(PageElement):
                    tag_name, tag_name))
            return self.find(tag_name)
        # We special case contents to avoid recursion.
-        elif not tag.startswith("__") and not tag=="contents":
+        elif not tag.startswith("__") and not tag == "contents":
            return self.find(tag)
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, tag))
@@ -1057,10 +1110,11 @@ class Tag(PageElement):

    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?"""
+
        return (
-            indent_level is not None and
-            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
-             or self._is_xml))
+            indent_level is not None
+            and self.name not in self.preserve_whitespace_tags
+        )

    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
@@ -1280,6 +1334,7 @@ class Tag(PageElement):

    _selector_combinators = ['>', '+', '~']
    _select_debug = False
+    quoted_colon = re.compile('"[^"]*:[^"]*"')
    def select_one(self, selector):
        """Perform a CSS selection operation on the current element."""
        value = self.select(selector, limit=1)
@@ -1305,8 +1360,7 @@ class Tag(PageElement):
                if limit and len(context) >= limit:
                    break
            return context
-
-        tokens = selector.split()
+        tokens = shlex.split(selector)
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
@@ -1358,7 +1412,7 @@ class Tag(PageElement):
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match

-            elif ':' in token:
+            elif ':' in token and not self.quoted_colon.search(token):
                # Pseudo-class
                tag_name, pseudo = token.split(':', 1)
                if tag_name == '':
@@ -1389,11 +1443,8 @@ class Tag(PageElement):
                            self.count += 1
                            if self.count == self.destination:
                                return True
-                            if self.count > self.destination:
-                                # Stop the generator that's sending us
-                                # these things.
-                                raise StopIteration()
-                            return False
+                            else:
+                                return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
@@ -1498,13 +1549,12 @@ class Tag(PageElement):
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
-                            if limit and len(new_context) >= limit:
-                                break
                    elif self._select_debug:
                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))

-
            current_context = new_context
+        if limit and len(current_context) >= limit:
+            current_context = current_context[:limit]

        if self._select_debug:
            print "Final verdict:"
@@ -1662,28 +1712,22 @@ class SoupStrainer(object):
                "I don't know how to match against a %s" % markup.__class__)
        return found

-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
        # print u"Matching %s against %s" % (markup, match_against)
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
-            if (isinstance(match_against, unicode)
-                and ' ' in match_against):
-                # A bit of a special case. If they try to match "foo
-                # bar" on a multivalue attribute's value, only accept
-                # the literal value "foo bar"
-                #
-                # XXX This is going to be pretty slow because we keep
-                # splitting match_against. But it shouldn't come up
-                # too often.
-                return (whitespace_re.split(match_against) == markup)
-            else:
-                for item in markup:
-                    if self._matches(item, match_against):
-                        return True
-                return False
-
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
+            return False
+        
        if match_against is True:
            # True matches any non-None value.
            return markup is not None
@@ -1693,6 +1737,7 @@ class SoupStrainer(object):

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
+        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

@@ -1703,18 +1748,51 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

-        if isinstance(match_against, unicode):
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, basestring)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
+        
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+        
+        if not match and isinstance(match_against, unicode):
            # Exact string match
-            return markup == match_against
+            match = markup == match_against

-        if hasattr(match_against, 'match'):
+        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

-        if hasattr(match_against, '__iter__'):
-            # The markup must be an exact match against something
-            # in the iterable.
-            return markup in match_against
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match


 class ResultSet(list):
@@ -1723,3 +1801,8 @@ class ResultSet(list):
    def __init__(self, source, result=()):
        super(ResultSet, self).__init__(result)
        self.source = source
+
+    def __getattr__(self, key):
+        raise AttributeError(
+            "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+        )
@@ -1,5 +1,7 @@
 """Helper classes for tests."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"

 import pickle
@@ -67,6 +69,18 @@ class HTMLTreeBuilderSmokeTest(object):
    markup in these tests, there's not much room for interpretation.
    """

+    def test_empty_element_tags(self):
+        """Verify that all HTML4 and HTML5 empty element (aka void element) tags
+        are handled correctly.
+        """
+        for name in [
+                'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+                'spacer', 'frame'
+        ]:
+            soup = self.soup("")
+            new_tag = soup.new_tag(name)
+            self.assertEqual(True, new_tag.is_empty_element)
+    
    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
@@ -137,6 +151,14 @@ class HTMLTreeBuilderSmokeTest(object):
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
+        # We test both Unicode and bytestring to verify that
+        # process_markup correctly sets processing_instruction_class
+        # even when the markup is already Unicode and there is no
+        # need to process anything.
+        markup = u"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.decode())
+
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))
@@ -215,9 +237,22 @@ Hello, world!
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
-        """Whitespace must be preserved in <pre> and <textarea> tags."""
-        self.assertSoupEquals("<pre>   </pre>")
-        self.assertSoupEquals("<textarea> woo  </textarea>")
+        """Whitespace must be preserved in <pre> and <textarea> tags,
+        even if that would mean not prettifying the markup.
+        """
+        pre_markup = "<pre>   </pre>"
+        textarea_markup = "<textarea> woo\nwoo  </textarea>"
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("<textarea></textarea>")
+        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
@@ -307,6 +342,13 @@ Hello, world!
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

+    def test_empty_element_tags(self):
+        """Verify consistent handling of empty-element tags,
+        no matter how they come in through the markup.
+        """
+        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+        
    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
@@ -480,7 +522,9 @@ Hello, world!
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
-        self.assertEqual(soup.original_encoding, 'iso8859-8')
+        # Some tree builders call it iso8859-8, others call it iso-8859-8.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))
@@ -563,6 +607,11 @@ class XMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

+    def test_processing_instruction(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
@@ -639,6 +688,40 @@ class XMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

+    def test_find_by_prefixed_name(self):
+        doc = """<?xml version="1.0" encoding="utf-8"?>
+<Document xmlns="http://example.com/ns0"
+    xmlns:ns1="http://example.com/ns1"
+    xmlns:ns2="http://example.com/ns2"
+    <ns1:tag>foo</ns1:tag>
+    <ns1:tag>bar</ns1:tag>
+    <ns2:tag key="value">baz</ns2:tag>
+</Document>
+"""
+        soup = self.soup(doc)
+
+        # There are three <tag> tags.
+        self.assertEqual(3, len(soup.find_all('tag')))
+
+        # But two of them are ns1:tag and one of them is ns2:tag.
+        self.assertEqual(2, len(soup.find_all('ns1:tag')))
+        self.assertEqual(1, len(soup.find_all('ns2:tag')))
+        
+        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+        
+    def test_copy_tag_preserves_namespace(self):
+        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://example.com/ns0"/>"""
+    
+        soup = self.soup(xml)
+        tag = soup.document
+        duplicate = copy.copy(tag)
+
+        # The two tags have the same namespace prefix.
+        self.assertEqual(tag.prefix, duplicate.prefix)
+
+
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

@@ -84,6 +84,33 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

+    def test_reparented_markup_containing_identical_whitespace_nodes(self):
+        """Verify that we keep the two whitespace nodes in this
+        document distinct when reparenting the adjacent <tbody> tags.
+        """
+        markup = '<table> <tbody><tbody><ims></tbody> </table>'
+        soup = self.soup(markup)
+        space1, space2 = soup.find_all(string=' ')
+        tbody1, tbody2 = soup.find_all('tbody')
+        assert space1.next_element is tbody1
+        assert tbody2.next_element is space2
+
+    def test_reparented_markup_containing_children(self):
+        markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
+        soup = self.soup(markup)
+        noscript = soup.noscript
+        self.assertEqual("target", noscript.next_element)
+        target = soup.find(string='target')
+
+        # The 'aftermath' string was duplicated; we want the second one.
+        final_aftermath = soup.find_all(string='aftermath')[-1]
+
+        # The <noscript> tag was moved beneath a copy of the <a> tag,
+        # but the 'target' string within is still connected to the
+        # (second) 'aftermath' string.
+        self.assertEqual(final_aftermath, target.next_element)
+        self.assertEqual(target, final_aftermath.previous_element)
+        
    def test_processing_instruction(self):
        """Processing instructions become comments."""
        markup = b"""<?PITarget PIContent?>"""
@@ -96,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
        a1, a2 = soup.find_all('a')
        self.assertEqual(a1, a2)
        assert a1 is not a2
+
+    def test_foster_parenting(self):
+        markup = b"""<table><td></tbody>A"""
+        soup = self.soup(markup)
+        self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
@@ -29,4 +29,6 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        loaded = pickle.loads(dumped)
        self.assertTrue(isinstance(loaded.builder, type(tree.builder)))

-
+    def test_redundant_empty_element_closing_tags(self):
+        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
+        self.assertSoupEquals('</br></br></br>', "")
@@ -35,7 +35,6 @@ try:
 except ImportError, e:
    LXML_PRESENT = False

-PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))

 class TestConstructor(SoupTest):
@@ -77,7 +76,7 @@ class TestWarnings(SoupTest):
    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
-        self.assertEquals([], w)
+        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
@@ -118,15 +117,34 @@ class TestWarnings(SoupTest):
            soup = self.soup(filename)
        self.assertEqual(0, len(w))

-    def test_url_warning(self):
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/")
-        msg = str(w[0].message)
-        self.assertTrue("looks like a URL" in msg)
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be raised during
+        # execution.
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            soup = self.soup(u"http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))

-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("http://www.crummy.com/ is great")
-        self.assertEqual(0, len(w))

 class TestSelectiveParsing(SoupTest):

@@ -260,7 +278,7 @@ class TestEncodingConversion(SoupTest):
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
-        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
+        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 """Tests for Beautiful Soup's tree traversal methods.

@@ -222,7 +223,19 @@ class TestFindAllByName(TreeTest):
        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])

+    def test_find_with_multi_valued_attribute(self):
+        soup = self.soup(
+            "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
+        )
+        r1 = soup.find('div', 'a d');
+        r2 = soup.find('div', re.compile(r'a d'));
+        r3, r4 = soup.find_all('div', ['a b', 'a d']);
+        self.assertEqual('3', r1.string)
+        self.assertEqual('3', r2.string)
+        self.assertEqual('1', r3.string)
+        self.assertEqual('3', r4.string)

+        
 class TestFindAllByAttribute(TreeTest):

    def test_find_all_by_attribute_name(self):
@@ -294,10 +307,10 @@ class TestFindAllByAttribute(TreeTest):
        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

-        # Since the class is not the string "foo bar", but the two
-        # strings "foo" and "bar", this will not find anything.
+        # If the search fails to match the individual strings "foo" and "bar",
+        # it will be tried against the combined string "foo bar".
        f = tree.find_all("gar", class_=re.compile("o b"))
-        self.assertSelects(f, [])
+        self.assertSelects(f, ["Found it"])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")
@@ -335,7 +348,7 @@ class TestFindAllByAttribute(TreeTest):
        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

-    def test_find_all_with_missing_atribute(self):
+    def test_find_all_with_missing_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
@@ -1273,6 +1286,10 @@ class TestCDAtaListAttributes(SoupTest):
        soup = self.soup("<a class='foo\tbar'>")
        self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())

+    def test_get_attribute_list(self):
+        soup = self.soup("<a id='abc def'>")
+        self.assertEqual(['abc def'], soup.a.get_attribute_list('id'))
+        
    def test_accept_charset(self):
        soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
        self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
@@ -1328,6 +1345,13 @@ class TestPersistence(SoupTest):
        copied = copy.deepcopy(self.tree)
        self.assertEqual(copied.decode(), self.tree.decode())

+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
+        encoding = soup.original_encoding
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual(encoding, copy.original_encoding)
+
    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
        html = u"<b>\N{SNOWMAN}</b>"
@@ -1676,8 +1700,8 @@ class TestSoupSelector(TreeTest):
    def setUp(self):
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

-    def assertSelects(self, selector, expected_ids):
-        el_ids = [el['id'] for el in self.soup.select(selector)]
+    def assertSelects(self, selector, expected_ids, **kwargs):
+        el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
        el_ids.sort()
        expected_ids.sort()
        self.assertEqual(expected_ids, el_ids,
@@ -1720,6 +1744,13 @@ class TestSoupSelector(TreeTest):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])

+
+    def test_limit(self):
+        self.assertSelects('html div', ['main'], limit=1)
+        self.assertSelects('html body div', ['inner', 'main'], limit=2)
+        self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'],
+                           limit=10)
+
    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

@@ -1902,6 +1933,14 @@ class TestSoupSelector(TreeTest):
            ('div[data-tag]', ['data1'])
        )

+    def test_quoted_space_in_selector_name(self):
+        html = """<div style="display: wrong">nope</div>
+        <div style="display: right">yes</div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        [chosen] = soup.select('div[style="display: right"]')
+        self.assertEqual("yes", chosen.string)
+
    def test_unsupported_pseudoclass(self):
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
@@ -1,3 +1,3 @@
 from .core import where, old_where

-__version__ = "2017.04.17"
+__version__ = "2017.11.05"
@@ -19,17 +19,18 @@ class DeprecatedBundleWarning(DeprecationWarning):


 def where():
-    f = os.path.split(__file__)[0]
+    f = os.path.dirname(__file__)

    return os.path.join(f, 'cacert.pem')


 def old_where():
    warnings.warn(
-        "The weak security bundle is being deprecated.",
+        "The weak security bundle is being deprecated. It will be removed in "
+        "2018.",
        DeprecatedBundleWarning
    )
-    f = os.path.split(__file__)[0]
+    f = os.path.dirname(__file__)
    return os.path.join(f, 'weak.pem')

 if __name__ == '__main__':
@@ -0,0 +1,436 @@
+"""contextlib2 - backports and enhancements to the contextlib module"""
+
+import sys
+import warnings
+from collections import deque
+from functools import wraps
+
+__all__ = ["contextmanager", "closing", "ContextDecorator", "ExitStack",
+           "redirect_stdout", "redirect_stderr", "suppress"]
+
+# Backwards compatibility
+__all__ += ["ContextStack"]
+
+class ContextDecorator(object):
+    "A base class or mixin that enables context managers to work as decorators."
+
+    def refresh_cm(self):
+        """Returns the context manager used to actually wrap the call to the
+        decorated function.
+
+        The default implementation just returns *self*.
+
+        Overriding this method allows otherwise one-shot context managers
+        like _GeneratorContextManager to support use as decorators via
+        implicit recreation.
+
+        DEPRECATED: refresh_cm was never added to the standard library's
+                    ContextDecorator API
+        """
+        warnings.warn("refresh_cm was never added to the standard library",
+                      DeprecationWarning)
+        return self._recreate_cm()
+
+    def _recreate_cm(self):
+        """Return a recreated instance of self.
+
+        Allows an otherwise one-shot context manager like
+        _GeneratorContextManager to support use as
+        a decorator via implicit recreation.
+
+        This is a private interface just for _GeneratorContextManager.
+        See issue #11647 for details.
+        """
+        return self
+
+    def __call__(self, func):
+        @wraps(func)
+        def inner(*args, **kwds):
+            with self._recreate_cm():
+                return func(*args, **kwds)
+        return inner
+
+
+class _GeneratorContextManager(ContextDecorator):
+    """Helper for @contextmanager decorator."""
+
+    def __init__(self, func, args, kwds):
+        self.gen = func(*args, **kwds)
+        self.func, self.args, self.kwds = func, args, kwds
+        # Issue 19330: ensure context manager instances have good docstrings
+        doc = getattr(func, "__doc__", None)
+        if doc is None:
+            doc = type(self).__doc__
+        self.__doc__ = doc
+        # Unfortunately, this still doesn't provide good help output when
+        # inspecting the created context manager instances, since pydoc
+        # currently bypasses the instance docstring and shows the docstring
+        # for the class instead.
+        # See http://bugs.python.org/issue19404 for more details.
+
+    def _recreate_cm(self):
+        # _GCM instances are one-shot context managers, so the
+        # CM must be recreated each time a decorated function is
+        # called
+        return self.__class__(self.func, self.args, self.kwds)
+
+    def __enter__(self):
+        try:
+            return next(self.gen)
+        except StopIteration:
+            raise RuntimeError("generator didn't yield")
+
+    def __exit__(self, type, value, traceback):
+        if type is None:
+            try:
+                next(self.gen)
+            except StopIteration:
+                return
+            else:
+                raise RuntimeError("generator didn't stop")
+        else:
+            if value is None:
+                # Need to force instantiation so we can reliably
+                # tell if we get the same exception back
+                value = type()
+            try:
+                self.gen.throw(type, value, traceback)
+                raise RuntimeError("generator didn't stop after throw()")
+            except StopIteration as exc:
+                # Suppress StopIteration *unless* it's the same exception that
+                # was passed to throw().  This prevents a StopIteration
+                # raised inside the "with" statement from being suppressed.
+                return exc is not value
+            except RuntimeError as exc:
+                # Don't re-raise the passed in exception
+                if exc is value:
+                    return False
+                # Likewise, avoid suppressing if a StopIteration exception
+                # was passed to throw() and later wrapped into a RuntimeError
+                # (see PEP 479).
+                if _HAVE_EXCEPTION_CHAINING and exc.__cause__ is value:
+                    return False
+                raise
+            except:
+                # only re-raise if it's *not* the exception that was
+                # passed to throw(), because __exit__() must not raise
+                # an exception unless __exit__() itself failed.  But throw()
+                # has to raise the exception to signal propagation, so this
+                # fixes the impedance mismatch between the throw() protocol
+                # and the __exit__() protocol.
+                #
+                if sys.exc_info()[1] is not value:
+                    raise
+
+
+def contextmanager(func):
+    """@contextmanager decorator.
+
+    Typical usage:
+
+        @contextmanager
+        def some_generator(<arguments>):
+            <setup>
+            try:
+                yield <value>
+            finally:
+                <cleanup>
+
+    This makes this:
+
+        with some_generator(<arguments>) as <variable>:
+            <body>
+
+    equivalent to this:
+
+        <setup>
+        try:
+            <variable> = <value>
+            <body>
+        finally:
+            <cleanup>
+
+    """
+    @wraps(func)
+    def helper(*args, **kwds):
+        return _GeneratorContextManager(func, args, kwds)
+    return helper
+
+
+class closing(object):
+    """Context to automatically close something at the end of a block.
+
+    Code like this:
+
+        with closing(<module>.open(<arguments>)) as f:
+            <block>
+
+    is equivalent to this:
+
+        f = <module>.open(<arguments>)
+        try:
+            <block>
+        finally:
+            f.close()
+
+    """
+    def __init__(self, thing):
+        self.thing = thing
+    def __enter__(self):
+        return self.thing
+    def __exit__(self, *exc_info):
+        self.thing.close()
+
+
+class _RedirectStream(object):
+
+    _stream = None
+
+    def __init__(self, new_target):
+        self._new_target = new_target
+        # We use a list of old targets to make this CM re-entrant
+        self._old_targets = []
+
+    def __enter__(self):
+        self._old_targets.append(getattr(sys, self._stream))
+        setattr(sys, self._stream, self._new_target)
+        return self._new_target
+
+    def __exit__(self, exctype, excinst, exctb):
+        setattr(sys, self._stream, self._old_targets.pop())
+
+
+class redirect_stdout(_RedirectStream):
+    """Context manager for temporarily redirecting stdout to another file.
+
+        # How to send help() to stderr
+        with redirect_stdout(sys.stderr):
+            help(dir)
+
+        # How to write help() to a file
+        with open('help.txt', 'w') as f:
+            with redirect_stdout(f):
+                help(pow)
+    """
+
+    _stream = "stdout"
+
+
+class redirect_stderr(_RedirectStream):
+    """Context manager for temporarily redirecting stderr to another file."""
+
+    _stream = "stderr"
+
+
+class suppress(object):
+    """Context manager to suppress specified exceptions
+
+    After the exception is suppressed, execution proceeds with the next
+    statement following the with statement.
+
+         with suppress(FileNotFoundError):
+             os.remove(somefile)
+         # Execution still resumes here if the file was already removed
+    """
+
+    def __init__(self, *exceptions):
+        self._exceptions = exceptions
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exctype, excinst, exctb):
+        # Unlike isinstance and issubclass, CPython exception handling
+        # currently only looks at the concrete type hierarchy (ignoring
+        # the instance and subclass checking hooks). While Guido considers
+        # that a bug rather than a feature, it's a fairly hard one to fix
+        # due to various internal implementation details. suppress provides
+        # the simpler issubclass based semantics, rather than trying to
+        # exactly reproduce the limitations of the CPython interpreter.
+        #
+        # See http://bugs.python.org/issue12029 for more details
+        return exctype is not None and issubclass(exctype, self._exceptions)
+
+
+# Context manipulation is Python 3 only
+_HAVE_EXCEPTION_CHAINING = sys.version_info[0] >= 3
+if _HAVE_EXCEPTION_CHAINING:
+    def _make_context_fixer(frame_exc):
+        def _fix_exception_context(new_exc, old_exc):
+            # Context may not be correct, so find the end of the chain
+            while 1:
+                exc_context = new_exc.__context__
+                if exc_context is old_exc:
+                    # Context is already set correctly (see issue 20317)
+                    return
+                if exc_context is None or exc_context is frame_exc:
+                    break
+                new_exc = exc_context
+            # Change the end of the chain to point to the exception
+            # we expect it to reference
+            new_exc.__context__ = old_exc
+        return _fix_exception_context
+
+    def _reraise_with_existing_context(exc_details):
+        try:
+            # bare "raise exc_details[1]" replaces our carefully
+            # set-up context
+            fixed_ctx = exc_details[1].__context__
+            raise exc_details[1]
+        except BaseException:
+            exc_details[1].__context__ = fixed_ctx
+            raise
+else:
+    # No exception context in Python 2
+    def _make_context_fixer(frame_exc):
+        return lambda new_exc, old_exc: None
+
+    # Use 3 argument raise in Python 2,
+    # but use exec to avoid SyntaxError in Python 3
+    def _reraise_with_existing_context(exc_details):
+        exc_type, exc_value, exc_tb = exc_details
+        exec ("raise exc_type, exc_value, exc_tb")
+
+# Handle old-style classes if they exist
+try:
+    from types import InstanceType
+except ImportError:
+    # Python 3 doesn't have old-style classes
+    _get_type = type
+else:
+    # Need to handle old-style context managers on Python 2
+    def _get_type(obj):
+        obj_type = type(obj)
+        if obj_type is InstanceType:
+            return obj.__class__ # Old-style class
+        return obj_type # New-style class
+
+# Inspired by discussions on http://bugs.python.org/issue13585
+class ExitStack(object):
+    """Context manager for dynamic management of a stack of exit callbacks
+
+    For example:
+
+        with ExitStack() as stack:
+            files = [stack.enter_context(open(fname)) for fname in filenames]
+            # All opened files will automatically be closed at the end of
+            # the with statement, even if attempts to open files later
+            # in the list raise an exception
+
+    """
+    def __init__(self):
+        self._exit_callbacks = deque()
+
+    def pop_all(self):
+        """Preserve the context stack by transferring it to a new instance"""
+        new_stack = type(self)()
+        new_stack._exit_callbacks = self._exit_callbacks
+        self._exit_callbacks = deque()
+        return new_stack
+
+    def _push_cm_exit(self, cm, cm_exit):
+        """Helper to correctly register callbacks to __exit__ methods"""
+        def _exit_wrapper(*exc_details):
+            return cm_exit(cm, *exc_details)
+        _exit_wrapper.__self__ = cm
+        self.push(_exit_wrapper)
+
+    def push(self, exit):
+        """Registers a callback with the standard __exit__ method signature
+
+        Can suppress exceptions the same way __exit__ methods can.
+
+        Also accepts any object with an __exit__ method (registering a call
+        to the method instead of the object itself)
+        """
+        # We use an unbound method rather than a bound method to follow
+        # the standard lookup behaviour for special methods
+        _cb_type = _get_type(exit)
+        try:
+            exit_method = _cb_type.__exit__
+        except AttributeError:
+            # Not a context manager, so assume it's a callable
+            self._exit_callbacks.append(exit)
+        else:
+            self._push_cm_exit(exit, exit_method)
+        return exit # Allow use as a decorator
+
+    def callback(self, callback, *args, **kwds):
+        """Registers an arbitrary callback and arguments.
+
+        Cannot suppress exceptions.
+        """
+        def _exit_wrapper(exc_type, exc, tb):
+            callback(*args, **kwds)
+        # We changed the signature, so using @wraps is not appropriate, but
+        # setting __wrapped__ may still help with introspection
+        _exit_wrapper.__wrapped__ = callback
+        self.push(_exit_wrapper)
+        return callback # Allow use as a decorator
+
+    def enter_context(self, cm):
+        """Enters the supplied context manager
+
+        If successful, also pushes its __exit__ method as a callback and
+        returns the result of the __enter__ method.
+        """
+        # We look up the special methods on the type to match the with statement
+        _cm_type = _get_type(cm)
+        _exit = _cm_type.__exit__
+        result = _cm_type.__enter__(cm)
+        self._push_cm_exit(cm, _exit)
+        return result
+
+    def close(self):
+        """Immediately unwind the context stack"""
+        self.__exit__(None, None, None)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc_details):
+        received_exc = exc_details[0] is not None
+
+        # We manipulate the exception state so it behaves as though
+        # we were actually nesting multiple with statements
+        frame_exc = sys.exc_info()[1]
+        _fix_exception_context = _make_context_fixer(frame_exc)
+
+        # Callbacks are invoked in LIFO order to match the behaviour of
+        # nested context managers
+        suppressed_exc = False
+        pending_raise = False
+        while self._exit_callbacks:
+            cb = self._exit_callbacks.pop()
+            try:
+                if cb(*exc_details):
+                    suppressed_exc = True
+                    pending_raise = False
+                    exc_details = (None, None, None)
+            except:
+                new_exc_details = sys.exc_info()
+                # simulate the stack of exceptions by setting the context
+                _fix_exception_context(new_exc_details[1], exc_details[1])
+                pending_raise = True
+                exc_details = new_exc_details
+        if pending_raise:
+            _reraise_with_existing_context(exc_details)
+        return received_exc and suppressed_exc
+
+# Preserve backwards compatibility
+class ContextStack(ExitStack):
+    """Backwards compatibility alias for ExitStack"""
+
+    def __init__(self):
+        warnings.warn("ContextStack has been renamed to ExitStack",
+                      DeprecationWarning)
+        super(ContextStack, self).__init__()
+
+    def register_exit(self, callback):
+        return self.push(callback)
+
+    def register(self, callback, *args, **kwds):
+        return self.callback(callback, *args, **kwds)
+
+    def preserve(self):
+        return self.pop_all()
@@ -1,4 +1,4 @@
-__version__ = '0.6.2'
+__version__ = '0.6.5'

 from .lock import Lock  # noqa
-from .lock import NeedRegenerationException  # noqa
+from .lock import NeedRegenerationException  # noqa
@@ -13,6 +13,13 @@ class NoValue(object):
    def payload(self):
        return self

+    def __repr__(self):
+        """Ensure __repr__ is a consistent value in case NoValue is used to
+        fill another cache key.
+
+        """
+        return '<dogpile.cache.api.NoValue object>'
+
    if py3k:
        def __bool__(self):  # pragma NO COVERAGE
            return False
@@ -20,6 +27,7 @@ class NoValue(object):
        def __nonzero__(self):  # pragma NO COVERAGE
            return False

+
 NO_VALUE = NoValue()
 """Value returned from ``get()`` that describes
 a  key not present."""
@@ -15,3 +15,11 @@ class RegionNotConfigured(DogpileCacheException):

 class ValidationError(DogpileCacheException):
    """Error validating a value or option."""
+
+
+class PluginNotFound(DogpileCacheException):
+    """The specified plugin could not be found.
+
+    .. versionadded:: 0.6.4
+
+    """
@@ -410,7 +410,13 @@ class CacheRegion(object):
                "configured with backend: %s.  "
                "Specify replace_existing_backend=True to replace."
                % self.backend)
-        backend_cls = _backend_loader.load(backend)
+
+        try:
+            backend_cls = _backend_loader.load(backend)
+        except PluginLoader.NotFound:
+            raise exception.PluginNotFound(
+                "Couldn't find cache plugin to load: %s" % backend)
+
        if _config_argument_dict:
            self.backend = backend_cls.from_config_dict(
                _config_argument_dict,
@@ -487,8 +493,19 @@ class CacheRegion(object):
        a value.  Any retrieved value whose creation
        time is prior to this timestamp
        is considered to be stale.  It does not
-        affect the data in the cache in any way, and is also
-        local to this instance of :class:`.CacheRegion`.
+        affect the data in the cache in any way, and is
+        **local to this instance of :class:`.CacheRegion`.**
+
+        .. warning::
+
+            The :meth:`.CacheRegion.invalidate` method's default mode of
+            operation is to set a timestamp **local to this CacheRegion
+            in this Python process only**.   It does not impact other Python
+            processes or regions as the timestamp is **only stored locally in
+            memory**.  To implement invalidation where the
+            timestamp is stored in the cache or similar so that all Python
+            processes can be affected by an invalidation timestamp, implement a
+            custom :class:`.RegionInvalidationStrategy`.

        Once set, the invalidation time is honored by
        the :meth:`.CacheRegion.get_or_create`,
@@ -550,6 +567,8 @@ class CacheRegion(object):
            _config_prefix="%sarguments." % prefix,
            wrap=config_dict.get(
                "%swrap" % prefix, None),
+            replace_existing_backend=config_dict.get(
+                "%sreplace_existing_backend" % prefix, False),
        )

    @memoized_property
@@ -944,11 +963,14 @@ class CacheRegion(object):
                if not should_cache_fn:
                    self.backend.set_multi(values_w_created)
                else:
-                    self.backend.set_multi(dict(
+                    values_to_cache = dict(
                        (k, v)
                        for k, v in values_w_created.items()
                        if should_cache_fn(v[0])
-                    ))
+                    )
+
+                    if values_to_cache:
+                        self.backend.set_multi(values_to_cache)

                values.update(values_w_created)
            return [values[orig_to_mangled[k]].payload for k in keys]
@@ -1075,6 +1097,14 @@ class CacheRegion(object):
        .. versionadded:: 0.5.0 Added ``refresh()`` method to decorated
           function.

+        ``original()`` on the other hand will invoke the decorated function
+        without any caching::
+
+            newvalue = generate_something.original(5, 6)
+
+        .. versionadded:: 0.6.0 Added ``original()`` method to decorated
+           function.
+
        Lastly, the ``get()`` method returns either the value cached
        for the given key, or the token ``NO_VALUE`` if no such key
        exists::
@@ -1,4 +1,4 @@
 from .nameregistry import NameRegistry  # noqa
 from .readwrite_lock import ReadWriteMutex  # noqa
 from .langhelpers import PluginLoader, memoized_property, \
-    coerce_string_conf, to_list, KeyReentrantMutex  # noqa
+    coerce_string_conf, to_list, KeyReentrantMutex  # noqa
@@ -39,9 +39,9 @@ class PluginLoader(object):
                self.impls[name] = impl.load
                return impl.load()
            else:
-                raise Exception(
-                    "Can't load plugin %s %s" %
-                    (self.group, name))
+                raise self.NotFound(
+                    "Can't load plugin %s %s" % (self.group, name)
+                )

    def register(self, name, modulepath, objname):
        def load():
@@ -49,6 +49,9 @@ class PluginLoader(object):
            return getattr(mod, objname)
        self.impls[name] = load

+    class NotFound(Exception):
+        """The specified plugin could not be found."""
+

 class memoized_property(object):
    """A read-only @property that is only evaluated once."""
@@ -0,0 +1,32 @@
+Copyright (c) 2013, Ethan Furman.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+    Redistributions of source code must retain the above
+    copyright notice, this list of conditions and the
+    following disclaimer.
+
+    Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials
+    provided with the distribution.
+
+    Neither the name Ethan Furman nor the names of any
+    contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,3 @@
+enum34 is the new Python stdlib enum module available in Python 3.4,
+backported for previous versions of Python from 2.4 to 3.3.
+Tested on 2.6, 2.7, and 3.3+.
@@ -0,0 +1,837 @@
+"""Python Enumerations"""
+
+import sys as _sys
+
+__all__ = ['Enum', 'IntEnum', 'unique']
+
+version = 1, 1, 6
+
+pyver = float('%s.%s' % _sys.version_info[:2])  # NOTE(review): float form collapses two-digit minors (3.10 -> 3.1); harmless for 2.6/3.0 thresholds, TODO confirm no finer comparison exists
+
+try:
+    any
+except NameError:
+    def any(iterable):
+        for element in iterable:
+            if element:
+                return True
+        return False
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    OrderedDict = None
+
+try:
+    basestring
+except NameError:
+    # In Python 2 basestring is the ancestor of both str and unicode
+    # in Python 3 it's just str, but was missing in 3.1
+    basestring = str
+
+try:
+    unicode
+except NameError:
+    # In Python 3 unicode no longer exists (it's just str)
+    unicode = str
+
+class _RouteClassAttributeToGetattr(object):
+    """Route attribute access on a class to __getattr__.
+
+    This is a descriptor, used to define attributes that act differently when
+    accessed through an instance and through a class.  Instance access remains
+    normal, but access to an attribute through a class will be routed to the
+    class's __getattr__ method; this is done by raising AttributeError.
+
+    """
+    def __init__(self, fget=None):
+        self.fget = fget
+
+    def __get__(self, instance, ownerclass=None):
+        if instance is None:
+            raise AttributeError()
+        return self.fget(instance)
+
+    def __set__(self, instance, value):
+        raise AttributeError("can't set attribute")
+
+    def __delete__(self, instance):
+        raise AttributeError("can't delete attribute")
+
+
+def _is_descriptor(obj):
+    """Return True if obj defines any descriptor-protocol method, else False."""
+    # implementing any one of the three protocol methods is enough
+    return any(
+            hasattr(obj, slot)
+            for slot in ('__get__', '__set__', '__delete__'))
+
+
+def _is_dunder(name):
+    """Return True for a __dunder__ name (exactly two underscores each side)."""
+    return (len(name) > 4 and
+            name[:2] == name[-2:] == '__' and
+            name[2:3] != '_' and
+            name[-3:-2] != '_')
+
+
+def _is_sunder(name):
+    """Return True for a _sunder_ name, else False; '' is False, not an error."""
+    return (len(name) > 2 and  # length first: name[0] would raise on ''
+            name[0] == name[-1] == '_' and
+            name[1:2] != '_' and
+            name[-2:-1] != '_')
+
+
+def _make_class_unpicklable(cls):
+    """Make the given class un-picklable."""
+    def _break_on_call_reduce(self, protocol=None):
+        raise TypeError('%r cannot be pickled' % self)
+    cls.__reduce_ex__ = _break_on_call_reduce
+    cls.__module__ = '<unknown>'
+
+
+class _EnumDict(dict):
+    """Track enum member order and ensure member names are not reused.
+
+    EnumMeta will use the names found in self._member_names as the
+    enumeration member names.
+
+    """
+    def __init__(self):
+        super(_EnumDict, self).__init__()
+        self._member_names = []
+
+    def __setitem__(self, key, value):
+        """Changes anything not dundered or not a descriptor.
+
+        If a descriptor is added with the same name as an enum member, the name
+        is removed from _member_names (this may leave a hole in the numerical
+        sequence of values).
+
+        If an enum member name is used twice, an error is raised; duplicate
+        values are not checked for.
+
+        Single underscore (sunder) names are reserved.
+
+        Note:   in 3.x __order__ is simply discarded as an unnecessary
+                leftover from 2.x
+
+        """
+        if pyver >= 3.0 and key in ('_order_', '__order__'):
+            return
+        elif key == '__order__':
+            key = '_order_'
+        if _is_sunder(key):
+            if key != '_order_':
+                raise ValueError('_names_ are reserved for future Enum use')
+        elif _is_dunder(key):
+            pass
+        elif key in self._member_names:
+            # descriptor overwriting an enum?
+            raise TypeError('Attempted to reuse key: %r' % key)
+        elif not _is_descriptor(value):
+            if key in self:
+                # enum overwriting a descriptor?
+                raise TypeError('Key already defined as: %r' % self[key])
+            self._member_names.append(key)
+        super(_EnumDict, self).__setitem__(key, value)
+
+
+# Dummy value for Enum as EnumMeta explicitly checks for it, but of course until
+# EnumMeta finishes running the first time the Enum class doesn't exist.  This
+# is also why there are checks in EnumMeta like `if Enum is not None`
+Enum = None
+
+
+class EnumMeta(type):
+    """Metaclass for Enum"""
+    @classmethod
+    def __prepare__(metacls, cls, bases):
+        return _EnumDict()
+
+    def __new__(metacls, cls, bases, classdict):
+        # an Enum class is final once enumeration items have been defined; it
+        # cannot be mixed with other types (int, float, etc.) if it has an
+        # inherited __new__ unless a new __new__ is defined (or the resulting
+        # class will fail).
+        if type(classdict) is dict:
+            original_dict = classdict
+            classdict = _EnumDict()
+            for k, v in original_dict.items():
+                classdict[k] = v
+
+        member_type, first_enum = metacls._get_mixins_(bases)
+        __new__, save_new, use_args = metacls._find_new_(classdict, member_type,
+                                                        first_enum)
+        # save enum items into separate mapping so they don't get baked into
+        # the new class
+        members = dict((k, classdict[k]) for k in classdict._member_names)
+        for name in classdict._member_names:
+            del classdict[name]
+
+        # py2 support for definition order
+        _order_ = classdict.get('_order_')
+        if _order_ is None:
+            if pyver < 3.0:
+                try:
+                    _order_ = [name for (name, value) in sorted(members.items(), key=lambda item: item[1])]
+                except TypeError:
+                    _order_ = [name for name in sorted(members.keys())]
+            else:
+                _order_ = classdict._member_names
+        else:
+            del classdict['_order_']
+            if pyver < 3.0:
+                _order_ = _order_.replace(',', ' ').split()
+                aliases = [name for name in members if name not in _order_]
+                _order_ += aliases
+
+        # check for illegal enum names (any others?)
+        invalid_names = set(members) & set(['mro'])
+        if invalid_names:
+            raise ValueError('Invalid enum member name(s): %s' % (
+                ', '.join(invalid_names), ))
+
+        # save attributes from super classes so we know if we can take
+        # the shortcut of storing members in the class dict
+        base_attributes = set([a for b in bases for a in b.__dict__])
+        # create our new Enum type
+        enum_class = super(EnumMeta, metacls).__new__(metacls, cls, bases, classdict)
+        enum_class._member_names_ = []               # names in random order
+        if OrderedDict is not None:
+            enum_class._member_map_ = OrderedDict()
+        else:
+            enum_class._member_map_ = {}             # name->value map
+        enum_class._member_type_ = member_type
+
+        # Reverse value->name map for hashable values.
+        enum_class._value2member_map_ = {}
+
+        # instantiate them, checking for duplicates as we go
+        # we instantiate first instead of checking for duplicates first in case
+        # a custom __new__ is doing something funky with the values -- such as
+        # auto-numbering ;)
+        if __new__ is None:
+            __new__ = enum_class.__new__
+        for member_name in _order_:
+            value = members[member_name]
+            if not isinstance(value, tuple):
+                args = (value, )
+            else:
+                args = value
+            if member_type is tuple:   # special case for tuple enums
+                args = (args, )     # wrap it one more time
+            if not use_args or not args:
+                enum_member = __new__(enum_class)
+                if not hasattr(enum_member, '_value_'):
+                    enum_member._value_ = value
+            else:
+                enum_member = __new__(enum_class, *args)
+                if not hasattr(enum_member, '_value_'):
+                    enum_member._value_ = member_type(*args)
+            value = enum_member._value_
+            enum_member._name_ = member_name
+            enum_member.__objclass__ = enum_class
+            enum_member.__init__(*args)
+            # If another member with the same value was already defined, the
+            # new member becomes an alias to the existing one.
+            for name, canonical_member in enum_class._member_map_.items():
+                if canonical_member.value == enum_member._value_:
+                    enum_member = canonical_member
+                    break
+            else:
+                # Aliases don't appear in member names (only in __members__).
+                enum_class._member_names_.append(member_name)
+            # performance boost for any member that would not shadow
+            # a DynamicClassAttribute (aka _RouteClassAttributeToGetattr)
+            if member_name not in base_attributes:
+                setattr(enum_class, member_name, enum_member)
+            # now add to _member_map_
+            enum_class._member_map_[member_name] = enum_member
+            try:
+                # This may fail if value is not hashable. We can't add the value
+                # to the map, and by-value lookups for this value will be
+                # linear.
+                enum_class._value2member_map_[value] = enum_member
+            except TypeError:
+                pass
+
+
+        # If a custom type is mixed into the Enum, and it does not know how
+        # to pickle itself, pickle.dumps will succeed but pickle.loads will
+        # fail.  Rather than have the error show up later and possibly far
+        # from the source, sabotage the pickle protocol for this class so
+        # that pickle.dumps also fails.
+        #
+        # However, if the new class implements its own __reduce_ex__, do not
+        # sabotage -- it's on them to make sure it works correctly.  We use
+        # __reduce_ex__ instead of any of the others as it is preferred by
+        # pickle over __reduce__, and it handles all pickle protocols.
+        unpicklable = False
+        if '__reduce_ex__' not in classdict:
+            if member_type is not object:
+                methods = ('__getnewargs_ex__', '__getnewargs__',
+                        '__reduce_ex__', '__reduce__')
+                if not any(m in member_type.__dict__ for m in methods):
+                    _make_class_unpicklable(enum_class)
+                    unpicklable = True
+
+
+        # double check that repr and friends are not the mixin's or various
+        # things break (such as pickle)
+        for name in ('__repr__', '__str__', '__format__', '__reduce_ex__'):
+            class_method = getattr(enum_class, name)
+            obj_method = getattr(member_type, name, None)
+            enum_method = getattr(first_enum, name, None)
+            if name not in classdict and class_method is not enum_method:
+                if name == '__reduce_ex__' and unpicklable:
+                    continue
+                setattr(enum_class, name, enum_method)
+
+        # method resolution and int's are not playing nice
+        # Python versions older than 2.6 use __cmp__
+
+        if pyver < 2.6:
+
+            if issubclass(enum_class, int):
+                setattr(enum_class, '__cmp__', getattr(int, '__cmp__'))
+
+        elif pyver < 3.0:
+
+            if issubclass(enum_class, int):
+                for method in (
+                        '__le__',
+                        '__lt__',
+                        '__gt__',
+                        '__ge__',
+                        '__eq__',
+                        '__ne__',
+                        '__hash__',
+                        ):
+                    setattr(enum_class, method, getattr(int, method))
+
+        # replace any other __new__ with our own (as long as Enum is not None,
+        # anyway) -- again, this is to support pickle
+        if Enum is not None:
+            # if the user defined their own __new__, save it before it gets
+            # clobbered in case they subclass later
+            if save_new:
+                setattr(enum_class, '__member_new__', enum_class.__dict__['__new__'])
+            setattr(enum_class, '__new__', Enum.__dict__['__new__'])
+        return enum_class
+
+    def __bool__(cls):
+        """
+        classes/types should always be True.
+        """
+        return True
+
+    def __call__(cls, value, names=None, module=None, type=None, start=1):
+        """Either returns an existing member, or creates a new enum class.
+
+        This method is used both when an enum class is given a value to match
+        to an enumeration member (i.e. Color(3)) and for the functional API
+        (i.e. Color = Enum('Color', names='red green blue')).
+
+        When used for the functional API: `module`, if set, will be stored in
+        the new class' __module__ attribute; `type`, if set, will be mixed in
+        as the first base class.
+
+        Note: if `module` is not set this routine will attempt to discover the
+        calling module by walking the frame stack; if this is unsuccessful
+        the resulting class will not be pickleable.
+
+        """
+        if names is None:  # simple value lookup
+            return cls.__new__(cls, value)
+        # otherwise, functional API: we're creating a new Enum type
+        return cls._create_(value, names, module=module, type=type, start=start)
+
+    def __contains__(cls, member):
+        return isinstance(member, cls) and member.name in cls._member_map_
+
+    def __delattr__(cls, attr):
+        # nicer error message when someone tries to delete an attribute
+        # (see issue19025).
+        if attr in cls._member_map_:
+            raise AttributeError(
+                    "%s: cannot delete Enum member." % cls.__name__)
+        super(EnumMeta, cls).__delattr__(attr)
+
+    def __dir__(self):
+        return (['__class__', '__doc__', '__members__', '__module__'] +
+                self._member_names_)
+
+    @property
+    def __members__(cls):
+        """Returns a mapping of member name->value.
+
+        This mapping lists all enum members, including aliases. Note that this
+        is a copy of the internal mapping.
+
+        """
+        return cls._member_map_.copy()
+
+    def __getattr__(cls, name):
+        """Return the enum member matching `name`
+
+        We use __getattr__ instead of descriptors or inserting into the enum
+        class' __dict__ in order to support `name` and `value` being both
+        properties for enum members (which live in the class' __dict__) and
+        enum members themselves.
+
+        """
+        if _is_dunder(name):
+            raise AttributeError(name)
+        try:
+            return cls._member_map_[name]
+        except KeyError:
+            raise AttributeError(name)
+
+    def __getitem__(cls, name):
+        return cls._member_map_[name]
+
+    def __iter__(cls):
+        return (cls._member_map_[name] for name in cls._member_names_)
+
+    def __reversed__(cls):
+        return (cls._member_map_[name] for name in reversed(cls._member_names_))
+
+    def __len__(cls):
+        return len(cls._member_names_)
+
+    __nonzero__ = __bool__
+
+    def __repr__(cls):
+        return "<enum %r>" % cls.__name__
+
+    def __setattr__(cls, name, value):
+        """Block attempts to reassign Enum members.
+
+        A simple assignment to the class namespace only changes one of the
+        several possible ways to get an Enum member from the Enum class,
+        resulting in an inconsistent Enumeration.
+
+        """
+        member_map = cls.__dict__.get('_member_map_', {})
+        if name in member_map:
+            raise AttributeError('Cannot reassign members.')
+        super(EnumMeta, cls).__setattr__(name, value)
+
+    def _create_(cls, class_name, names=None, module=None, type=None, start=1):
+        """Convenience method to create a new Enum class.
+
+        `names` can be:
+
+        * A string containing member names, separated either with spaces or
+          commas.  Values are auto-numbered from 1.
+        * An iterable of member names.  Values are auto-numbered from 1.
+        * An iterable of (member name, value) pairs.
+        * A mapping of member name -> value.
+
+        """
+        if pyver < 3.0:
+            # if class_name is unicode, attempt a conversion to ASCII
+            if isinstance(class_name, unicode):
+                try:
+                    class_name = class_name.encode('ascii')
+                except UnicodeEncodeError:
+                    raise TypeError('%r is not representable in ASCII' % class_name)
+        metacls = cls.__class__
+        if type is None:
+            bases = (cls, )
+        else:
+            bases = (type, cls)
+        classdict = metacls.__prepare__(class_name, bases)
+        _order_ = []
+
+        # special processing needed for names?
+        if isinstance(names, basestring):
+            names = names.replace(',', ' ').split()
+        if isinstance(names, (tuple, list)) and isinstance(names[0], basestring):
+            names = [(e, i+start) for (i, e) in enumerate(names)]
+
+        # Here, names is either an iterable of (name, value) or a mapping.
+        item = None  # in case names is empty
+        for item in names:
+            if isinstance(item, basestring):
+                member_name, member_value = item, names[item]
+            else:
+                member_name, member_value = item
+            classdict[member_name] = member_value
+            _order_.append(member_name)
+        # only set _order_ in classdict if name/value was not from a mapping
+        if not isinstance(item, basestring):
+            classdict['_order_'] = ' '.join(_order_)
+        enum_class = metacls.__new__(metacls, class_name, bases, classdict)
+
+        # TODO: replace the frame hack if a blessed way to know the calling
+        # module is ever developed
+        if module is None:
+            try:
+                module = _sys._getframe(2).f_globals['__name__']
+            except (AttributeError, ValueError):
+                pass
+        if module is None:
+            _make_class_unpicklable(enum_class)
+        else:
+            enum_class.__module__ = module
+
+        return enum_class
+
+    @staticmethod
+    def _get_mixins_(bases):
+        """Returns the type for creating enum members, and the first inherited
+        enum class.
+
+        bases: the tuple of bases that was given to __new__
+
+        Returns a `(member_type, first_enum)` tuple.  When `bases` is empty
+        or `Enum` is still None, `(object, Enum)` is returned.
+
+        Raises TypeError if any base other than Enum is an Enum subclass
+        that already has members, or if the last base is not an Enum
+        subclass.
+
+        """
+        # NOTE(review): Enum is presumably None only while the Enum base
+        # class itself is being created -- confirm against the module top.
+        if not bases or Enum is None:
+            return object, Enum
+
+
+        # double check that we are not subclassing a class with existing
+        # enumeration members; while we're at it, see if any other data
+        # type has been mixed in so we can use the correct __new__
+        member_type = first_enum = None
+        for base in bases:
+            if  (base is not Enum and
+                    issubclass(base, Enum) and
+                    base._member_names_):
+                raise TypeError("Cannot extend enumerations")
+        # base is now the last base in bases
+        if not issubclass(base, Enum):
+            raise TypeError("new enumerations must be created as "
+                    "`ClassName([mixin_type,] enum_type)`")
+
+        # get correct mix-in type (either mix-in type of Enum subclass, or
+        # first base if last base is Enum)
+        if not issubclass(bases[0], Enum):
+            member_type = bases[0]     # first data type
+            first_enum = bases[-1]  # enum type
+        else:
+            # walk the MRO of the first base: the first Enum subclass found
+            # becomes first_enum, the first non-Enum class becomes member_type
+            for base in bases[0].__mro__:
+                # most common: (IntEnum, int, Enum, object)
+                # possible:    (<Enum 'AutoIntEnum'>, <Enum 'IntEnum'>,
+                #               <class 'int'>, <Enum 'Enum'>,
+                #               <class 'object'>)
+                if issubclass(base, Enum):
+                    if first_enum is None:
+                        first_enum = base
+                else:
+                    if member_type is None:
+                        member_type = base
+
+        return member_type, first_enum
+
+    if pyver < 3.0:
+        @staticmethod
+        def _find_new_(classdict, member_type, first_enum):
+            """Returns the __new__ to be used for creating the enum members.
+
+            classdict: the class dictionary given to __new__
+            member_type: the data type whose __new__ will be used by default
+            first_enum: enumeration to check for an overriding __new__
+
+            The result is a `(__new__, save_new, use_args)` tuple.  The
+            first item is None when classdict already holds the __new__ to
+            use (either user-defined or copied from a __member_new__).
+
+            """
+            # now find the correct __new__, checking to see if one was defined
+            # by the user; also check earlier enum classes in case a __new__ was
+            # saved as __member_new__
+            __new__ = classdict.get('__new__', None)
+            if __new__:
+                return None, True, True      # __new__, save_new, use_args
+
+            # the default __new__ implementations that do not count as an
+            # override in the search below
+            N__new__ = getattr(None, '__new__')
+            O__new__ = getattr(object, '__new__')
+            if Enum is None:
+                E__new__ = N__new__
+            else:
+                E__new__ = Enum.__dict__['__new__']
+            # check all possibles for __member_new__ before falling back to
+            # __new__
+            for method in ('__member_new__', '__new__'):
+                for possible in (member_type, first_enum):
+                    try:
+                        target = possible.__dict__[method]
+                    except (AttributeError, KeyError):
+                        target = getattr(possible, method, None)
+                    if target not in [
+                            None,
+                            N__new__,
+                            O__new__,
+                            E__new__,
+                            ]:
+                        if method == '__member_new__':
+                            # reuse the saved __new__ by placing it in classdict
+                            classdict['__new__'] = target
+                            return None, False, True
+                        if isinstance(target, staticmethod):
+                            # unwrap the staticmethod to get the plain function
+                            target = target.__get__(member_type)
+                        __new__ = target
+                        break
+                if __new__ is not None:
+                    break
+            else:
+                # neither loop found an override
+                __new__ = object.__new__
+
+            # if a non-object.__new__ is used then whatever value/tuple was
+            # assigned to the enum member name will be passed to __new__ and to the
+            # new enum member's __init__
+            if __new__ is object.__new__:
+                use_args = False
+            else:
+                use_args = True
+
+            return __new__, False, use_args
+    else:
+        @staticmethod
+        def _find_new_(classdict, member_type, first_enum):
+            """Returns the __new__ to be used for creating the enum members.
+
+            classdict: the class dictionary given to __new__
+            member_type: the data type whose __new__ will be used by default
+            first_enum: enumeration to check for an overriding __new__
+
+            The result is a `(__new__, save_new, use_args)` tuple: the
+            callable to use, whether classdict defined its own __new__, and
+            whether the member's assigned value is passed on as arguments.
+
+            """
+            # now find the correct __new__, checking to see if one was defined
+            # by the user; also check earlier enum classes in case a __new__ was
+            # saved as __member_new__
+            __new__ = classdict.get('__new__', None)
+
+            # should __new__ be saved as __member_new__ later?
+            save_new = __new__ is not None
+
+            if __new__ is None:
+                # check all possibles for __member_new__ before falling back to
+                # __new__
+                for method in ('__member_new__', '__new__'):
+                    for possible in (member_type, first_enum):
+                        target = getattr(possible, method, None)
+                        # skip the default implementations; only a real
+                        # override is of interest
+                        if target not in (
+                                None,
+                                None.__new__,
+                                object.__new__,
+                                Enum.__new__,
+                                ):
+                            __new__ = target
+                            break
+                    if __new__ is not None:
+                        break
+                else:
+                    # neither loop found an override
+                    __new__ = object.__new__
+
+            # if a non-object.__new__ is used then whatever value/tuple was
+            # assigned to the enum member name will be passed to __new__ and to the
+            # new enum member's __init__
+            if __new__ is object.__new__:
+                use_args = False
+            else:
+                use_args = True
+
+            return __new__, save_new, use_args
+
+
+########################################################
+# In order to support Python 2 and 3 with a single
+# codebase we have to create the Enum methods separately
+# and then use the `type(name, bases, dict)` method to
+# create the class.
+########################################################
+temp_enum_dict = {}
+temp_enum_dict['__doc__'] = "Generic enumeration.\n\n    Derive from this class to define new enumerations.\n\n"
+
+def __new__(cls, value):
+    """Return the existing member of `cls` whose value is `value`.
+
+    Raises ValueError if no member matches.
+    """
+    # all enum instances are actually created during class construction
+    # without calling this method; this method is called by the metaclass'
+    # __call__ (i.e. Color(3) ), and by pickle
+    if type(value) is cls:
+        # For lookups like Color(Color.red)
+        value = value.value
+    # by-value search for a matching enum member
+    # see if it's in the reverse mapping (for hashable values)
+    try:
+        if value in cls._value2member_map_:
+            return cls._value2member_map_[value]
+    except TypeError:
+        # the `in` test raised TypeError (unhashable value), so do the long
+        # search -- O(n) behavior
+        for member in cls._member_map_.values():
+            if member.value == value:
+                return member
+    raise ValueError("%s is not a valid %s" % (value, cls.__name__))
+temp_enum_dict['__new__'] = __new__
+del __new__
+
+def __repr__(self):
+    return "<%s.%s: %r>" % (
+            self.__class__.__name__, self._name_, self._value_)
+temp_enum_dict['__repr__'] = __repr__
+del __repr__
+
+def __str__(self):
+    return "%s.%s" % (self.__class__.__name__, self._name_)
+temp_enum_dict['__str__'] = __str__
+del __str__
+
+if pyver >= 3.0:
+    def __dir__(self):
+        added_behavior = [
+                m
+                for cls in self.__class__.mro()
+                for m in cls.__dict__
+                if m[0] != '_' and m not in self._member_map_
+                ]
+        return (['__class__', '__doc__', '__module__', ] + added_behavior)
+    temp_enum_dict['__dir__'] = __dir__
+    del __dir__
+
+def __format__(self, format_spec):
+    # mixed-in Enums should use the mixed-in type's __format__, otherwise
+    # we can get strange results with the Enum name showing up instead of
+    # the value
+
+    # pure Enum branch
+    if self._member_type_ is object:
+        cls = str
+        val = str(self)
+    # mix-in branch
+    else:
+        cls = self._member_type_
+        val = self.value
+    return cls.__format__(val, format_spec)
+temp_enum_dict['__format__'] = __format__
+del __format__
+
+
+####################################
+# Python's less than 2.6 use __cmp__
+
+if pyver < 2.6:
+
+    def __cmp__(self, other):
+        if type(other) is self.__class__:
+            if self is other:
+                return 0
+            return -1
+        return NotImplemented
+        raise TypeError("unorderable types: %s() and %s()" % (self.__class__.__name__, other.__class__.__name__))
+    temp_enum_dict['__cmp__'] = __cmp__
+    del __cmp__
+
+else:
+
+    def __le__(self, other):
+        raise TypeError("unorderable types: %s() <= %s()" % (self.__class__.__name__, other.__class__.__name__))
+    temp_enum_dict['__le__'] = __le__
+    del __le__
+
+    def __lt__(self, other):
+        raise TypeError("unorderable types: %s() < %s()" % (self.__class__.__name__, other.__class__.__name__))
+    temp_enum_dict['__lt__'] = __lt__
+    del __lt__
+
+    def __ge__(self, other):
+        raise TypeError("unorderable types: %s() >= %s()" % (self.__class__.__name__, other.__class__.__name__))
+    temp_enum_dict['__ge__'] = __ge__
+    del __ge__
+
+    def __gt__(self, other):
+        raise TypeError("unorderable types: %s() > %s()" % (self.__class__.__name__, other.__class__.__name__))
+    temp_enum_dict['__gt__'] = __gt__
+    del __gt__
+
+
+def __eq__(self, other):
+    if type(other) is self.__class__:
+        return self is other
+    return NotImplemented
+temp_enum_dict['__eq__'] = __eq__
+del __eq__
+
+def __ne__(self, other):
+    if type(other) is self.__class__:
+        return self is not other
+    return NotImplemented
+temp_enum_dict['__ne__'] = __ne__
+del __ne__
+
+def __hash__(self):
+    return hash(self._name_)
+temp_enum_dict['__hash__'] = __hash__
+del __hash__
+
+def __reduce_ex__(self, proto):
+    return self.__class__, (self._value_, )
+temp_enum_dict['__reduce_ex__'] = __reduce_ex__
+del __reduce_ex__
+
+# _RouteClassAttributeToGetattr is used to provide access to the `name`
+# and `value` properties of enum members while keeping some measure of
+# protection from modification, while still allowing for an enumeration
+# to have members named `name` and `value`.  This works because enumeration
+# members are not set directly on the enum class -- __getattr__ is
+# used to look them up.
+
+@_RouteClassAttributeToGetattr
+def name(self):
+    """The name of the enum member."""
+    return self._name_
+temp_enum_dict['name'] = name
+del name
+
+@_RouteClassAttributeToGetattr
+def value(self):
+    """The value of the enum member."""
+    return self._value_
+temp_enum_dict['value'] = value
+del value
+
+@classmethod
+def _convert(cls, name, module, filter, source=None):
+    """
+    Create a new Enum subclass that replaces a collection of global constants
+    """
+    # convert all constants from source (or module) that pass filter() to
+    # a new Enum called name, and export the enum and its members back to
+    # module;
+    # also, replace the __reduce_ex__ method so unpickling works in
+    # previous Python versions
+    module_globals = vars(_sys.modules[module])
+    if source:
+        source = vars(source)
+    else:
+        source = module_globals
+    members = dict((name, value) for name, value in source.items() if filter(name))
+    cls = cls(name, members, module=module)
+    cls.__reduce_ex__ = _reduce_ex_by_name
+    module_globals.update(cls.__members__)
+    module_globals[name] = cls
+    return cls
+temp_enum_dict['_convert'] = _convert
+del _convert
+
+# build the Enum base class by calling the metaclass directly with the
+# methods collected above, then discard the scratch dictionary
+Enum = EnumMeta('Enum', (object, ), temp_enum_dict)
+del temp_enum_dict
+
+# Enum has now been created
+###########################
+
+# the mix-in data type (int) is listed ahead of Enum in the bases
+class IntEnum(int, Enum):
+    """Enum where members are also (and must be) ints"""
+
+def _reduce_ex_by_name(self, proto):
+    """__reduce_ex__ replacement that returns just the member's name."""
+    return self.name
+
+def unique(enumeration):
+    """Class decorator that ensures only unique members exist in an enumeration."""
+    duplicates = []
+    for name, member in enumeration.__members__.items():
+        if name != member.name:
+            duplicates.append((name, member.name))
+    if duplicates:
+        duplicate_names = ', '.join(
+                ["%s -> %s" % (alias, name) for (alias, name) in duplicates]
+                )
+        raise ValueError('duplicate names found in %r: %s' %
+                (enumeration, duplicate_names)
+                )
+    return enumeration
@@ -0,0 +1,735 @@
+``enum`` --- support for enumerations
+========================================
+
+.. :synopsis: enumerations are sets of symbolic names bound to unique, constant
+  values.
+.. :moduleauthor:: Ethan Furman <ethan@stoneleaf.us>
+.. :sectionauthor:: Barry Warsaw <barry@python.org>,
+.. :sectionauthor:: Eli Bendersky <eliben@gmail.com>,
+.. :sectionauthor:: Ethan Furman <ethan@stoneleaf.us>
+
+----------------
+
+An enumeration is a set of symbolic names (members) bound to unique, constant
+values.  Within an enumeration, the members can be compared by identity, and
+the enumeration itself can be iterated over.
+
+
+Module Contents
+---------------
+
+This module defines two enumeration classes that can be used to define unique
+sets of names and values: ``Enum`` and ``IntEnum``.  It also defines
+one decorator, ``unique``.
+
+``Enum``
+
+Base class for creating enumerated constants.  See section `Functional API`_
+for an alternate construction syntax.
+
+``IntEnum``
+
+Base class for creating enumerated constants that are also subclasses of ``int``.
+
+``unique``
+
+Enum class decorator that ensures only one name is bound to any one value.
+
+
+Creating an Enum
+----------------
+
+Enumerations are created using the ``class`` syntax, which makes them
+easy to read and write.  An alternative creation method is described in
+`Functional API`_.  To define an enumeration, subclass ``Enum`` as
+follows::
+
+    >>> from enum import Enum
+    >>> class Color(Enum):
+    ...     red = 1
+    ...     green = 2
+    ...     blue = 3
+
+Note: Nomenclature
+
+  - The class ``Color`` is an *enumeration* (or *enum*)
+  - The attributes ``Color.red``, ``Color.green``, etc., are
+    *enumeration members* (or *enum members*).
+  - The enum members have *names* and *values* (the name of
+    ``Color.red`` is ``red``, the value of ``Color.blue`` is
+    ``3``, etc.)
+    
+Note:
+
+    Even though we use the ``class`` syntax to create Enums, Enums
+    are not normal Python classes.  See `How are Enums different?`_ for
+    more details.
+
+Enumeration members have human readable string representations::
+
+    >>> print(Color.red)
+    Color.red
+
+...while their ``repr`` has more information::
+
+    >>> print(repr(Color.red))
+    <Color.red: 1>
+
+The *type* of an enumeration member is the enumeration it belongs to::
+
+    >>> type(Color.red)
+    <enum 'Color'>
+    >>> isinstance(Color.green, Color)
+    True
+    >>>
+
+Enum members also have a property that contains just their item name::
+
+    >>> print(Color.red.name)
+    red
+
+Enumerations support iteration.  In Python 3.x definition order is used; in
+Python 2.x the definition order is not available, but class attribute
+``__order__`` is supported;  otherwise, value order is used::
+
+    >>> class Shake(Enum):
+    ...   __order__ = 'vanilla chocolate cookies mint'  # only needed in 2.x
+    ...   vanilla = 7
+    ...   chocolate = 4
+    ...   cookies = 9
+    ...   mint = 3
+    ...
+    >>> for shake in Shake:
+    ...   print(shake)
+    ...
+    Shake.vanilla
+    Shake.chocolate
+    Shake.cookies
+    Shake.mint
+
+The ``__order__`` attribute is always removed, and in 3.x it is also ignored
+(order is definition order); however, in the stdlib version it will be ignored
+but not removed.
+
+Enumeration members are hashable, so they can be used in dictionaries and sets::
+
+    >>> apples = {}
+    >>> apples[Color.red] = 'red delicious'
+    >>> apples[Color.green] = 'granny smith'
+    >>> apples == {Color.red: 'red delicious', Color.green: 'granny smith'}
+    True
+
+
+Programmatic access to enumeration members and their attributes
+---------------------------------------------------------------
+
+Sometimes it's useful to access members in enumerations programmatically (i.e.
+situations where ``Color.red`` won't do because the exact color is not known
+at program-writing time).  ``Enum`` allows such access::
+
+    >>> Color(1)
+    <Color.red: 1>
+    >>> Color(3)
+    <Color.blue: 3>
+
+If you want to access enum members by *name*, use item access::
+
+    >>> Color['red']
+    <Color.red: 1>
+    >>> Color['green']
+    <Color.green: 2>
+
+If you have an enum member and need its ``name`` or ``value``::
+
+    >>> member = Color.red
+    >>> member.name
+    'red'
+    >>> member.value
+    1
+
+
+Duplicating enum members and values
+-----------------------------------
+
+Having two enum members (or any other attribute) with the same name is invalid;
+in Python 3.x this would raise an error, but in Python 2.x the second member
+simply overwrites the first::
+
+    >>> # python 2.x
+    >>> class Shape(Enum):
+    ...   square = 2
+    ...   square = 3
+    ...
+    >>> Shape.square
+    <Shape.square: 3>
+
+    >>> # python 3.x
+    >>> class Shape(Enum):
+    ...   square = 2
+    ...   square = 3
+    Traceback (most recent call last):
+    ...
+    TypeError: Attempted to reuse key: 'square'
+
+However, two enum members are allowed to have the same value.  Given two members
+A and B with the same value (and A defined first), B is an alias to A.  By-value
+lookup of the value of A and B will return A.  By-name lookup of B will also
+return A::
+
+    >>> class Shape(Enum):
+    ...   __order__ = 'square diamond circle alias_for_square'  # only needed in 2.x
+    ...   square = 2
+    ...   diamond = 1
+    ...   circle = 3
+    ...   alias_for_square = 2
+    ...
+    >>> Shape.square
+    <Shape.square: 2>
+    >>> Shape.alias_for_square
+    <Shape.square: 2>
+    >>> Shape(2)
+    <Shape.square: 2>
+
+
+Allowing aliases is not always desirable.  ``unique`` can be used to ensure
+that none exist in a particular enumeration::
+
+    >>> from enum import unique
+    >>> @unique
+    ... class Mistake(Enum):
+    ...   __order__ = 'one two three four'  # only needed in 2.x
+    ...   one = 1
+    ...   two = 2
+    ...   three = 3
+    ...   four = 3
+    Traceback (most recent call last):
+    ...
+    ValueError: duplicate names found in <enum 'Mistake'>: four -> three
+
+Iterating over the members of an enum does not provide the aliases::
+
+    >>> list(Shape)
+    [<Shape.square: 2>, <Shape.diamond: 1>, <Shape.circle: 3>]
+
+The special attribute ``__members__`` is a dictionary mapping names to members.
+It includes all names defined in the enumeration, including the aliases::
+
+    >>> for name, member in sorted(Shape.__members__.items()):
+    ...   name, member
+    ...
+    ('alias_for_square', <Shape.square: 2>)
+    ('circle', <Shape.circle: 3>)
+    ('diamond', <Shape.diamond: 1>)
+    ('square', <Shape.square: 2>)
+
+The ``__members__`` attribute can be used for detailed programmatic access to
+the enumeration members.  For example, finding all the aliases::
+
+    >>> [name for name, member in Shape.__members__.items() if member.name != name]
+    ['alias_for_square']
+
+Comparisons
+-----------
+
+Enumeration members are compared by identity::
+
+    >>> Color.red is Color.red
+    True
+    >>> Color.red is Color.blue
+    False
+    >>> Color.red is not Color.blue
+    True
+
+Ordered comparisons between enumeration values are *not* supported.  Enum
+members are not integers (but see `IntEnum`_ below)::
+
+    >>> Color.red < Color.blue
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: unorderable types: Color() < Color()
+
+.. warning::
+
+    In Python 2 *everything* is ordered, even though the ordering may not
+    make sense.  If you want your enumerations to have a sensible ordering
+    check out the `OrderedEnum`_ recipe below.
+
+
+Equality comparisons are defined though::
+
+    >>> Color.blue == Color.red
+    False
+    >>> Color.blue != Color.red
+    True
+    >>> Color.blue == Color.blue
+    True
+
+Comparisons against non-enumeration values will always compare not equal
+(again, ``IntEnum`` was explicitly designed to behave differently, see
+below)::
+
+    >>> Color.blue == 2
+    False
+
+
+Allowed members and attributes of enumerations
+----------------------------------------------
+
+The examples above use integers for enumeration values.  Using integers is
+short and handy (and provided by default by the `Functional API`_), but not
+strictly enforced.  In the vast majority of use-cases, one doesn't care what
+the actual value of an enumeration is.  But if the value *is* important,
+enumerations can have arbitrary values.
+
+Enumerations are Python classes, and can have methods and special methods as
+usual.  If we have this enumeration::
+
+    >>> class Mood(Enum):
+    ...   funky = 1
+    ...   happy = 3
+    ... 
+    ...   def describe(self):
+    ...     # self is the member here
+    ...     return self.name, self.value
+    ... 
+    ...   def __str__(self):
+    ...     return 'my custom str! {0}'.format(self.value)
+    ... 
+    ...   @classmethod
+    ...   def favorite_mood(cls):
+    ...     # cls here is the enumeration
+    ...     return cls.happy
+
+Then::
+
+    >>> Mood.favorite_mood()
+    <Mood.happy: 3>
+    >>> Mood.happy.describe()
+    ('happy', 3)
+    >>> str(Mood.funky)
+    'my custom str! 1'
+
+The rules for what is allowed are as follows: _sunder_ names (starting and
+ending with a single underscore) are reserved by enum and cannot be used;
+all other attributes defined within an enumeration will become members of this
+enumeration, with the exception of *__dunder__* names and descriptors (methods
+are also descriptors).
+
+Note:
+
+    If your enumeration defines ``__new__`` and/or ``__init__`` then
+    whatever value(s) were given to the enum member will be passed into
+    those methods.  See `Planet`_ for an example.
+
+
+Restricted subclassing of enumerations
+--------------------------------------
+
+Subclassing an enumeration is allowed only if the enumeration does not define
+any members.  So this is forbidden::
+
+    >>> class MoreColor(Color):
+    ...   pink = 17
+    Traceback (most recent call last):
+    ...
+    TypeError: Cannot extend enumerations
+
+But this is allowed::
+
+    >>> class Foo(Enum):
+    ...   def some_behavior(self):
+    ...     pass
+    ...
+    >>> class Bar(Foo):
+    ...   happy = 1
+    ...   sad = 2
+    ...
+
+Allowing subclassing of enums that define members would lead to a violation of
+some important invariants of types and instances.  On the other hand, it makes
+sense to allow sharing some common behavior between a group of enumerations.
+(See `OrderedEnum`_ for an example.)
+
+
+Pickling
+--------
+
+Enumerations can be pickled and unpickled::
+
+    >>> from enum.test_enum import Fruit
+    >>> from pickle import dumps, loads
+    >>> Fruit.tomato is loads(dumps(Fruit.tomato, 2))
+    True
+
+The usual restrictions for pickling apply: picklable enums must be defined in
+the top level of a module, since unpickling requires them to be importable
+from that module.
+
+Note:
+
+    With pickle protocol version 4 (introduced in Python 3.4) it is possible
+    to easily pickle enums nested in other classes.
+
+
+
+Functional API
+--------------
+
+The ``Enum`` class is callable, providing the following functional API::
+
+    >>> Animal = Enum('Animal', 'ant bee cat dog')
+    >>> Animal
+    <enum 'Animal'>
+    >>> Animal.ant
+    <Animal.ant: 1>
+    >>> Animal.ant.value
+    1
+    >>> list(Animal)
+    [<Animal.ant: 1>, <Animal.bee: 2>, <Animal.cat: 3>, <Animal.dog: 4>]
+
+The semantics of this API resemble ``namedtuple``. The first argument
+of the call to ``Enum`` is the name of the enumeration. 
+
+The second argument is the *source* of enumeration member names.  It can be a
+whitespace-separated string of names, a sequence of names, a sequence of
+2-tuples with key/value pairs, or a mapping (e.g. dictionary) of names to
+values.  The last two options enable assigning arbitrary values to
+enumerations; the others auto-assign increasing integers starting with 1.  A
+new class derived from ``Enum`` is returned.  In other words, the above
+assignment to ``Animal`` is equivalent to::
+
+    >>> class Animals(Enum):
+    ...   ant = 1
+    ...   bee = 2
+    ...   cat = 3
+    ...   dog = 4
+
+Pickling enums created with the functional API can be tricky as frame stack
+implementation details are used to try and figure out which module the
+enumeration is being created in (e.g. it will fail if you use a utility
+function in a separate module, and also may not work on IronPython or Jython).
+The solution is to specify the module name explicitly as follows::
+
+    >>> Animals = Enum('Animals', 'ant bee cat dog', module=__name__)
+
+Derived Enumerations
+--------------------
+
+IntEnum
+^^^^^^^
+
+A variation of ``Enum`` is provided which is also a subclass of
+``int``.  Members of an ``IntEnum`` can be compared to integers;
+by extension, integer enumerations of different types can also be compared
+to each other::
+
+    >>> from enum import IntEnum
+    >>> class Shape(IntEnum):
+    ...   circle = 1
+    ...   square = 2
+    ...
+    >>> class Request(IntEnum):
+    ...   post = 1
+    ...   get = 2
+    ...
+    >>> Shape == 1
+    False
+    >>> Shape.circle == 1
+    True
+    >>> Shape.circle == Request.post
+    True
+
+However, they still can't be compared to standard ``Enum`` enumerations::
+
+    >>> class Shape(IntEnum):
+    ...   circle = 1
+    ...   square = 2
+    ...
+    >>> class Color(Enum):
+    ...   red = 1
+    ...   green = 2
+    ...
+    >>> Shape.circle == Color.red
+    False
+
+``IntEnum`` values behave like integers in other ways you'd expect::
+
+    >>> int(Shape.circle)
+    1
+    >>> ['a', 'b', 'c'][Shape.circle]
+    'b'
+    >>> [i for i in range(Shape.square)]
+    [0, 1]
+
+For the vast majority of code, ``Enum`` is strongly recommended,
+since ``IntEnum`` breaks some semantic promises of an enumeration (by
+being comparable to integers, and thus by transitivity to other
+unrelated enumerations).  It should be used only in special cases where
+there's no other choice; for example, when integer constants are
+replaced with enumerations and backwards compatibility is required with code
+that still expects integers.
+
+
+Others
+^^^^^^
+
+While ``IntEnum`` is part of the ``enum`` module, it would be very
+simple to implement independently::
+
+    class IntEnum(int, Enum):
+        pass
+
+This demonstrates how similar derived enumerations can be defined; for example
+a ``StrEnum`` that mixes in ``str`` instead of ``int``.
+
+Some rules:
+
+1. When subclassing ``Enum``, mix-in types must appear before
+   ``Enum`` itself in the sequence of bases, as in the ``IntEnum``
+   example above.
+2. While ``Enum`` can have members of any type, once you mix in an
+   additional type, all the members must have values of that type, e.g.
+   ``int`` above.  This restriction does not apply to mix-ins which only
+   add methods and don't specify another data type such as ``int`` or
+   ``str``.
+3. When another data type is mixed in, the ``value`` attribute is *not the
+   same* as the enum member itself, although it is equivalent and will compare
+   equal.
+4. %-style formatting:  ``%s`` and ``%r`` call ``Enum``'s ``__str__`` and
+   ``__repr__`` respectively; other codes (such as ``%i`` or ``%h`` for
+   IntEnum) treat the enum member as its mixed-in type.
+
+   Note: Prior to Python 3.4 there is a bug in ``str``'s %-formatting: ``int``
+   subclasses are printed as strings and not numbers when the ``%d``, ``%i``,
+   or ``%u`` codes are used.
+5. ``str.__format__`` (or ``format``) will use the mixed-in
+   type's ``__format__``.  If the ``Enum``'s ``str`` or
+   ``repr`` is desired use the ``!s`` or ``!r`` ``str`` format codes.
+
+
+Decorators
+----------
+
+unique
+^^^^^^
+
+A ``class`` decorator specifically for enumerations.  It searches an
+enumeration's ``__members__`` gathering any aliases it finds; if any are
+found ``ValueError`` is raised with the details::
+
+    >>> @unique
+    ... class NoDupes(Enum):
+    ...    first = 'one'
+    ...    second = 'two'
+    ...    third = 'two'
+    Traceback (most recent call last):
+    ...
+    ValueError: duplicate names found in <enum 'NoDupes'>: third -> second
+
+
+Interesting examples
+--------------------
+
+While ``Enum`` and ``IntEnum`` are expected to cover the majority of
+use-cases, they cannot cover them all.  Here are recipes for some different
+types of enumerations that can be used directly, or as examples for creating
+one's own.
+
+
+AutoNumber
+^^^^^^^^^^
+
+Avoids having to specify the value for each enumeration member::
+
+    >>> class AutoNumber(Enum):
+    ...     def __new__(cls):
+    ...         value = len(cls.__members__) + 1
+    ...         obj = object.__new__(cls)
+    ...         obj._value_ = value
+    ...         return obj
+    ...
+    >>> class Color(AutoNumber):
+    ...     __order__ = "red green blue"  # only needed in 2.x
+    ...     red = ()
+    ...     green = ()
+    ...     blue = ()
+    ...
+    >>> Color.green.value == 2
+    True
+
+Note:
+
+    The `__new__` method, if defined, is used during creation of the Enum
+    members; it is then replaced by Enum's `__new__` which is used after
+    class creation for lookup of existing members.  Due to the way Enums are
+    supposed to behave, there is no way to customize Enum's `__new__`.
+
+
+UniqueEnum
+^^^^^^^^^^
+
+Raises an error if a duplicate member value is found instead of creating an
+alias::
+
+    >>> class UniqueEnum(Enum):
+    ...     def __init__(self, *args):
+    ...         cls = self.__class__
+    ...         if any(self.value == e.value for e in cls):
+    ...             a = self.name
+    ...             e = cls(self.value).name
+    ...             raise ValueError(
+    ...                     "aliases not allowed in UniqueEnum:  %r --> %r"
+    ...                     % (a, e))
+    ... 
+    >>> class Color(UniqueEnum):
+    ...     red = 1
+    ...     green = 2
+    ...     blue = 3
+    ...     grene = 2
+    Traceback (most recent call last):
+    ...
+    ValueError: aliases not allowed in UniqueEnum:  'grene' --> 'green'
+    
+
+OrderedEnum
+^^^^^^^^^^^
+
+An ordered enumeration that is not based on ``IntEnum`` and so maintains
+the normal ``Enum`` invariants (such as not being comparable to other
+enumerations)::
+
+    >>> class OrderedEnum(Enum):
+    ...     def __ge__(self, other):
+    ...         if self.__class__ is other.__class__:
+    ...             return self._value_ >= other._value_
+    ...         return NotImplemented
+    ...     def __gt__(self, other):
+    ...         if self.__class__ is other.__class__:
+    ...             return self._value_ > other._value_
+    ...         return NotImplemented
+    ...     def __le__(self, other):
+    ...         if self.__class__ is other.__class__:
+    ...             return self._value_ <= other._value_
+    ...         return NotImplemented
+    ...     def __lt__(self, other):
+    ...         if self.__class__ is other.__class__:
+    ...             return self._value_ < other._value_
+    ...         return NotImplemented
+    ...
+    >>> class Grade(OrderedEnum):
+    ...     __order__ = 'A B C D F'  # only needed in 2.x
+    ...     A = 5
+    ...     B = 4
+    ...     C = 3
+    ...     D = 2
+    ...     F = 1
+    ...
+    >>> Grade.C < Grade.A
+    True
+
+
+Planet
+^^^^^^
+
+If ``__new__`` or ``__init__`` is defined the value of the enum member
+will be passed to those methods::
+
+    >>> class Planet(Enum):
+    ...     MERCURY = (3.303e+23, 2.4397e6)
+    ...     VENUS   = (4.869e+24, 6.0518e6)
+    ...     EARTH   = (5.976e+24, 6.37814e6)
+    ...     MARS    = (6.421e+23, 3.3972e6)
+    ...     JUPITER = (1.9e+27,   7.1492e7)
+    ...     SATURN  = (5.688e+26, 6.0268e7)
+    ...     URANUS  = (8.686e+25, 2.5559e7)
+    ...     NEPTUNE = (1.024e+26, 2.4746e7)
+    ...     def __init__(self, mass, radius):
+    ...         self.mass = mass       # in kilograms
+    ...         self.radius = radius   # in meters
+    ...     @property
+    ...     def surface_gravity(self):
+    ...         # universal gravitational constant  (m3 kg-1 s-2)
+    ...         G = 6.67300E-11
+    ...         return G * self.mass / (self.radius * self.radius)
+    ... 
+    >>> Planet.EARTH.value
+    (5.976e+24, 6378140.0)
+    >>> Planet.EARTH.surface_gravity
+    9.802652743337129
+
+
+How are Enums different?
+------------------------
+
+Enums have a custom metaclass that affects many aspects of both derived Enum
+classes and their instances (members).
+
+
+Enum Classes
+^^^^^^^^^^^^
+
+The ``EnumMeta`` metaclass is responsible for providing the
+``__contains__``, ``__dir__``, ``__iter__`` and other methods that
+allow one to do things with an ``Enum`` class that fail on a typical
+class, such as ``list(Color)`` or ``some_var in Color``.  ``EnumMeta`` is
+responsible for ensuring that various other methods on the final ``Enum``
+class are correct (such as ``__new__``, ``__getnewargs__``,
+``__str__`` and ``__repr__``).
+
+.. note::
+
+    ``__dir__`` is not changed in the Python 2 line as it messes up some
+    of the decorators included in the stdlib.
+
+
+Enum Members (aka instances)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The most interesting thing about Enum members is that they are singletons.
+``EnumMeta`` creates them all while it is creating the ``Enum``
+class itself, and then puts a custom ``__new__`` in place to ensure
+that no new ones are ever instantiated by returning only the existing
+member instances.
+
+
+Finer Points
+^^^^^^^^^^^^
+
+``Enum`` members are instances of an ``Enum`` class, and even though they
+are accessible as `EnumClass.member1.member2`, they should not be
+accessed directly from the member as that lookup may fail or, worse,
+return something besides the ``Enum`` member you were looking for
+(changed in version 1.1.1)::
+
+    >>> class FieldTypes(Enum):
+    ...     name = 1
+    ...     value = 2
+    ...     size = 3
+    ...
+    >>> FieldTypes.value.size
+    <FieldTypes.size: 3>
+    >>> FieldTypes.size.value
+    3
+
+The ``__members__`` attribute is only available on the class.
+
+In Python 3.x ``__members__`` is always an ``OrderedDict``, with the order being
+the definition order.  In Python 2.7 ``__members__`` is an ``OrderedDict`` if
+``__order__`` was specified, and a plain ``dict`` otherwise.  In all other Python
+2.x versions ``__members__`` is a plain ``dict`` even if ``__order__`` was specified
+as the ``OrderedDict`` type didn't exist yet.
+
+If you give your ``Enum`` subclass extra methods, like the `Planet`_
+class above, those methods will show up in a `dir` of the member,
+but not of the class::
+
+    >>> dir(Planet)
+    ['EARTH', 'JUPITER', 'MARS', 'MERCURY', 'NEPTUNE', 'SATURN', 'URANUS',
+    'VENUS', '__class__', '__doc__', '__members__', '__module__']
+    >>> dir(Planet.EARTH)
+    ['__class__', '__doc__', '__module__', 'name', 'surface_gravity', 'value']
+
+A ``__new__`` method will only be used for the creation of the
+``Enum`` members -- after that it is replaced.  This means if you wish to
+change how ``Enum`` members are looked up you either have to write a
+helper function or a ``classmethod``.
@@ -0,0 +1,312 @@
+import codecs
+import logging
+import os
+import pickle
+import shutil
+import tempfile
+import traceback
+
+import appdirs
+
+from scandir import scandir
+
+try:
+    from collections.abc import MutableMapping
+    unicode = str
+except ImportError:
+    # Python 2 imports
+    from collections import MutableMapping
+    FileNotFoundError = IOError
+
+from .posixemulation import rename
+
+logger = logging.getLogger(__name__)
+
+
+class FileCache(MutableMapping):
+    """A persistent file cache that is dictionary-like and has a write buffer.
+
+    *appname* is passed to `appdirs <https://pypi.python.org/pypi/appdirs/>`_
+    to determine a system-appropriate location for the cache files. The cache
+    directory used is available via :data:`cache_dir`.
+
+    By default, a write buffer is used, so writing to cache files is not done
+    until :meth:`sync` is explicitly called. This behavior can be changed using
+    the optional *flag* argument.
+
+    .. NOTE::
+        Keys and values are always stored as :class:`bytes` objects. If data
+        serialization is enabled, keys are returned as :class:`str` or
+        :class:`unicode` objects.
+        If data serialization is disabled, keys are returned as a
+        :class:`bytes` object.
+
+    :param str appname: The app/script the cache should be associated with.
+    :param str flag: How the cache should be opened. See below for details.
+    :param mode: The Unix mode for the cache files.
+    :param str keyencoding: The encoding the keys use, defaults to 'utf-8'.
+        This is used if *serialize* is ``False``; the keys are treated as
+        :class:`bytes` objects.
+    :param bool serialize: Whether or not to (de)serialize the values. If a
+        cache is used with a :class:`~shelve.Shelf`, set this to ``False``.
+    :param str app_cache_dir: absolute path to root cache directory to be
+        used in place of system-appropriate location determined by appdirs
+
+    The optional *flag* argument can be:
+
+    +---------+-------------------------------------------+
+    | Value   | Meaning                                   |
+    +=========+===========================================+
+    | ``'r'`` | Open existing cache for reading only      |
+    +---------+-------------------------------------------+
+    | ``'w'`` | Open existing cache for reading and       |
+    |         | writing                                   |
+    +---------+-------------------------------------------+
+    | ``'c'`` | Open cache for reading and writing,       |
+    |         | creating it if it doesn't exist (default) |
+    +---------+-------------------------------------------+
+    | ``'n'`` | Always create a new, empty cache, open    |
+    |         | for reading and writing                   |
+    +---------+-------------------------------------------+
+
+    If a ``'s'`` is appended to the *flag* argument, the cache will be opened
+    in sync mode. Writing to the cache will happen immediately and will not be
+    buffered.
+
+    If an application needs to use more than one cache, then it should use
+    subcaches. To create a subcache, append a series of one or more names
+    separated by periods to the application name when creating a
+    :class:`FileCache` object (e.g. ``'appname.subcache'`` or
+    ``'appname.subcache.subcache'``).
+    Subcaches are a way for an application to use more than one cache without
+    polluting a user's cache directory. All caches -- main caches or subcaches
+    -- are totally independent. The only aspect in which they are linked is
+    that all of an application's caches exist in the same system directory.
+    Because each cache is independent of every other cache, calling
+    :meth:`delete` on an application's main cache will not delete data in
+    its subcaches.
+
+    """
+
+    def __init__(self, appname, flag='c', mode=0o666, keyencoding='utf-8',
+                 serialize=True, app_cache_dir=None):
+        """Initialize a :class:`FileCache` object."""
+        if not isinstance(flag, str):
+            raise TypeError("flag must be str not '{}'".format(type(flag)))
+        elif flag[0] not in 'rwcn':
+            raise ValueError("invalid flag: '{}', first flag must be one of "
+                             "'r', 'w', 'c' or 'n'".format(flag))
+        elif len(flag) > 1 and flag[1] != 's':
+            raise ValueError("invalid flag: '{}', second flag must be "
+                             "'s'".format(flag))
+
+        appname, subcache = self._parse_appname(appname)
+        if 'cache' in subcache:
+            raise ValueError("invalid subcache name: 'cache'.")
+        self._is_subcache = bool(subcache)
+
+        if not app_cache_dir:
+            app_cache_dir = appdirs.user_cache_dir(appname, appname)
+        subcache_dir = os.path.join(app_cache_dir, *subcache)
+        self.cache_dir = os.path.join(subcache_dir, 'cache')
+        exists = os.path.exists(self.cache_dir)
+
+        if len(flag) > 1 and flag[1] == 's':
+            self._sync = True
+        else:
+            self._sync = False
+            self._buffer = {}
+
+        if exists and 'n' in flag:
+            self.clear()
+            self.create()
+        elif not exists and ('c' in flag or 'n' in flag):
+            self.create()
+        elif not exists:
+            raise FileNotFoundError("no such directory: '{}'".format(
+                self.cache_dir))
+
+        self._flag = 'rb' if 'r' in flag else 'wb'
+        self._mode = mode
+        self._keyencoding = keyencoding
+        self._serialize = serialize
+
+    def _parse_appname(self, appname):
+        """Splits an appname into the appname and subcache components."""
+        components = appname.split('.')
+        return components[0], components[1:]
+
+    def create(self):
+        """Create the write buffer and cache directory."""
+        if not self._sync and not hasattr(self, '_buffer'):
+            self._buffer = {}
+        if not os.path.exists(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+    def clear(self):
+        """Remove all items from the write buffer and cache.
+
+        The write buffer object and cache directory are not deleted.
+
+        """
+        self.delete()
+        self.create()
+
+    def delete(self):
+        """Delete the write buffer and cache directory."""
+        if not self._sync:
+            del self._buffer
+        shutil.rmtree(self.cache_dir)
+
+    def close(self):
+        """Sync the write buffer, then close the cache.
+
+        If a closed :class:`FileCache` object's methods are called, a
+        :exc:`ValueError` will be raised.
+
+        """
+        self.sync()
+        self.sync = self.create = self.delete = self._closed
+        self._write_to_file = self._read_to_file = self._closed
+        self._key_to_filename = self._filename_to_key = self._closed
+        self.__getitem__ = self.__setitem__ = self.__delitem__ = self._closed
+        self.__iter__ = self.__len__ = self.__contains__ = self._closed
+
+    def sync(self):
+        """Sync the write buffer with the cache files and clear the buffer.
+
+        If the :class:`FileCache` object was opened with the optional ``'s'``
+        *flag* argument, then calling :meth:`sync` will do nothing.
+        """
+        if self._sync:
+            return  # opened in sync mode, so skip the manual sync
+        self._sync = True
+        for ekey in self._buffer:
+            filename = self._key_to_filename(ekey)
+            try:
+                self._write_to_file(filename, self._buffer[ekey])
+            except:
+                logger.error("Couldn't write content from %r to cache file: %r: %s", ekey, filename,
+                             traceback.format_exc())
+        self._buffer.clear()
+        self._sync = False
+
+    def _closed(self, *args, **kwargs):
+        """Filler method for closed cache methods."""
+        raise ValueError("invalid operation on closed cache")
+
+    def _encode_key(self, key):
+        """Encode key using *hex_codec* for constructing a cache filename.
+
+        Keys are implicitly converted to :class:`bytes` if passed as
+        :class:`str`.
+
+        """
+        if isinstance(key, str) or isinstance(key, unicode):
+            key = key.encode(self._keyencoding)
+        elif not isinstance(key, bytes):
+            raise TypeError("key must be bytes or str")
+        return codecs.encode(key, 'hex_codec').decode(self._keyencoding)
+
+    def _decode_key(self, key):
+        """Decode key using hex_codec to retrieve the original key.
+
+        Keys are returned as :class:`str` if serialization is enabled.
+        Keys are returned as :class:`bytes` if serialization is disabled.
+
+        """
+        bkey = codecs.decode(key.encode(self._keyencoding), 'hex_codec')
+        return bkey.decode(self._keyencoding) if self._serialize else bkey
+
+    def _dumps(self, value):
+        return value if not self._serialize else pickle.dumps(value)
+
+    def _loads(self, value):
+        return value if not self._serialize else pickle.loads(value)
+
+    def _key_to_filename(self, key):
+        """Convert an encoded key to an absolute cache filename."""
+        return os.path.join(self.cache_dir, key)
+
+    def _filename_to_key(self, absfilename):
+        """Convert an absolute cache filename to a key name."""
+        return os.path.split(absfilename)[1]
+
+    def _all_filenames(self):
+        """Return a list of absolute cache filenames"""
+        try:
+            for entry in scandir(self.cache_dir):
+                if entry.is_file(follow_symlinks=False):
+                    yield os.path.join(self.cache_dir, entry.name)
+        except (FileNotFoundError, OSError):
+            raise StopIteration
+
+    def _all_keys(self):
+        """Return a list of all encoded key names."""
+        file_keys = [self._filename_to_key(fn) for fn in self._all_filenames()]
+        if self._sync:
+            return set(file_keys)
+        else:
+            return set(file_keys + list(self._buffer))
+
+    def _write_to_file(self, filename, bytesvalue):
+        """Write bytesvalue to filename."""
+        fh, tmp = tempfile.mkstemp()
+        with os.fdopen(fh, self._flag) as f:
+            f.write(self._dumps(bytesvalue))
+        rename(tmp, filename)
+        os.chmod(filename, self._mode)
+
+    def _read_from_file(self, filename):
+        """Read data from filename."""
+        try:
+            with open(filename, 'rb') as f:
+                return self._loads(f.read())
+        except (IOError, OSError):
+            logger.warning('Error opening file: {}'.format(filename))
+            return None
+
+    def __setitem__(self, key, value):
+        ekey = self._encode_key(key)
+        if not self._sync:
+            self._buffer[ekey] = value
+        else:
+            filename = self._key_to_filename(ekey)
+            self._write_to_file(filename, value)
+
+    def __getitem__(self, key):
+        ekey = self._encode_key(key)
+        if not self._sync:
+            try:
+                return self._buffer[ekey]
+            except KeyError:
+                pass
+        filename = self._key_to_filename(ekey)
+        if filename not in self._all_filenames():
+            raise KeyError(key)
+        return self._read_from_file(filename)
+
+    def __delitem__(self, key):
+        ekey = self._encode_key(key)
+        filename = self._key_to_filename(ekey)
+        if not self._sync:
+            try:
+                del self._buffer[ekey]
+            except KeyError:
+                if filename not in self._all_filenames():
+                    raise KeyError(key)
+        try:
+            os.remove(filename)
+        except (IOError, OSError):
+            pass
+
+    def __iter__(self):
+        for key in self._all_keys():
+            yield self._decode_key(key)
+
+    def __len__(self):
+        return len(self._all_keys())
+
+    def __contains__(self, key):
+        ekey = self._encode_key(key)
+        return ekey in self._all_keys()
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+r"""
+    werkzeug.posixemulation
+    ~~~~~~~~~~~~~~~~~~~~~~~
+
+    Provides a POSIX emulation for some features that are relevant to
+    web applications.  The main purpose is to simplify support for
+    systems such as Windows NT that are not 100% POSIX compatible.
+
+    Currently this only implements a :func:`rename` function that
+    follows POSIX semantics.  Eg: if the target file already exists it
+    will be replaced without asking.
+
+    This module was introduced in 0.6.1 and is not a public interface.
+    It might become one in later versions of Werkzeug.
+
+    :copyright: (c) 2013 by the Werkzeug Team, see AUTHORS for more details.
+    :license: BSD, see LICENSE for more details.
+"""
+import sys
+import os
+import errno
+import time
+import random
+import shutil
+
+
+can_rename_open_file = False
+if os.name == 'nt':  # pragma: no cover
+    _rename = lambda src, dst: False
+    _rename_atomic = lambda src, dst: False
+    if sys.version_info >= (3, 0):
+        unicode = str
+
+    try:
+        import ctypes
+
+        _MOVEFILE_REPLACE_EXISTING = 0x1
+        _MOVEFILE_WRITE_THROUGH = 0x8
+        _MoveFileEx = ctypes.windll.kernel32.MoveFileExW
+
+        def _rename(src, dst):
+            if not isinstance(src, unicode):
+                src = unicode(src, sys.getfilesystemencoding())
+            if not isinstance(dst, unicode):
+                dst = unicode(dst, sys.getfilesystemencoding())
+            if _rename_atomic(src, dst):
+                return True
+            retry = 0
+            rv = False
+            while not rv and retry < 100:
+                rv = _MoveFileEx(src, dst, _MOVEFILE_REPLACE_EXISTING |
+                                 _MOVEFILE_WRITE_THROUGH)
+                if not rv:
+                    time.sleep(0.001)
+                    retry += 1
+            return rv
+
+        # new in Vista and Windows Server 2008
+        _CreateTransaction = ctypes.windll.ktmw32.CreateTransaction
+        _CommitTransaction = ctypes.windll.ktmw32.CommitTransaction
+        _MoveFileTransacted = ctypes.windll.kernel32.MoveFileTransactedW
+        _CloseHandle = ctypes.windll.kernel32.CloseHandle
+        can_rename_open_file = True
+
+        def _rename_atomic(src, dst):
+            ta = _CreateTransaction(None, 0, 0, 0, 0, 1000, 'Werkzeug rename')
+            if ta == -1:
+                return False
+            try:
+                retry = 0
+                rv = False
+                while not rv and retry < 100:
+                    rv = _MoveFileTransacted(src, dst, None, None,
+                                             _MOVEFILE_REPLACE_EXISTING |
+                                             _MOVEFILE_WRITE_THROUGH, ta)
+                    if rv:
+                        rv = _CommitTransaction(ta)
+                        break
+                    else:
+                        time.sleep(0.001)
+                        retry += 1
+                return rv
+            finally:
+                _CloseHandle(ta)
+    except Exception:
+        pass
+
+    def rename(src, dst):
+        # Try atomic or pseudo-atomic rename
+        if _rename(src, dst):
+            return
+        # Fall back to "move away and replace"
+        try:
+            os.rename(src, dst)
+        except OSError as e:
+            if e.errno != errno.EEXIST:
+                raise
+            old = "%s-%08x" % (dst, random.randint(0, sys.maxint))
+            os.rename(dst, old)
+            os.rename(src, dst)
+            try:
+                os.unlink(old)
+            except Exception:
+                pass
+else:
+    """
+    If dst on current filesystem then use
+    atomic rename. Otherwise, fall back to a
+    non-atomic copy and remove.
+    """
+    rename = shutil.move
+    can_rename_open_file = True
@@ -4,7 +4,6 @@ This gives other modules access to the gritty details about characters and the
 encodings that use them.
 """

-from __future__ import unicode_literals
 import re
 import zlib
 import unicodedata
@@ -15,13 +14,13 @@ from ftfy.compatibility import unichr
 # These are the encodings we will try to fix in ftfy, in the
 # order that they should be tried.
 CHARMAP_ENCODINGS = [
-    'latin-1',
-    'sloppy-windows-1252',
-    'sloppy-windows-1250',
-    'iso-8859-2',
-    'sloppy-windows-1251',
-    'macroman',
-    'cp437',
+    u'latin-1',
+    u'sloppy-windows-1252',
+    u'sloppy-windows-1250',
+    u'iso-8859-2',
+    u'sloppy-windows-1251',
+    u'macroman',
+    u'cp437',
 ]


@@ -29,25 +28,25 @@ def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
-    the 'ascii' detector, which of course just determines if all characters
+    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
-    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
+    encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
-        latin1table = ''.join(unichr(i) for i in range(128, 256)) + '\x1a'
-        charlist = latin1table.encode('latin-1').decode(encoding)
+        latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a'
+        charlist = latin1table.encode(u'latin-1').decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
-        regex = '^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
+        regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
 ENCODING_REGEXES = _build_regexes()
@@ -57,10 +56,10 @@ def _build_utf8_punct_regex():
    """
    Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
    rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
-    the 'General Punctuation' characters U+2000 to U+2040, re-encoded in
+    the 'General Punctuation' characters U+2000 to U+2040, re-encoded in
    Windows-1252.

-    These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
+    These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
    all begin with when decoded as Windows-1252.
    """
    # We're making a regex that has all the literal bytes from 0x80 to 0xbf in
@@ -72,10 +71,10 @@ def _build_utf8_punct_regex():
    # prettier when we deprecate Python 2.
    continuation_char_list = ''.join(
        unichr(i) for i in range(0x80, 0xc0)
-    ).encode('latin-1')
-    obvious_utf8 = ('â€['
-                    + continuation_char_list.decode('sloppy-windows-1252')
-                    + ']')
+    ).encode(u'latin-1')
+    obvious_utf8 = (u'â€['
+                    + continuation_char_list.decode(u'sloppy-windows-1252')
+                    + u']')
    return re.compile(obvious_utf8)
 PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()

@@ -126,8 +125,8 @@ LOSSY_UTF8_RE = re.compile(
 )

 # These regexes match various Unicode variations on single and double quotes.
-SINGLE_QUOTE_RE = re.compile('[\u2018-\u201b]')
-DOUBLE_QUOTE_RE = re.compile('[\u201c-\u201f]')
+SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]')
+DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]')


 def possible_encoding(text, encoding):
@@ -143,7 +142,7 @@ def possible_encoding(text, encoding):

 CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
-).decode('ascii')
+).decode(u'ascii')

 def chars_to_classes(string):
    """
@@ -185,15 +184,15 @@ CONTROL_CHARS = _build_control_char_mapping()
 # Ligatures may also be separated by NFKC normalization, but that is sometimes
 # more normalization than you want.
 LIGATURES = {
-    ord('Ĳ'): 'IJ',
-    ord('ĳ'): 'ij',
-    ord('ﬀ'): 'ff',
-    ord('ﬁ'): 'fi',
-    ord('ﬂ'): 'fl',
-    ord('ﬃ'): 'ffi',
-    ord('ﬄ'): 'ffl',
-    ord('ﬅ'): 'ſt',
-    ord('ﬆ'): 'st'
+    ord(u'Ĳ'): u'IJ',
+    ord(u'ĳ'): u'ij',
+    ord(u'ﬀ'): u'ff',
+    ord(u'ﬁ'): u'fi',
+    ord(u'ﬂ'): u'fl',
+    ord(u'ﬃ'): u'ffi',
+    ord(u'ﬄ'): u'ffl',
+    ord(u'ﬅ'): u'ſt',
+    ord(u'ﬆ'): u'st'
 }


@@ -205,10 +204,10 @@ def _build_width_map():
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
-    width_map = {0x3000: ' '}
+    width_map = {0x3000: u' '}
    for i in range(0xff01, 0xfff0):
        char = unichr(i)
-        alternate = unicodedata.normalize('NFKC', char)
+        alternate = unicodedata.normalize(u'NFKC', char)
        if alternate != char:
            width_map[i] = alternate
    return width_map
@@ -4,4 +4,4 @@
 Version module
 """
 # pragma: no cover
-__version__ = '3.0.0.dev0'
+__version__ = '2.1.4'
@@ -133,7 +133,6 @@ class ValidateHasNeighbor(Rule):
    Validate tag has-neighbor
    """
    consequence = RemoveMatch
-    priority = 64

    def when(self, matches, context):
        ret = []
@@ -159,7 +158,6 @@ class ValidateHasNeighborBefore(Rule):
    Validate tag has-neighbor-before that previous match exists.
    """
    consequence = RemoveMatch
-    priority = 64

    def when(self, matches, context):
        ret = []
@@ -179,7 +177,6 @@ class ValidateHasNeighborAfter(Rule):
    Validate tag has-neighbor-after that next match exists.
    """
    consequence = RemoveMatch
-    priority = 64

    def when(self, matches, context):
        ret = []
@@ -3895,6 +3895,7 @@
  season: 7
  episode: 22
  episode_title: 2000 Light Years from Home
+  other: Classic
  container: mkv
  mimetype: video/x-matroska
  type: episode
@@ -3962,15 +3963,3 @@
  subtitle_language: fr
  other: FullHD
  type: episode
-
-? Whose Line is it anyway/Season 01/Whose.Line.is.it.Anyway.US.S13E01.720p.WEB.x264-TBS.mkv
-: title: Whose Line is it Anyway
-  season: 13
-  episode: 1
-  country: US
-  screen_size: 720p
-  format: WEB-DL
-  video_codec: h264
-  release_group: TBS
-  container: mkv
-  type: episode
@@ -1,163 +0,0 @@
-Change Log
----------
-
-0.999
-~~~~~
-
-Released on December 23, 2013
-
-* Fix #127: add work-around for CPython issue #20007: .read(0) on
-  http.client.HTTPResponse drops the rest of the content.
-
-* Fix #115: lxml treewalker can now deal with fragments containing, at
-  their root level, text nodes with non-ASCII characters on Python 2.
-
-
-0.99
-~~~~
-
-Released on September 10, 2013
-
-* No library changes from 1.0b3; released as 0.99 as pip has changed
-  behaviour from 1.4 to avoid installing pre-release versions per
-  PEP 440.
-
-
-1.0b3
-~~~~~
-
-Released on July 24, 2013
-
-* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
-  implementation using it should be moved to
-  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
-  for years.
-
-* Fix #67 so that ``BufferedStream`` to correctly returns a bytes
-  object, thereby fixing any case where html5lib is passed a
-  non-seekable RawIOBase-like object.
-
-
-1.0b2
-~~~~~
-
-Released on June 27, 2013
-
-* Removed reordering of attributes within the serializer. There is now
-  an ``alphabetical_attributes`` option which preserves the previous
-  behaviour through a new filter. This allows attribute order to be
-  preserved through html5lib if the tree builder preserves order.
-
-* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
-  ``treeadapters.sax.to_sax`` which is generic and supports any
-  treewalker; it also resolves all known bugs with ``dom2sax``.
-
-* Fix treewalker assertions on hitting bytes strings on
-  Python 2. Previous to 1.0b1, treewalkers coped with mixed
-  bytes/unicode data on Python 2; this reintroduces this prior
-  behaviour on Python 2. Behaviour is unchanged on Python 3.
-
-
-1.0b1
-~~~~~
-
-Released on May 17, 2013
-
-* Implementation updated to implement the `HTML specification
-  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
-  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
-
-* Python 3.2+ supported in a single codebase using the ``six`` library.
-
-* Removed support for Python 2.5 and older.
-
-* Removed the deprecated Beautiful Soup 3 treebuilder.
-  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
-  since it doesn't support namespaces, foreign content like SVG and
-  MathML is parsed incorrectly.
-
-* Removed ``simpletree`` from the package. The default tree builder is
-  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
-  available, and ``xml.etree.ElementTree`` otherwise).
-
-* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
-  output was well-formed XML, and hence provided little of use.
-
-* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
-  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
-  return the default DOM treebuilder, which uses ``xml.dom.minidom``.
-
-* Optional heuristic character encoding detection now based on
-  ``charade`` for Python 2.6 - 3.3 compatibility.
-
-* Optional ``Genshi`` treewalker support fixed.
-
-* Many bugfixes, including:
-
-  * #33: null in attribute value breaks XML AttValue;
-
-  * #4: nested, indirect descendant, <button> causes infinite loop;
-
-  * `Google Code 215
-    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
-    detect seekable streams;
-
-  * `Google Code 206
-    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
-    support for <video preload=...>, <audio preload=...>;
-
-  * `Google Code 205
-    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
-    support for <video poster=...>;
-
-  * `Google Code 202
-    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
-    file breaks InputStream.
-
-* Source code is now mostly PEP 8 compliant.
-
-* Test harness has been improved and now depends on ``nose``.
-
-* Documentation updated and moved to http://html5lib.readthedocs.org/.
-
-
-0.95
-~~~~
-
-Released on February 11, 2012
-
-
-0.90
-~~~~
-
-Released on January 17, 2010
-
-
-0.11.1
-~~~~~~
-
-Released on June 12, 2008
-
-
-0.11
-~~~~
-
-Released on June 10, 2008
-
-
-0.10
-~~~~
-
-Released on October 7, 2007
-
-
-0.9
-~~~
-
-Released on March 11, 2007
-
-
-0.2
-~~~
-
-Released on January 8, 2007
@@ -1,157 +0,0 @@
-html5lib
-========
-
-.. image:: https://travis-ci.org/html5lib/html5lib-python.png?branch=master
-  :target: https://travis-ci.org/html5lib/html5lib-python
-
-html5lib is a pure-python library for parsing HTML. It is designed to
-conform to the WHATWG HTML specification, as is implemented by all major
-web browsers.
-
-
-Usage
------
-
-Simple usage follows this pattern:
-
-.. code-block:: python
-
-  import html5lib
-  with open("mydocument.html", "rb") as f:
-      document = html5lib.parse(f)
-
-or:
-
-.. code-block:: python
-
-  import html5lib
-  document = html5lib.parse("<p>Hello World!")
-
-By default, the ``document`` will be an ``xml.etree`` element instance.
-Whenever possible, html5lib chooses the accelerated ``ElementTree``
-implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
-
-Two other tree types are supported: ``xml.dom.minidom`` and
-``lxml.etree``. To use an alternative format, specify the name of
-a treebuilder:
-
-.. code-block:: python
-
-  import html5lib
-  with open("mydocument.html", "rb") as f:
-      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
-
-When using with ``urllib2`` (Python 2), the charset from HTTP should be
-passed into html5lib as follows:
-
-.. code-block:: python
-
-  from contextlib import closing
-  from urllib2 import urlopen
-  import html5lib
-
-  with closing(urlopen("http://example.com/")) as f:
-      document = html5lib.parse(f, encoding=f.info().getparam("charset"))
-
-When using with ``urllib.request`` (Python 3), the charset from HTTP
-should be passed into html5lib as follows:
-
-.. code-block:: python
-
-  from urllib.request import urlopen
-  import html5lib
-
-  with urlopen("http://example.com/") as f:
-      document = html5lib.parse(f, encoding=f.info().get_content_charset())
-
-To have more control over the parser, create a parser object explicitly.
-For instance, to make the parser raise exceptions on parse errors, use:
-
-.. code-block:: python
-
-  import html5lib
-  with open("mydocument.html", "rb") as f:
-      parser = html5lib.HTMLParser(strict=True)
-      document = parser.parse(f)
-
-When you're instantiating parser objects explicitly, pass a treebuilder
-class as the ``tree`` keyword argument to use an alternative document
-format:
-
-.. code-block:: python
-
-  import html5lib
-  parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
-  minidom_document = parser.parse("<p>Hello World!")
-
-More documentation is available at http://html5lib.readthedocs.org/.
-
-
-Installation
-------------
-
-html5lib works on CPython 2.6+, CPython 3.2+ and PyPy.  To install it,
-use:
-
-.. code-block:: bash
-
-    $ pip install html5lib
-
-
-Optional Dependencies
----------------------
-
-The following third-party libraries may be used for additional
-functionality:
-
-- ``datrie`` can be used to improve parsing performance (though in
-  almost all cases the improvement is marginal);
-
-- ``lxml`` is supported as a tree format (for both building and
-  walking) under CPython (but *not* PyPy where it is known to cause
-  segfaults);
-
-- ``genshi`` has a treewalker (but not builder); and
-
-- ``charade`` can be used as a fallback when character encoding cannot
-  be determined; ``chardet``, from which it was forked, can also be used
-  on Python 2.
-
-- ``ordereddict`` can be used under Python 2.6
-  (``collections.OrderedDict`` is used instead on later versions) to
-  serialize attributes in alphabetical order.
-
-
-Bugs
-----
-
-Please report any bugs on the `issue tracker
-<https://github.com/html5lib/html5lib-python/issues>`_.
-
-
-Tests
------
-
-Unit tests require the ``nose`` library and can be run using the
-``nosetests`` command in the root directory; ``ordereddict`` is
-required under Python 2.6. All should pass.
-
-Test data are contained in a separate `html5lib-tests
-<https://github.com/html5lib/html5lib-tests>`_ repository and included
-as a submodule, thus for git checkouts they must be initialized::
-
-  $ git submodule init
-  $ git submodule update
-
-If you have all compatible Python implementations available on your
-system, you can run tests on all of them using the ``tox`` utility,
-which can be found on PyPI.
-
-
-Questions?
-----------
-
-There's a mailing list available for support on Google Groups,
-`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
-though you may get a quicker response asking on IRC in `#whatwg on
-irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.
@@ -1,14 +1,23 @@
 """
-HTML parsing library based on the WHATWG "HTML5"
-specification. The parser is designed to be compatible with existing
-HTML found in the wild and implements well-defined error recovery that
+HTML parsing library based on the `WHATWG HTML specification
+<https://whatwg.org/html>`_. The parser is designed to be compatible with
+existing HTML found in the wild and implements well-defined error recovery that
 is largely compatible with modern desktop web browsers.

-Example usage:
+Example usage::

-import html5lib
-f = open("my_document.html")
-tree = html5lib.parse(f)
+    import html5lib
+    with open("my_document.html", "rb") as f:
+        tree = html5lib.parse(f)
+
+For convenience, this module re-exports the following names:
+
+* :func:`~.html5parser.parse`
+* :func:`~.html5parser.parseFragment`
+* :class:`~.html5parser.HTMLParser`
+* :func:`~.treebuilders.getTreeBuilder`
+* :func:`~.treewalkers.getTreeWalker`
+* :func:`~.serializer.serialize`
 """

 from __future__ import absolute_import, division, unicode_literals
@@ -20,4 +29,7 @@ from .serializer import serialize

 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]
-__version__ = "0.999"
+
+# this has to be at the top level, see how setup.py parses this
+#: Distribution version number.
+__version__ = "1.0.1"
@@ -175,18 +175,18 @@ def escapeRegexp(string):
    return string

 # output from the above
-nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

-nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

 # Simpler things
-nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
+nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


 class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

-    def __init__(self, replaceChars=None,
+    def __init__(self,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
@@ -217,7 +217,7 @@ class InfosetFilter(object):
        else:
            return self.toXmlName(name)

-    def coerceElement(self, name, namespace=None):
+    def coerceElement(self, name):
        return self.toXmlName(name)

    def coerceComment(self, data):
@@ -225,11 +225,14 @@ class InfosetFilter(object):
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
+            if data.endswith("-"):
+                warnings.warn("Comments cannot end in a dash", DataLossWarning)
+                data += " "
        return data

    def coerceCharacters(self, data):
        if self.replaceFormFeedCharacters:
-            for i in range(data.count("\x0C")):
+            for _ in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
@@ -1,13 +1,16 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-from six.moves import http_client
+
+from six import text_type, binary_type
+from six.moves import http_client, urllib

 import codecs
 import re

+import webencodings
+
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from .constants import encodings, ReparseException
-from . import utils
+from .constants import _ReparseException
+from . import _utils

 from io import StringIO

@@ -16,19 +19,26 @@ try:
 except ImportError:
    BytesIO = StringIO

-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
 # Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa
+
+if _utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # eval. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -38,7 +48,7 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

-ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

 # Cache for charsUntil()
 charsUntilRegEx = {}
@@ -118,10 +128,13 @@ class BufferedStream(object):
        return b"".join(rv)


-def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
-    if isinstance(source, http_client.HTTPResponse):
-        # Work around Python bug #20007: read(0) closes the connection.
-        # http://bugs.python.org/issue20007
+def HTMLInputStream(source, **kwargs):
+    # Work around Python bug #20007: read(0) closes the connection.
+    # http://bugs.python.org/issue20007
+    if (isinstance(source, http_client.HTTPResponse) or
+        # Also check for addinfourl wrapping HTTPResponse
+        (isinstance(source, urllib.response.addbase) and
+         isinstance(source.fp, http_client.HTTPResponse))):
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
@@ -129,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
        isUnicode = isinstance(source, text_type)

    if isUnicode:
-        if encoding is not None:
-            raise TypeError("Cannot explicitly set an encoding with a unicode string")
+        encodings = [x for x in kwargs if x.endswith("_encoding")]
+        if encodings:
+            raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

-        return HTMLUnicodeInputStream(source)
+        return HTMLUnicodeInputStream(source, **kwargs)
    else:
-        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
+        return HTMLBinaryInputStream(source, **kwargs)


 class HTMLUnicodeInputStream(object):
@@ -160,22 +174,21 @@ class HTMLUnicodeInputStream(object):
        regardless of any BOM or later declaration (such as in a meta
        element)

-        parseMeta - Look for a <meta> element containing encoding information
-
        """

-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if not _utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

        # List of where new lines occur
        self.newLines = [0]

-        self.charEncoding = ("utf-8", "certain")
+        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()
@@ -265,12 +278,10 @@ class HTMLUnicodeInputStream(object):
                self._bufferedCharacter = data[-1]
                data = data[:-1]

-        self.reportCharacterErrors(data)
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)

        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
-
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

@@ -280,7 +291,7 @@ class HTMLUnicodeInputStream(object):
        return True

    def characterErrorsUCS4(self, data):
-        for i in range(len(invalid_unicode_re.findall(data))):
+        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
@@ -293,9 +304,9 @@ class HTMLUnicodeInputStream(object):
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
-            if utils.isSurrogatePair(data[pos:pos + 2]):
+            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
-                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
+                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
@@ -378,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

    """

-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+    def __init__(self, source, override_encoding=None, transport_encoding=None,
+                 same_origin_parent_encoding=None, likely_encoding=None,
+                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -391,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
        regardless of any BOM or later declaration (such as in a meta
        element)

-        parseMeta - Look for a <meta> element containing encoding information
-
        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        #              self.charEncoding as appropriate
@@ -400,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

-        self.charEncoding = (codecName(encoding), "certain")
-
        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
-        self.numBytesMeta = 512
+        self.numBytesMeta = 1024
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
-        # Encoding to use if no other information can be found
-        self.defaultEncoding = "windows-1252"
+        # Things from args
+        self.override_encoding = override_encoding
+        self.transport_encoding = transport_encoding
+        self.same_origin_parent_encoding = same_origin_parent_encoding
+        self.likely_encoding = likely_encoding
+        self.default_encoding = default_encoding

-        # Detect encoding iff no explicit "transport level" encoding is supplied
-        if (self.charEncoding[0] is None):
-            self.charEncoding = self.detectEncoding(parseMeta, chardet)
+        # Determine encoding
+        self.charEncoding = self.determineEncoding(useChardet)
+        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
-        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
-                                                                 'replace')
+        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
@@ -437,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

        try:
            stream.seek(stream.tell())
-        except:
+        except:  # pylint:disable=bare-except
            stream = BufferedStream(stream)

        return stream

-    def detectEncoding(self, parseMeta=True, chardet=True):
-        # First look for a BOM
+    def determineEncoding(self, chardet=True):
+        # BOMs take precedence over everything
        # This will also read past the BOM if present
-        encoding = self.detectBOM()
-        confidence = "certain"
-        # If there is no BOM need to look for meta elements with encoding
-        # information
-        if encoding is None and parseMeta:
-            encoding = self.detectEncodingMeta()
-            confidence = "tentative"
-        # Guess with chardet, if avaliable
-        if encoding is None and chardet:
-            confidence = "tentative"
+        charEncoding = self.detectBOM(), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # If we've been overridden, we've been overridden
+        charEncoding = lookupEncoding(self.override_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Now check the transport layer
+        charEncoding = lookupEncoding(self.transport_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Look for meta elements with encoding information
+        charEncoding = self.detectEncodingMeta(), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Parent document encoding
+        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
+        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
+            return charEncoding
+
+        # "likely" encoding
+        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Guess with chardet, if available
+        if chardet:
            try:
-                try:
-                    from charade.universaldetector import UniversalDetector
-                except ImportError:
-                    from chardet.universaldetector import UniversalDetector
+                from chardet.universaldetector import UniversalDetector
+            except ImportError:
+                pass
+            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
@@ -470,37 +503,34 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
-                encoding = detector.result['encoding']
+                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
-            except ImportError:
-                pass
-        # If all else fails use the default encoding
-        if encoding is None:
-            confidence = "tentative"
-            encoding = self.defaultEncoding
+                if encoding is not None:
+                    return encoding, "tentative"

-        # Substitute for equivalent encodings:
-        encodingSub = {"iso-8859-1": "windows-1252"}
+        # Try the default encoding
+        charEncoding = lookupEncoding(self.default_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding

-        if encoding.lower() in encodingSub:
-            encoding = encodingSub[encoding.lower()]
-
-        return encoding, confidence
+        # Fallback to html5lib's default if even that hasn't worked
+        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
-        newEncoding = codecName(newEncoding)
-        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            newEncoding = "utf-8"
+        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
+        if newEncoding.name in ("utf-16be", "utf-16le"):
+            newEncoding = lookupEncoding("utf-8")
+            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
-            self.reset()
            self.charEncoding = (newEncoding, "certain")
-            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
+            self.reset()
+            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
         """Attempts to detect a BOM at the start of the stream. If
@@ -508,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
-            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
-            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
+            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
+            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
@@ -529,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
-        self.rawStream.seek(encoding and seek or 0)
-
-        return encoding
+        if encoding:
+            self.rawStream.seek(seek)
+            return lookupEncoding(encoding)
+        else:
+            self.rawStream.seek(0)
+            return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
@@ -542,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

-        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            encoding = "utf-8"
+        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
+            encoding = lookupEncoding("utf-8")

        return encoding

@@ -557,6 +590,7 @@ class EncodingBytes(bytes):
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
+        # pylint:disable=unused-argument
        self._position = -1

    def __iter__(self):
@@ -667,7 +701,7 @@ class EncodingParser(object):
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
-        for byte in self.data:
+        for _ in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
@@ -706,7 +740,7 @@ class EncodingParser(object):
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
-                    codec = codecName(tentativeEncoding)
+                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
@@ -714,7 +748,7 @@ class EncodingParser(object):
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
-                        codec = codecName(tentativeEncoding)
+                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
@@ -871,16 +905,19 @@ class ContentAttrParser(object):
            return None


-def codecName(encoding):
+def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
-    if isinstance(encoding, bytes):
+    if isinstance(encoding, binary_type):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
-    if encoding:
-        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
-        return encodings.get(canonicalName, None)
+
+    if encoding is not None:
+        try:
+            return webencodings.lookup(encoding)
+        except AttributeError:
+            return None
    else:
        return None
@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals

-try:
-    chr = unichr # flake8: noqa
-except NameError:
-    pass
+from six import unichr as chr

 from collections import deque

@@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
 from .constants import tokenTypes, tagTokenTypes
 from .constants import replacementCharacters

-from .inputstream import HTMLInputStream
+from ._inputstream import HTMLInputStream

-from .trie import Trie
+from ._trie import Trie

 entitiesTrie = Trie(entities)

@@ -34,16 +31,11 @@ class HTMLTokenizer(object):
      Points to HTMLInputStream object.
    """

-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
+    def __init__(self, stream, parser=None, **kwargs):

-        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
+        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

-        # Perform case conversions?
-        self.lowercaseElementName = lowercaseElementName
-        self.lowercaseAttrName = lowercaseAttrName
-
        # Setup the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
@@ -147,8 +139,8 @@ class HTMLTokenizer(object):
        output = "&"

        charStack = [self.stream.char()]
-        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
-                or (allowedChar is not None and allowedChar == charStack[0])):
+        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
@@ -235,8 +227,7 @@ class HTMLTokenizer(object):
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
-            if self.lowercaseElementName:
-                token["name"] = token["name"].translate(asciiUpper2Lower)
+            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
@@ -921,10 +912,9 @@ class HTMLTokenizer(object):
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
-            if self.lowercaseAttrName:
-                self.currentToken["data"][-1][0] = (
-                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
-            for name, value in self.currentToken["data"][:-1]:
+            self.currentToken["data"][-1][0] = (
+                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
+            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
@@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
                else:
                    data.append(char)

-        data = "".join(data)
+        data = "".join(data)  # pylint:disable=redefined-variable-type
        # Deal with null here rather than in the parser
        nullCount = data.count("\u0000")
        if nullCount > 0:
-            for i in range(nullCount):
+            for _ in range(nullCount):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "invalid-codepoint"})
            data = data.replace("\u0000", "\uFFFD")
@@ -4,9 +4,11 @@ from .py import Trie as PyTrie

 Trie = PyTrie

+# pylint:disable=wrong-import-position
 try:
    from .datrie import Trie as DATrie
 except ImportError:
    pass
 else:
    Trie = DATrie
+# pylint:enable=wrong-import-position
@@ -7,13 +7,13 @@ class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
-        keys = super().keys()
+        # pylint:disable=arguments-differ
+        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

-        # Python 2.6: no set comprehensions
-        return set([x for x in keys if x.startswith(prefix)])
+        return {x for x in keys if x.startswith(prefix)}

    def has_keys_with_prefix(self, prefix):
        for key in self.keys():
@@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals

 from types import ModuleType

+from six import text_type
+
 try:
    import xml.etree.cElementTree as default_etree
 except ImportError:
@@ -9,7 +11,26 @@ except ImportError:


 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
+        assert isinstance(_x, text_type)
+except:  # pylint:disable=bare-except
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True


 class MethodDispatcher(dict):
@@ -31,19 +52,20 @@ class MethodDispatcher(dict):
        # anything here.
        _dictEntries = []
        for name, value in items:
-            if type(name) in (list, tuple, frozenset, set):
+            if isinstance(name, (list, tuple, frozenset, set)):
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
+        assert len(self) == len(_dictEntries)
        self.default = None

    def __getitem__(self, key):
        return dict.get(self, key, self.default)


-# Some utility functions to dal with weirdness around UCS2 vs UCS4
+# Some utility functions to deal with weirdness around UCS2 vs UCS4
 # python builds

 def isSurrogatePair(data):
@@ -70,13 +92,33 @@ def moduleFactoryFactory(factory):
        else:
            name = b"_%s_factory" % baseModule.__name__

-        if name in moduleCache:
-            return moduleCache[name]
-        else:
+        kwargs_tuple = tuple(kwargs.items())
+
+        try:
+            return moduleCache[name][args][kwargs_tuple]
+        except KeyError:
            mod = ModuleType(name)
            objs = factory(baseModule, *args, **kwargs)
            mod.__dict__.update(objs)
-            moduleCache[name] = mod
+            # Key every cache level on the real lookup values; the string
+            # literals tested before never matched, wiping the cache on a miss.
+            if name not in moduleCache:
+                moduleCache[name] = {}
+            if args not in moduleCache[name]:
+                moduleCache[name][args] = {}
+            moduleCache[name][args][kwargs_tuple] = mod
            return mod

    return moduleFactory
+
+
+def memoize(func):
+    """Return a wrapper that reuses func's result for repeated arguments."""
+    results = {}
+
+    def wrapped(*args, **kwargs):
+        call_key = (tuple(args), tuple(kwargs.items()))
+        if call_key not in results:
+            results[call_key] = func(*args, **kwargs)
+        return results[call_key]
+    return wrapped
@@ -1,292 +1,296 @@
 from __future__ import absolute_import, division, unicode_literals

 import string
-import gettext
-_ = gettext.gettext

 EOF = None

 E = {
    "null-character":
-        _("Null character in input stream, replaced with U+FFFD."),
+        "Null character in input stream, replaced with U+FFFD.",
    "invalid-codepoint":
-        _("Invalid codepoint in stream."),
+        "Invalid codepoint in stream.",
    "incorrectly-placed-solidus":
-        _("Solidus (/) incorrectly placed in tag."),
+        "Solidus (/) incorrectly placed in tag.",
    "incorrect-cr-newline-entity":
-        _("Incorrect CR newline entity, replaced with LF."),
+        "Incorrect CR newline entity, replaced with LF.",
    "illegal-windows-1252-entity":
-        _("Entity used with illegal number (windows-1252 reference)."),
+        "Entity used with illegal number (windows-1252 reference).",
    "cant-convert-numeric-entity":
-        _("Numeric entity couldn't be converted to character "
-          "(codepoint U+%(charAsInt)08x)."),
+        "Numeric entity couldn't be converted to character "
+        "(codepoint U+%(charAsInt)08x).",
    "illegal-codepoint-for-numeric-entity":
-        _("Numeric entity represents an illegal codepoint: "
-          "U+%(charAsInt)08x."),
+        "Numeric entity represents an illegal codepoint: "
+        "U+%(charAsInt)08x.",
    "numeric-entity-without-semicolon":
-        _("Numeric entity didn't end with ';'."),
+        "Numeric entity didn't end with ';'.",
    "expected-numeric-entity-but-got-eof":
-        _("Numeric entity expected. Got end of file instead."),
+        "Numeric entity expected. Got end of file instead.",
    "expected-numeric-entity":
-        _("Numeric entity expected but none found."),
+        "Numeric entity expected but none found.",
    "named-entity-without-semicolon":
-        _("Named entity didn't end with ';'."),
+        "Named entity didn't end with ';'.",
    "expected-named-entity":
-        _("Named entity expected. Got none."),
+        "Named entity expected. Got none.",
    "attributes-in-end-tag":
-        _("End tag contains unexpected attributes."),
+        "End tag contains unexpected attributes.",
    'self-closing-flag-on-end-tag':
-        _("End tag contains unexpected self-closing flag."),
+        "End tag contains unexpected self-closing flag.",
    "expected-tag-name-but-got-right-bracket":
-        _("Expected tag name. Got '>' instead."),
+        "Expected tag name. Got '>' instead.",
    "expected-tag-name-but-got-question-mark":
-        _("Expected tag name. Got '?' instead. (HTML doesn't "
-          "support processing instructions.)"),
+        "Expected tag name. Got '?' instead. (HTML doesn't "
+        "support processing instructions.)",
    "expected-tag-name":
-        _("Expected tag name. Got something else instead"),
+        "Expected tag name. Got something else instead",
    "expected-closing-tag-but-got-right-bracket":
-        _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
+        "Expected closing tag. Got '>' instead. Ignoring '</>'.",
    "expected-closing-tag-but-got-eof":
-        _("Expected closing tag. Unexpected end of file."),
+        "Expected closing tag. Unexpected end of file.",
    "expected-closing-tag-but-got-char":
-        _("Expected closing tag. Unexpected character '%(data)s' found."),
+        "Expected closing tag. Unexpected character '%(data)s' found.",
    "eof-in-tag-name":
-        _("Unexpected end of file in the tag name."),
+        "Unexpected end of file in the tag name.",
    "expected-attribute-name-but-got-eof":
-        _("Unexpected end of file. Expected attribute name instead."),
+        "Unexpected end of file. Expected attribute name instead.",
    "eof-in-attribute-name":
-        _("Unexpected end of file in attribute name."),
+        "Unexpected end of file in attribute name.",
    "invalid-character-in-attribute-name":
-        _("Invalid character in attribute name"),
+        "Invalid character in attribute name",
    "duplicate-attribute":
-        _("Dropped duplicate attribute on tag."),
+        "Dropped duplicate attribute on tag.",
    "expected-end-of-tag-name-but-got-eof":
-        _("Unexpected end of file. Expected = or end of tag."),
+        "Unexpected end of file. Expected = or end of tag.",
    "expected-attribute-value-but-got-eof":
-        _("Unexpected end of file. Expected attribute value."),
+        "Unexpected end of file. Expected attribute value.",
    "expected-attribute-value-but-got-right-bracket":
-        _("Expected attribute value. Got '>' instead."),
+        "Expected attribute value. Got '>' instead.",
    'equals-in-unquoted-attribute-value':
-        _("Unexpected = in unquoted attribute"),
+        "Unexpected = in unquoted attribute",
    'unexpected-character-in-unquoted-attribute-value':
-        _("Unexpected character in unquoted attribute"),
+        "Unexpected character in unquoted attribute",
    "invalid-character-after-attribute-name":
-        _("Unexpected character after attribute name."),
+        "Unexpected character after attribute name.",
    "unexpected-character-after-attribute-value":
-        _("Unexpected character after attribute value."),
+        "Unexpected character after attribute value.",
    "eof-in-attribute-value-double-quote":
-        _("Unexpected end of file in attribute value (\")."),
+        "Unexpected end of file in attribute value (\").",
    "eof-in-attribute-value-single-quote":
-        _("Unexpected end of file in attribute value (')."),
+        "Unexpected end of file in attribute value (').",
    "eof-in-attribute-value-no-quotes":
-        _("Unexpected end of file in attribute value."),
+        "Unexpected end of file in attribute value.",
    "unexpected-EOF-after-solidus-in-tag":
-        _("Unexpected end of file in tag. Expected >"),
+        "Unexpected end of file in tag. Expected >",
    "unexpected-character-after-solidus-in-tag":
-        _("Unexpected character after / in tag. Expected >"),
+        "Unexpected character after / in tag. Expected >",
    "expected-dashes-or-doctype":
-        _("Expected '--' or 'DOCTYPE'. Not found."),
+        "Expected '--' or 'DOCTYPE'. Not found.",
    "unexpected-bang-after-double-dash-in-comment":
-        _("Unexpected ! after -- in comment"),
+        "Unexpected ! after -- in comment",
    "unexpected-space-after-double-dash-in-comment":
-        _("Unexpected space after -- in comment"),
+        "Unexpected space after -- in comment",
    "incorrect-comment":
-        _("Incorrect comment."),
+        "Incorrect comment.",
    "eof-in-comment":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-dash":
-        _("Unexpected end of file in comment (-)"),
+        "Unexpected end of file in comment (-)",
    "unexpected-dash-after-double-dash-in-comment":
-        _("Unexpected '-' after '--' found in comment."),
+        "Unexpected '-' after '--' found in comment.",
    "eof-in-comment-double-dash":
-        _("Unexpected end of file in comment (--)."),
+        "Unexpected end of file in comment (--).",
    "eof-in-comment-end-space-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-bang-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "unexpected-char-in-comment":
-        _("Unexpected character in comment found."),
+        "Unexpected character in comment found.",
    "need-space-after-doctype":
-        _("No space after literal string 'DOCTYPE'."),
+        "No space after literal string 'DOCTYPE'.",
    "expected-doctype-name-but-got-right-bracket":
-        _("Unexpected > character. Expected DOCTYPE name."),
+        "Unexpected > character. Expected DOCTYPE name.",
    "expected-doctype-name-but-got-eof":
-        _("Unexpected end of file. Expected DOCTYPE name."),
+        "Unexpected end of file. Expected DOCTYPE name.",
    "eof-in-doctype-name":
-        _("Unexpected end of file in DOCTYPE name."),
+        "Unexpected end of file in DOCTYPE name.",
    "eof-in-doctype":
-        _("Unexpected end of file in DOCTYPE."),
+        "Unexpected end of file in DOCTYPE.",
    "expected-space-or-right-bracket-in-doctype":
-        _("Expected space or '>'. Got '%(data)s'"),
+        "Expected space or '>'. Got '%(data)s'",
    "unexpected-end-of-doctype":
-        _("Unexpected end of DOCTYPE."),
+        "Unexpected end of DOCTYPE.",
    "unexpected-char-in-doctype":
-        _("Unexpected character in DOCTYPE."),
+        "Unexpected character in DOCTYPE.",
    "eof-in-innerhtml":
-        _("XXX innerHTML EOF"),
+        "XXX innerHTML EOF",
    "unexpected-doctype":
-        _("Unexpected DOCTYPE. Ignored."),
+        "Unexpected DOCTYPE. Ignored.",
    "non-html-root":
-        _("html needs to be the first start tag."),
+        "html needs to be the first start tag.",
    "expected-doctype-but-got-eof":
-        _("Unexpected End of file. Expected DOCTYPE."),
+        "Unexpected End of file. Expected DOCTYPE.",
    "unknown-doctype":
-        _("Erroneous DOCTYPE."),
+        "Erroneous DOCTYPE.",
    "expected-doctype-but-got-chars":
-        _("Unexpected non-space characters. Expected DOCTYPE."),
+        "Unexpected non-space characters. Expected DOCTYPE.",
    "expected-doctype-but-got-start-tag":
-        _("Unexpected start tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected start tag (%(name)s). Expected DOCTYPE.",
    "expected-doctype-but-got-end-tag":
-        _("Unexpected end tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected end tag (%(name)s). Expected DOCTYPE.",
    "end-tag-after-implied-root":
-        _("Unexpected end tag (%(name)s) after the (implied) root element."),
+        "Unexpected end tag (%(name)s) after the (implied) root element.",
    "expected-named-closing-tag-but-got-eof":
-        _("Unexpected end of file. Expected end tag (%(name)s)."),
+        "Unexpected end of file. Expected end tag (%(name)s).",
    "two-heads-are-not-better-than-one":
-        _("Unexpected start tag head in existing head. Ignored."),
+        "Unexpected start tag head in existing head. Ignored.",
    "unexpected-end-tag":
-        _("Unexpected end tag (%(name)s). Ignored."),
+        "Unexpected end tag (%(name)s). Ignored.",
    "unexpected-start-tag-out-of-my-head":
-        _("Unexpected start tag (%(name)s) that can be in head. Moved."),
+        "Unexpected start tag (%(name)s) that can be in head. Moved.",
    "unexpected-start-tag":
-        _("Unexpected start tag (%(name)s)."),
+        "Unexpected start tag (%(name)s).",
    "missing-end-tag":
-        _("Missing end tag (%(name)s)."),
+        "Missing end tag (%(name)s).",
    "missing-end-tags":
-        _("Missing end tags (%(name)s)."),
+        "Missing end tags (%(name)s).",
    "unexpected-start-tag-implies-end-tag":
-        _("Unexpected start tag (%(startName)s) "
-          "implies end tag (%(endName)s)."),
+        "Unexpected start tag (%(startName)s) "
+        "implies end tag (%(endName)s).",
    "unexpected-start-tag-treated-as":
-        _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
    "deprecated-tag":
-        _("Unexpected start tag %(name)s. Don't use it!"),
+        "Unexpected start tag %(name)s. Don't use it!",
    "unexpected-start-tag-ignored":
-        _("Unexpected start tag %(name)s. Ignored."),
+        "Unexpected start tag %(name)s. Ignored.",
    "expected-one-end-tag-but-got-another":
-        _("Unexpected end tag (%(gotName)s). "
-          "Missing end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). "
+        "Missing end tag (%(expectedName)s).",
    "end-tag-too-early":
-        _("End tag (%(name)s) seen too early. Expected other end tag."),
+        "End tag (%(name)s) seen too early. Expected other end tag.",
    "end-tag-too-early-named":
-        _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
    "end-tag-too-early-ignored":
-        _("End tag (%(name)s) seen too early. Ignored."),
+        "End tag (%(name)s) seen too early. Ignored.",
    "adoption-agency-1.1":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 1 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 1 of the adoption agency algorithm.",
    "adoption-agency-1.2":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 2 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 2 of the adoption agency algorithm.",
    "adoption-agency-1.3":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 3 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 3 of the adoption agency algorithm.",
    "adoption-agency-4.4":
-        _("End tag (%(name)s) violates step 4, "
-          "paragraph 4 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 4, "
+        "paragraph 4 of the adoption agency algorithm.",
    "unexpected-end-tag-treated-as":
-        _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
    "no-end-tag":
-        _("This element (%(name)s) has no end tag."),
+        "This element (%(name)s) has no end tag.",
    "unexpected-implied-end-tag-in-table":
-        _("Unexpected implied end tag (%(name)s) in the table phase."),
+        "Unexpected implied end tag (%(name)s) in the table phase.",
    "unexpected-implied-end-tag-in-table-body":
-        _("Unexpected implied end tag (%(name)s) in the table body phase."),
+        "Unexpected implied end tag (%(name)s) in the table body phase.",
    "unexpected-char-implies-table-voodoo":
-        _("Unexpected non-space characters in "
-          "table context caused voodoo mode."),
+        "Unexpected non-space characters in "
+        "table context caused voodoo mode.",
    "unexpected-hidden-input-in-table":
-        _("Unexpected input with type hidden in table context."),
+        "Unexpected input with type hidden in table context.",
    "unexpected-form-in-table":
-        _("Unexpected form in table context."),
+        "Unexpected form in table context.",
    "unexpected-start-tag-implies-table-voodoo":
-        _("Unexpected start tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected start tag (%(name)s) in "
+        "table context caused voodoo mode.",
    "unexpected-end-tag-implies-table-voodoo":
-        _("Unexpected end tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected end tag (%(name)s) in "
+        "table context caused voodoo mode.",
    "unexpected-cell-in-table-body":
-        _("Unexpected table cell start tag (%(name)s) "
-          "in the table body phase."),
+        "Unexpected table cell start tag (%(name)s) "
+        "in the table body phase.",
    "unexpected-cell-end-tag":
-        _("Got table cell end tag (%(name)s) "
-          "while required end tags are missing."),
+        "Got table cell end tag (%(name)s) "
+        "while required end tags are missing.",
    "unexpected-end-tag-in-table-body":
-        _("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
    "unexpected-implied-end-tag-in-table-row":
-        _("Unexpected implied end tag (%(name)s) in the table row phase."),
+        "Unexpected implied end tag (%(name)s) in the table row phase.",
    "unexpected-end-tag-in-table-row":
-        _("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
    "unexpected-select-in-select":
-        _("Unexpected select start tag in the select phase "
-          "treated as select end tag."),
+        "Unexpected select start tag in the select phase "
+        "treated as select end tag.",
    "unexpected-input-in-select":
-        _("Unexpected input start tag in the select phase."),
+        "Unexpected input start tag in the select phase.",
    "unexpected-start-tag-in-select":
-        _("Unexpected start tag token (%(name)s in the select phase. "
-          "Ignored."),
+        "Unexpected start tag token (%(name)s) in the select phase. "
+        "Ignored.",
    "unexpected-end-tag-in-select":
-        _("Unexpected end tag (%(name)s) in the select phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the select phase. Ignored.",
    "unexpected-table-element-start-tag-in-select-in-table":
-        _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+        "Unexpected table element start tag (%(name)s) in the select in table phase.",
    "unexpected-table-element-end-tag-in-select-in-table":
-        _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+        "Unexpected table element end tag (%(name)s) in the select in table phase.",
    "unexpected-char-after-body":
-        _("Unexpected non-space characters in the after body phase."),
+        "Unexpected non-space characters in the after body phase.",
    "unexpected-start-tag-after-body":
-        _("Unexpected start tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected start tag token (%(name)s)"
+        " in the after body phase.",
    "unexpected-end-tag-after-body":
-        _("Unexpected end tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected end tag token (%(name)s)"
+        " in the after body phase.",
    "unexpected-char-in-frameset":
-        _("Unexpected characters in the frameset phase. Characters ignored."),
+        "Unexpected characters in the frameset phase. Characters ignored.",
    "unexpected-start-tag-in-frameset":
-        _("Unexpected start tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected start tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
    "unexpected-frameset-in-frameset-innerhtml":
-        _("Unexpected end tag token (frameset) "
-          "in the frameset phase (innerHTML)."),
+        "Unexpected end tag token (frameset) "
+        "in the frameset phase (innerHTML).",
    "unexpected-end-tag-in-frameset":
-        _("Unexpected end tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected end tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
    "unexpected-char-after-frameset":
-        _("Unexpected non-space characters in the "
-          "after frameset phase. Ignored."),
+        "Unexpected non-space characters in the "
+        "after frameset phase. Ignored.",
    "unexpected-start-tag-after-frameset":
-        _("Unexpected start tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected start tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
    "unexpected-end-tag-after-frameset":
-        _("Unexpected end tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected end tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
    "unexpected-end-tag-after-body-innerhtml":
-        _("Unexpected end tag after body(innerHtml)"),
+        "Unexpected end tag after body(innerHtml)",
    "expected-eof-but-got-char":
-        _("Unexpected non-space characters. Expected end of file."),
+        "Unexpected non-space characters. Expected end of file.",
    "expected-eof-but-got-start-tag":
-        _("Unexpected start tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected start tag (%(name)s)"
+        ". Expected end of file.",
    "expected-eof-but-got-end-tag":
-        _("Unexpected end tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected end tag (%(name)s)"
+        ". Expected end of file.",
    "eof-in-table":
-        _("Unexpected end of file. Expected table content."),
+        "Unexpected end of file. Expected table content.",
    "eof-in-select":
-        _("Unexpected end of file. Expected select content."),
+        "Unexpected end of file. Expected select content.",
    "eof-in-frameset":
-        _("Unexpected end of file. Expected frameset content."),
+        "Unexpected end of file. Expected frameset content.",
    "eof-in-script-in-script":
-        _("Unexpected end of file. Expected script content."),
+        "Unexpected end of file. Expected script content.",
    "eof-in-foreign-lands":
-        _("Unexpected end of file. Expected foreign content"),
+        "Unexpected end of file. Expected foreign content",
    "non-void-element-with-trailing-solidus":
-        _("Trailing solidus not allowed on element %(name)s"),
+        "Trailing solidus not allowed on element %(name)s",
    "unexpected-html-element-in-foreign-content":
-        _("Element %(name)s not allowed in a non-html context"),
+        "Element %(name)s not allowed in a non-html context",
    "unexpected-end-tag-before-html":
-        _("Unexpected end tag (%(name)s) before html."),
+        "Unexpected end tag (%(name)s) before html.",
+    "unexpected-inhead-noscript-tag":
+        "Element %(name)s not allowed in an inhead-noscript context",
+    "eof-in-head-noscript":
+        "Unexpected end of file. Expected inhead-noscript content",
+    "char-in-head-noscript":
+        "Unexpected non-space character. Expected inhead-noscript content",
    "XXX-undefined-error":
-        _("Undefined error (this sucks and should be fixed)"),
+        "Undefined error (this sucks and should be fixed)",
 }

 namespaces = {
@@ -298,7 +302,7 @@ namespaces = {
    "xmlns": "http://www.w3.org/2000/xmlns/"
 }

-scopingElements = frozenset((
+scopingElements = frozenset([
    (namespaces["html"], "applet"),
    (namespaces["html"], "caption"),
    (namespaces["html"], "html"),
@@ -316,9 +320,9 @@ scopingElements = frozenset((
    (namespaces["svg"], "foreignObject"),
    (namespaces["svg"], "desc"),
    (namespaces["svg"], "title"),
-))
+])

-formattingElements = frozenset((
+formattingElements = frozenset([
    (namespaces["html"], "a"),
    (namespaces["html"], "b"),
    (namespaces["html"], "big"),
@@ -333,9 +337,9 @@ formattingElements = frozenset((
    (namespaces["html"], "strong"),
    (namespaces["html"], "tt"),
    (namespaces["html"], "u")
-))
+])

-specialElements = frozenset((
+specialElements = frozenset([
    (namespaces["html"], "address"),
    (namespaces["html"], "applet"),
    (namespaces["html"], "area"),
@@ -416,22 +420,89 @@ specialElements = frozenset((
    (namespaces["html"], "wbr"),
    (namespaces["html"], "xmp"),
    (namespaces["svg"], "foreignObject")
-))
+])

-htmlIntegrationPointElements = frozenset((
-    (namespaces["mathml"], "annotaion-xml"),
+htmlIntegrationPointElements = frozenset([
+    (namespaces["mathml"], "annotation-xml"),
    (namespaces["svg"], "foreignObject"),
    (namespaces["svg"], "desc"),
    (namespaces["svg"], "title")
-))
+])

-mathmlTextIntegrationPointElements = frozenset((
+mathmlTextIntegrationPointElements = frozenset([
    (namespaces["mathml"], "mi"),
    (namespaces["mathml"], "mo"),
    (namespaces["mathml"], "mn"),
    (namespaces["mathml"], "ms"),
    (namespaces["mathml"], "mtext")
-))
+])
+
+adjustSVGAttributes = {
+    "attributename": "attributeName",
+    "attributetype": "attributeType",
+    "basefrequency": "baseFrequency",
+    "baseprofile": "baseProfile",
+    "calcmode": "calcMode",
+    "clippathunits": "clipPathUnits",
+    "contentscripttype": "contentScriptType",
+    "contentstyletype": "contentStyleType",
+    "diffuseconstant": "diffuseConstant",
+    "edgemode": "edgeMode",
+    "externalresourcesrequired": "externalResourcesRequired",
+    "filterres": "filterRes",
+    "filterunits": "filterUnits",
+    "glyphref": "glyphRef",
+    "gradienttransform": "gradientTransform",
+    "gradientunits": "gradientUnits",
+    "kernelmatrix": "kernelMatrix",
+    "kernelunitlength": "kernelUnitLength",
+    "keypoints": "keyPoints",
+    "keysplines": "keySplines",
+    "keytimes": "keyTimes",
+    "lengthadjust": "lengthAdjust",
+    "limitingconeangle": "limitingConeAngle",
+    "markerheight": "markerHeight",
+    "markerunits": "markerUnits",
+    "markerwidth": "markerWidth",
+    "maskcontentunits": "maskContentUnits",
+    "maskunits": "maskUnits",
+    "numoctaves": "numOctaves",
+    "pathlength": "pathLength",
+    "patterncontentunits": "patternContentUnits",
+    "patterntransform": "patternTransform",
+    "patternunits": "patternUnits",
+    "pointsatx": "pointsAtX",
+    "pointsaty": "pointsAtY",
+    "pointsatz": "pointsAtZ",
+    "preservealpha": "preserveAlpha",
+    "preserveaspectratio": "preserveAspectRatio",
+    "primitiveunits": "primitiveUnits",
+    "refx": "refX",
+    "refy": "refY",
+    "repeatcount": "repeatCount",
+    "repeatdur": "repeatDur",
+    "requiredextensions": "requiredExtensions",
+    "requiredfeatures": "requiredFeatures",
+    "specularconstant": "specularConstant",
+    "specularexponent": "specularExponent",
+    "spreadmethod": "spreadMethod",
+    "startoffset": "startOffset",
+    "stddeviation": "stdDeviation",
+    "stitchtiles": "stitchTiles",
+    "surfacescale": "surfaceScale",
+    "systemlanguage": "systemLanguage",
+    "tablevalues": "tableValues",
+    "targetx": "targetX",
+    "targety": "targetY",
+    "textlength": "textLength",
+    "viewbox": "viewBox",
+    "viewtarget": "viewTarget",
+    "xchannelselector": "xChannelSelector",
+    "ychannelselector": "yChannelSelector",
+    "zoomandpan": "zoomAndPan"
+}
+
+adjustMathMLAttributes = {"definitionurl": "definitionURL"}

 adjustForeignAttributes = {
    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
@@ -451,21 +522,21 @@ adjustForeignAttributes = {
 unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
                                  adjustForeignAttributes.items()])

-spaceCharacters = frozenset((
+spaceCharacters = frozenset([
    "\t",
    "\n",
    "\u000C",
    " ",
    "\r"
-))
+])

-tableInsertModeElements = frozenset((
+tableInsertModeElements = frozenset([
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr"
-))
+])

 asciiLowercase = frozenset(string.ascii_lowercase)
 asciiUppercase = frozenset(string.ascii_uppercase)
@@ -486,7 +557,7 @@ headingElements = (
    "h6"
 )

-voidElements = frozenset((
+voidElements = frozenset([
    "base",
    "command",
    "event-source",
@@ -502,11 +573,11 @@ voidElements = frozenset((
    "input",
    "source",
    "track"
-))
+])

-cdataElements = frozenset(('title', 'textarea'))
+cdataElements = frozenset(['title', 'textarea'])

-rcdataElements = frozenset((
+rcdataElements = frozenset([
    'style',
    'script',
    'xmp',
@@ -514,27 +585,28 @@ rcdataElements = frozenset((
    'noembed',
    'noframes',
    'noscript'
-))
+])

 booleanAttributes = {
-    "": frozenset(("irrelevant",)),
-    "style": frozenset(("scoped",)),
-    "img": frozenset(("ismap",)),
-    "audio": frozenset(("autoplay", "controls")),
-    "video": frozenset(("autoplay", "controls")),
-    "script": frozenset(("defer", "async")),
-    "details": frozenset(("open",)),
-    "datagrid": frozenset(("multiple", "disabled")),
-    "command": frozenset(("hidden", "disabled", "checked", "default")),
-    "hr": frozenset(("noshade")),
-    "menu": frozenset(("autosubmit",)),
-    "fieldset": frozenset(("disabled", "readonly")),
-    "option": frozenset(("disabled", "readonly", "selected")),
-    "optgroup": frozenset(("disabled", "readonly")),
-    "button": frozenset(("disabled", "autofocus")),
-    "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
-    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
-    "output": frozenset(("disabled", "readonly")),
+    "": frozenset(["irrelevant", "itemscope"]),
+    "style": frozenset(["scoped"]),
+    "img": frozenset(["ismap"]),
+    "audio": frozenset(["autoplay", "controls"]),
+    "video": frozenset(["autoplay", "controls"]),
+    "script": frozenset(["defer", "async"]),
+    "details": frozenset(["open"]),
+    "datagrid": frozenset(["multiple", "disabled"]),
+    "command": frozenset(["hidden", "disabled", "checked", "default"]),
+    "hr": frozenset(["noshade"]),
+    "menu": frozenset(["autosubmit"]),
+    "fieldset": frozenset(["disabled", "readonly"]),
+    "option": frozenset(["disabled", "readonly", "selected"]),
+    "optgroup": frozenset(["disabled", "readonly"]),
+    "button": frozenset(["disabled", "autofocus"]),
+    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
+    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
+    "output": frozenset(["disabled", "readonly"]),
+    "iframe": frozenset(["seamless"]),
 }

 # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@@ -574,7 +646,7 @@ entitiesWindows1252 = (
    376     # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
 )

-xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])

 entities = {
    "AElig": "\xc6",
@@ -2815,7 +2887,6 @@ replacementCharacters = {
    0x0d: "\u000D",
    0x80: "\u20AC",
    0x81: "\u0081",
-    0x81: "\u0081",
    0x82: "\u201A",
    0x83: "\u0192",
    0x84: "\u201E",
@@ -2848,235 +2919,6 @@ replacementCharacters = {
    0x9F: "\u0178",
 }

-encodings = {
-    '437': 'cp437',
-    '850': 'cp850',
-    '852': 'cp852',
-    '855': 'cp855',
-    '857': 'cp857',
-    '860': 'cp860',
-    '861': 'cp861',
-    '862': 'cp862',
-    '863': 'cp863',
-    '865': 'cp865',
-    '866': 'cp866',
-    '869': 'cp869',
-    'ansix341968': 'ascii',
-    'ansix341986': 'ascii',
-    'arabic': 'iso8859-6',
-    'ascii': 'ascii',
-    'asmo708': 'iso8859-6',
-    'big5': 'big5',
-    'big5hkscs': 'big5hkscs',
-    'chinese': 'gbk',
-    'cp037': 'cp037',
-    'cp1026': 'cp1026',
-    'cp154': 'ptcp154',
-    'cp367': 'ascii',
-    'cp424': 'cp424',
-    'cp437': 'cp437',
-    'cp500': 'cp500',
-    'cp775': 'cp775',
-    'cp819': 'windows-1252',
-    'cp850': 'cp850',
-    'cp852': 'cp852',
-    'cp855': 'cp855',
-    'cp857': 'cp857',
-    'cp860': 'cp860',
-    'cp861': 'cp861',
-    'cp862': 'cp862',
-    'cp863': 'cp863',
-    'cp864': 'cp864',
-    'cp865': 'cp865',
-    'cp866': 'cp866',
-    'cp869': 'cp869',
-    'cp936': 'gbk',
-    'cpgr': 'cp869',
-    'cpis': 'cp861',
-    'csascii': 'ascii',
-    'csbig5': 'big5',
-    'cseuckr': 'cp949',
-    'cseucpkdfmtjapanese': 'euc_jp',
-    'csgb2312': 'gbk',
-    'cshproman8': 'hp-roman8',
-    'csibm037': 'cp037',
-    'csibm1026': 'cp1026',
-    'csibm424': 'cp424',
-    'csibm500': 'cp500',
-    'csibm855': 'cp855',
-    'csibm857': 'cp857',
-    'csibm860': 'cp860',
-    'csibm861': 'cp861',
-    'csibm863': 'cp863',
-    'csibm864': 'cp864',
-    'csibm865': 'cp865',
-    'csibm866': 'cp866',
-    'csibm869': 'cp869',
-    'csiso2022jp': 'iso2022_jp',
-    'csiso2022jp2': 'iso2022_jp_2',
-    'csiso2022kr': 'iso2022_kr',
-    'csiso58gb231280': 'gbk',
-    'csisolatin1': 'windows-1252',
-    'csisolatin2': 'iso8859-2',
-    'csisolatin3': 'iso8859-3',
-    'csisolatin4': 'iso8859-4',
-    'csisolatin5': 'windows-1254',
-    'csisolatin6': 'iso8859-10',
-    'csisolatinarabic': 'iso8859-6',
-    'csisolatincyrillic': 'iso8859-5',
-    'csisolatingreek': 'iso8859-7',
-    'csisolatinhebrew': 'iso8859-8',
-    'cskoi8r': 'koi8-r',
-    'csksc56011987': 'cp949',
-    'cspc775baltic': 'cp775',
-    'cspc850multilingual': 'cp850',
-    'cspc862latinhebrew': 'cp862',
-    'cspc8codepage437': 'cp437',
-    'cspcp852': 'cp852',
-    'csptcp154': 'ptcp154',
-    'csshiftjis': 'shift_jis',
-    'csunicode11utf7': 'utf-7',
-    'cyrillic': 'iso8859-5',
-    'cyrillicasian': 'ptcp154',
-    'ebcdiccpbe': 'cp500',
-    'ebcdiccpca': 'cp037',
-    'ebcdiccpch': 'cp500',
-    'ebcdiccphe': 'cp424',
-    'ebcdiccpnl': 'cp037',
-    'ebcdiccpus': 'cp037',
-    'ebcdiccpwt': 'cp037',
-    'ecma114': 'iso8859-6',
-    'ecma118': 'iso8859-7',
-    'elot928': 'iso8859-7',
-    'eucjp': 'euc_jp',
-    'euckr': 'cp949',
-    'extendedunixcodepackedformatforjapanese': 'euc_jp',
-    'gb18030': 'gb18030',
-    'gb2312': 'gbk',
-    'gb231280': 'gbk',
-    'gbk': 'gbk',
-    'greek': 'iso8859-7',
-    'greek8': 'iso8859-7',
-    'hebrew': 'iso8859-8',
-    'hproman8': 'hp-roman8',
-    'hzgb2312': 'hz',
-    'ibm037': 'cp037',
-    'ibm1026': 'cp1026',
-    'ibm367': 'ascii',
-    'ibm424': 'cp424',
-    'ibm437': 'cp437',
-    'ibm500': 'cp500',
-    'ibm775': 'cp775',
-    'ibm819': 'windows-1252',
-    'ibm850': 'cp850',
-    'ibm852': 'cp852',
-    'ibm855': 'cp855',
-    'ibm857': 'cp857',
-    'ibm860': 'cp860',
-    'ibm861': 'cp861',
-    'ibm862': 'cp862',
-    'ibm863': 'cp863',
-    'ibm864': 'cp864',
-    'ibm865': 'cp865',
-    'ibm866': 'cp866',
-    'ibm869': 'cp869',
-    'iso2022jp': 'iso2022_jp',
-    'iso2022jp2': 'iso2022_jp_2',
-    'iso2022kr': 'iso2022_kr',
-    'iso646irv1991': 'ascii',
-    'iso646us': 'ascii',
-    'iso88591': 'windows-1252',
-    'iso885910': 'iso8859-10',
-    'iso8859101992': 'iso8859-10',
-    'iso885911987': 'windows-1252',
-    'iso885913': 'iso8859-13',
-    'iso885914': 'iso8859-14',
-    'iso8859141998': 'iso8859-14',
-    'iso885915': 'iso8859-15',
-    'iso885916': 'iso8859-16',
-    'iso8859162001': 'iso8859-16',
-    'iso88592': 'iso8859-2',
-    'iso885921987': 'iso8859-2',
-    'iso88593': 'iso8859-3',
-    'iso885931988': 'iso8859-3',
-    'iso88594': 'iso8859-4',
-    'iso885941988': 'iso8859-4',
-    'iso88595': 'iso8859-5',
-    'iso885951988': 'iso8859-5',
-    'iso88596': 'iso8859-6',
-    'iso885961987': 'iso8859-6',
-    'iso88597': 'iso8859-7',
-    'iso885971987': 'iso8859-7',
-    'iso88598': 'iso8859-8',
-    'iso885981988': 'iso8859-8',
-    'iso88599': 'windows-1254',
-    'iso885991989': 'windows-1254',
-    'isoceltic': 'iso8859-14',
-    'isoir100': 'windows-1252',
-    'isoir101': 'iso8859-2',
-    'isoir109': 'iso8859-3',
-    'isoir110': 'iso8859-4',
-    'isoir126': 'iso8859-7',
-    'isoir127': 'iso8859-6',
-    'isoir138': 'iso8859-8',
-    'isoir144': 'iso8859-5',
-    'isoir148': 'windows-1254',
-    'isoir149': 'cp949',
-    'isoir157': 'iso8859-10',
-    'isoir199': 'iso8859-14',
-    'isoir226': 'iso8859-16',
-    'isoir58': 'gbk',
-    'isoir6': 'ascii',
-    'koi8r': 'koi8-r',
-    'koi8u': 'koi8-u',
-    'korean': 'cp949',
-    'ksc5601': 'cp949',
-    'ksc56011987': 'cp949',
-    'ksc56011989': 'cp949',
-    'l1': 'windows-1252',
-    'l10': 'iso8859-16',
-    'l2': 'iso8859-2',
-    'l3': 'iso8859-3',
-    'l4': 'iso8859-4',
-    'l5': 'windows-1254',
-    'l6': 'iso8859-10',
-    'l8': 'iso8859-14',
-    'latin1': 'windows-1252',
-    'latin10': 'iso8859-16',
-    'latin2': 'iso8859-2',
-    'latin3': 'iso8859-3',
-    'latin4': 'iso8859-4',
-    'latin5': 'windows-1254',
-    'latin6': 'iso8859-10',
-    'latin8': 'iso8859-14',
-    'latin9': 'iso8859-15',
-    'ms936': 'gbk',
-    'mskanji': 'shift_jis',
-    'pt154': 'ptcp154',
-    'ptcp154': 'ptcp154',
-    'r8': 'hp-roman8',
-    'roman8': 'hp-roman8',
-    'shiftjis': 'shift_jis',
-    'tis620': 'cp874',
-    'unicode11utf7': 'utf-7',
-    'us': 'ascii',
-    'usascii': 'ascii',
-    'utf16': 'utf-16',
-    'utf16be': 'utf-16-be',
-    'utf16le': 'utf-16-le',
-    'utf8': 'utf-8',
-    'windows1250': 'cp1250',
-    'windows1251': 'cp1251',
-    'windows1252': 'cp1252',
-    'windows1253': 'cp1253',
-    'windows1254': 'cp1254',
-    'windows1255': 'cp1255',
-    'windows1256': 'cp1256',
-    'windows1257': 'cp1257',
-    'windows1258': 'cp1258',
-    'windows936': 'gbk',
-    'x-x-big5': 'big5'}
-
 tokenTypes = {
    "Doctype": 0,
    "Characters": 1,
@@ -3088,8 +2930,8 @@ tokenTypes = {
    "ParseError": 7
 }

-tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
-                           tokenTypes["EmptyTag"]))
+tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
+                           tokenTypes["EmptyTag"]])


 prefixes = dict([(v, k) for k, v in namespaces.items()])
@@ -3097,8 +2939,9 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math"


 class DataLossWarning(UserWarning):
+    """Raised when the current tree is unable to represent the input data"""
    pass


-class ReparseException(Exception):
+class _ReparseException(Exception):
    pass
@@ -1,20 +1,29 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
+from . import base

-try:
-    from collections import OrderedDict
-except ImportError:
-    from ordereddict import OrderedDict
+from collections import OrderedDict


-class Filter(_base.Filter):
+def _attr_key(attr):
+    """Return an appropriate key for an attribute for sorting
+
+    Attributes have a namespace that can be either ``None`` or a string. We
+    can't compare the two because they're different types, so we convert
+    ``None`` to an empty string first.
+
+    """
+    return (attr[0][0] or ''), attr[0][1]
+
+
+class Filter(base.Filter):
+    """Alphabetizes attributes for elements"""
    def __iter__(self):
-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                attrs = OrderedDict()
                for name, value in sorted(token["data"].items(),
-                                          key=lambda x: x[0]):
+                                          key=_attr_key):
                    attrs[name] = value
                token["data"] = attrs
            yield token
@@ -1,11 +1,19 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
+from . import base


-class Filter(_base.Filter):
+class Filter(base.Filter):
+    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
    def __init__(self, source, encoding):
-        _base.Filter.__init__(self, source)
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg encoding: the encoding to set
+
+        """
+        base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
@@ -13,7 +21,7 @@ class Filter(_base.Filter):
        meta_found = (self.encoding is None)
        pending = []

-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
@@ -1,93 +1,94 @@
 from __future__ import absolute_import, division, unicode_literals

-from gettext import gettext
-_ = gettext
+from six import text_type

-from . import _base
-from ..constants import cdataElements, rcdataElements, voidElements
+from . import base
+from ..constants import namespaces, voidElements

 from ..constants import spaceCharacters
 spaceCharacters = "".join(spaceCharacters)


-class LintError(Exception):
-    pass
+class Filter(base.Filter):
+    """Lints the token stream for errors

+    If it finds any errors, it'll raise an ``AssertionError``.
+
+    """
+    def __init__(self, source, require_matching_tags=True):
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg require_matching_tags: whether or not to require matching tags
+
+        """
+        super(Filter, self).__init__(source)
+        self.require_matching_tags = require_matching_tags

-class Filter(_base.Filter):
    def __iter__(self):
        open_elements = []
-        contentModelFlag = "PCDATA"
-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
+                namespace = token["namespace"]
                name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                if not name:
-                    raise LintError(_("Empty tag name"))
-                if type == "StartTag" and name in voidElements:
-                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
-                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
-                if type == "StartTag":
-                    open_elements.append(name)
-                for name, value in token["data"]:
-                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
-                    if not name:
-                        raise LintError(_("Empty attribute name"))
-                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
-                if name in cdataElements:
-                    contentModelFlag = "CDATA"
-                elif name in rcdataElements:
-                    contentModelFlag = "RCDATA"
-                elif name == "plaintext":
-                    contentModelFlag = "PLAINTEXT"
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                assert isinstance(token["data"], dict)
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert type == "EmptyTag"
+                else:
+                    assert type == "StartTag"
+                if type == "StartTag" and self.require_matching_tags:
+                    open_elements.append((namespace, name))
+                for (namespace, name), value in token["data"].items():
+                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace != ""
+                    assert isinstance(name, text_type)
+                    assert name != ""
+                    assert isinstance(value, text_type)

            elif type == "EndTag":
+                namespace = token["namespace"]
                name = token["name"]
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                if not name:
-                    raise LintError(_("Empty tag name"))
-                if name in voidElements:
-                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
-                start_name = open_elements.pop()
-                if start_name != name:
-                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
-                contentModelFlag = "PCDATA"
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+                elif self.require_matching_tags:
+                    start = open_elements.pop()
+                    assert start == (namespace, name)

            elif type == "Comment":
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Comment not in PCDATA content model flag"))
+                data = token["data"]
+                assert isinstance(data, text_type)

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
-                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
-                if not data:
-                    raise LintError(_("%(type)s token with empty data") % {"type": type})
+                assert isinstance(data, text_type)
+                assert data != ""
                if type == "SpaceCharacters":
-                    data = data.strip(spaceCharacters)
-                    if data:
-                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
+                    assert data.strip(spaceCharacters) == ""

            elif type == "Doctype":
                name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                # XXX: what to do with token["data"] ?
+                assert name is None or isinstance(name, text_type)
+                # Each identifier is validated against its own value, not the doctype name
+                assert token["publicId"] is None or isinstance(token["publicId"], text_type)
+                assert token["systemId"] is None or isinstance(token["systemId"], text_type)

-            elif type in ("ParseError", "SerializeError"):
-                pass
+            elif type == "Entity":
+                assert isinstance(token["name"], text_type)
+
+            elif type == "SerializerError":
+                assert isinstance(token["data"], text_type)

            else:
-                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
+                assert False, "Unknown token type: %(type)s" % {"type": type}

            yield token
@@ -1,9 +1,10 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
+from . import base


-class Filter(_base.Filter):
+class Filter(base.Filter):
+    """Removes optional tags from the token stream"""
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
@@ -11,7 +12,8 @@ class Filter(_base.Filter):
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
-        yield previous2, previous1, None
+        if previous1 is not None:
+            yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
@@ -58,7 +60,7 @@ class Filter(_base.Filter):
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
-            # is not immediately preceeded by another colgroup element whose
+            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
@@ -70,7 +72,7 @@ class Filter(_base.Filter):
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
-            # not immediately preceeded by a tbody, thead, or tfoot element
+            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
@@ -1,12 +1,896 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
-from ..sanitizer import HTMLSanitizerMixin
+import re
+from xml.sax.saxutils import escape, unescape
+
+from six.moves import urllib_parse as urlparse
+
+from . import base
+from ..constants import namespaces, prefixes
+
+__all__ = ["Filter"]


-class Filter(_base.Filter, HTMLSanitizerMixin):
+allowed_elements = frozenset((
+    (namespaces['html'], 'a'),
+    (namespaces['html'], 'abbr'),
+    (namespaces['html'], 'acronym'),
+    (namespaces['html'], 'address'),
+    (namespaces['html'], 'area'),
+    (namespaces['html'], 'article'),
+    (namespaces['html'], 'aside'),
+    (namespaces['html'], 'audio'),
+    (namespaces['html'], 'b'),
+    (namespaces['html'], 'big'),
+    (namespaces['html'], 'blockquote'),
+    (namespaces['html'], 'br'),
+    (namespaces['html'], 'button'),
+    (namespaces['html'], 'canvas'),
+    (namespaces['html'], 'caption'),
+    (namespaces['html'], 'center'),
+    (namespaces['html'], 'cite'),
+    (namespaces['html'], 'code'),
+    (namespaces['html'], 'col'),
+    (namespaces['html'], 'colgroup'),
+    (namespaces['html'], 'command'),
+    (namespaces['html'], 'datagrid'),
+    (namespaces['html'], 'datalist'),
+    (namespaces['html'], 'dd'),
+    (namespaces['html'], 'del'),
+    (namespaces['html'], 'details'),
+    (namespaces['html'], 'dfn'),
+    (namespaces['html'], 'dialog'),
+    (namespaces['html'], 'dir'),
+    (namespaces['html'], 'div'),
+    (namespaces['html'], 'dl'),
+    (namespaces['html'], 'dt'),
+    (namespaces['html'], 'em'),
+    (namespaces['html'], 'event-source'),
+    (namespaces['html'], 'fieldset'),
+    (namespaces['html'], 'figcaption'),
+    (namespaces['html'], 'figure'),
+    (namespaces['html'], 'footer'),
+    (namespaces['html'], 'font'),
+    (namespaces['html'], 'form'),
+    (namespaces['html'], 'header'),
+    (namespaces['html'], 'h1'),
+    (namespaces['html'], 'h2'),
+    (namespaces['html'], 'h3'),
+    (namespaces['html'], 'h4'),
+    (namespaces['html'], 'h5'),
+    (namespaces['html'], 'h6'),
+    (namespaces['html'], 'hr'),
+    (namespaces['html'], 'i'),
+    (namespaces['html'], 'img'),
+    (namespaces['html'], 'input'),
+    (namespaces['html'], 'ins'),
+    (namespaces['html'], 'keygen'),
+    (namespaces['html'], 'kbd'),
+    (namespaces['html'], 'label'),
+    (namespaces['html'], 'legend'),
+    (namespaces['html'], 'li'),
+    (namespaces['html'], 'm'),
+    (namespaces['html'], 'map'),
+    (namespaces['html'], 'menu'),
+    (namespaces['html'], 'meter'),
+    (namespaces['html'], 'multicol'),
+    (namespaces['html'], 'nav'),
+    (namespaces['html'], 'nextid'),
+    (namespaces['html'], 'ol'),
+    (namespaces['html'], 'output'),
+    (namespaces['html'], 'optgroup'),
+    (namespaces['html'], 'option'),
+    (namespaces['html'], 'p'),
+    (namespaces['html'], 'pre'),
+    (namespaces['html'], 'progress'),
+    (namespaces['html'], 'q'),
+    (namespaces['html'], 's'),
+    (namespaces['html'], 'samp'),
+    (namespaces['html'], 'section'),
+    (namespaces['html'], 'select'),
+    (namespaces['html'], 'small'),
+    (namespaces['html'], 'sound'),
+    (namespaces['html'], 'source'),
+    (namespaces['html'], 'spacer'),
+    (namespaces['html'], 'span'),
+    (namespaces['html'], 'strike'),
+    (namespaces['html'], 'strong'),
+    (namespaces['html'], 'sub'),
+    (namespaces['html'], 'sup'),
+    (namespaces['html'], 'table'),
+    (namespaces['html'], 'tbody'),
+    (namespaces['html'], 'td'),
+    (namespaces['html'], 'textarea'),
+    (namespaces['html'], 'time'),
+    (namespaces['html'], 'tfoot'),
+    (namespaces['html'], 'th'),
+    (namespaces['html'], 'thead'),
+    (namespaces['html'], 'tr'),
+    (namespaces['html'], 'tt'),
+    (namespaces['html'], 'u'),
+    (namespaces['html'], 'ul'),
+    (namespaces['html'], 'var'),
+    (namespaces['html'], 'video'),
+    (namespaces['mathml'], 'maction'),
+    (namespaces['mathml'], 'math'),
+    (namespaces['mathml'], 'merror'),
+    (namespaces['mathml'], 'mfrac'),
+    (namespaces['mathml'], 'mi'),
+    (namespaces['mathml'], 'mmultiscripts'),
+    (namespaces['mathml'], 'mn'),
+    (namespaces['mathml'], 'mo'),
+    (namespaces['mathml'], 'mover'),
+    (namespaces['mathml'], 'mpadded'),
+    (namespaces['mathml'], 'mphantom'),
+    (namespaces['mathml'], 'mprescripts'),
+    (namespaces['mathml'], 'mroot'),
+    (namespaces['mathml'], 'mrow'),
+    (namespaces['mathml'], 'mspace'),
+    (namespaces['mathml'], 'msqrt'),
+    (namespaces['mathml'], 'mstyle'),
+    (namespaces['mathml'], 'msub'),
+    (namespaces['mathml'], 'msubsup'),
+    (namespaces['mathml'], 'msup'),
+    (namespaces['mathml'], 'mtable'),
+    (namespaces['mathml'], 'mtd'),
+    (namespaces['mathml'], 'mtext'),
+    (namespaces['mathml'], 'mtr'),
+    (namespaces['mathml'], 'munder'),
+    (namespaces['mathml'], 'munderover'),
+    (namespaces['mathml'], 'none'),
+    (namespaces['svg'], 'a'),
+    (namespaces['svg'], 'animate'),
+    (namespaces['svg'], 'animateColor'),
+    (namespaces['svg'], 'animateMotion'),
+    (namespaces['svg'], 'animateTransform'),
+    (namespaces['svg'], 'clipPath'),
+    (namespaces['svg'], 'circle'),
+    (namespaces['svg'], 'defs'),
+    (namespaces['svg'], 'desc'),
+    (namespaces['svg'], 'ellipse'),
+    (namespaces['svg'], 'font-face'),
+    (namespaces['svg'], 'font-face-name'),
+    (namespaces['svg'], 'font-face-src'),
+    (namespaces['svg'], 'g'),
+    (namespaces['svg'], 'glyph'),
+    (namespaces['svg'], 'hkern'),
+    (namespaces['svg'], 'linearGradient'),
+    (namespaces['svg'], 'line'),
+    (namespaces['svg'], 'marker'),
+    (namespaces['svg'], 'metadata'),
+    (namespaces['svg'], 'missing-glyph'),
+    (namespaces['svg'], 'mpath'),
+    (namespaces['svg'], 'path'),
+    (namespaces['svg'], 'polygon'),
+    (namespaces['svg'], 'polyline'),
+    (namespaces['svg'], 'radialGradient'),
+    (namespaces['svg'], 'rect'),
+    (namespaces['svg'], 'set'),
+    (namespaces['svg'], 'stop'),
+    (namespaces['svg'], 'svg'),
+    (namespaces['svg'], 'switch'),
+    (namespaces['svg'], 'text'),
+    (namespaces['svg'], 'title'),
+    (namespaces['svg'], 'tspan'),
+    (namespaces['svg'], 'use'),
+))
+
+# Default whitelist of attributes, keyed as (namespace, local name) tuples;
+# a namespace of None is used for the un-namespaced attributes below.
+# Filter.allowed_token deletes every attribute whose key is not a member.
+# Several names are listed more than once (within the MathML section and
+# again across the HTML/MathML/SVG sections); because this is a frozenset
+# the repeats collapse to a single member and are harmless.
+allowed_attributes = frozenset((
+    # HTML attributes
+    (None, 'abbr'),
+    (None, 'accept'),
+    (None, 'accept-charset'),
+    (None, 'accesskey'),
+    (None, 'action'),
+    (None, 'align'),
+    (None, 'alt'),
+    (None, 'autocomplete'),
+    (None, 'autofocus'),
+    (None, 'axis'),
+    (None, 'background'),
+    (None, 'balance'),
+    (None, 'bgcolor'),
+    (None, 'bgproperties'),
+    (None, 'border'),
+    (None, 'bordercolor'),
+    (None, 'bordercolordark'),
+    (None, 'bordercolorlight'),
+    (None, 'bottompadding'),
+    (None, 'cellpadding'),
+    (None, 'cellspacing'),
+    (None, 'ch'),
+    (None, 'challenge'),
+    (None, 'char'),
+    (None, 'charoff'),
+    (None, 'choff'),
+    (None, 'charset'),
+    (None, 'checked'),
+    (None, 'cite'),
+    (None, 'class'),
+    (None, 'clear'),
+    (None, 'color'),
+    (None, 'cols'),
+    (None, 'colspan'),
+    (None, 'compact'),
+    (None, 'contenteditable'),
+    (None, 'controls'),
+    (None, 'coords'),
+    (None, 'data'),
+    (None, 'datafld'),
+    (None, 'datapagesize'),
+    (None, 'datasrc'),
+    (None, 'datetime'),
+    (None, 'default'),
+    (None, 'delay'),
+    (None, 'dir'),
+    (None, 'disabled'),
+    (None, 'draggable'),
+    (None, 'dynsrc'),
+    (None, 'enctype'),
+    (None, 'end'),
+    (None, 'face'),
+    (None, 'for'),
+    (None, 'form'),
+    (None, 'frame'),
+    (None, 'galleryimg'),
+    (None, 'gutter'),
+    (None, 'headers'),
+    (None, 'height'),
+    (None, 'hidefocus'),
+    (None, 'hidden'),
+    (None, 'high'),
+    (None, 'href'),
+    (None, 'hreflang'),
+    (None, 'hspace'),
+    (None, 'icon'),
+    (None, 'id'),
+    (None, 'inputmode'),
+    (None, 'ismap'),
+    (None, 'keytype'),
+    (None, 'label'),
+    (None, 'leftspacing'),
+    (None, 'lang'),
+    (None, 'list'),
+    (None, 'longdesc'),
+    (None, 'loop'),
+    (None, 'loopcount'),
+    (None, 'loopend'),
+    (None, 'loopstart'),
+    (None, 'low'),
+    (None, 'lowsrc'),
+    (None, 'max'),
+    (None, 'maxlength'),
+    (None, 'media'),
+    (None, 'method'),
+    (None, 'min'),
+    (None, 'multiple'),
+    (None, 'name'),
+    (None, 'nohref'),
+    (None, 'noshade'),
+    (None, 'nowrap'),
+    (None, 'open'),
+    (None, 'optimum'),
+    (None, 'pattern'),
+    (None, 'ping'),
+    (None, 'point-size'),
+    (None, 'poster'),
+    (None, 'pqg'),
+    (None, 'preload'),
+    (None, 'prompt'),
+    (None, 'radiogroup'),
+    (None, 'readonly'),
+    (None, 'rel'),
+    (None, 'repeat-max'),
+    (None, 'repeat-min'),
+    (None, 'replace'),
+    (None, 'required'),
+    (None, 'rev'),
+    (None, 'rightspacing'),
+    (None, 'rows'),
+    (None, 'rowspan'),
+    (None, 'rules'),
+    (None, 'scope'),
+    (None, 'selected'),
+    (None, 'shape'),
+    (None, 'size'),
+    (None, 'span'),
+    (None, 'src'),
+    (None, 'start'),
+    (None, 'step'),
+    (None, 'style'),
+    (None, 'summary'),
+    (None, 'suppress'),
+    (None, 'tabindex'),
+    (None, 'target'),
+    (None, 'template'),
+    (None, 'title'),
+    (None, 'toppadding'),
+    (None, 'type'),
+    (None, 'unselectable'),
+    (None, 'usemap'),
+    (None, 'urn'),
+    (None, 'valign'),
+    (None, 'value'),
+    (None, 'variable'),
+    (None, 'volume'),
+    (None, 'vspace'),
+    (None, 'vrml'),
+    (None, 'width'),
+    (None, 'wrap'),
+    (namespaces['xml'], 'lang'),
+    # MathML attributes
+    (None, 'actiontype'),
+    (None, 'align'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnlines'),
+    (None, 'columnspacing'),
+    (None, 'columnspan'),
+    (None, 'depth'),
+    (None, 'display'),
+    (None, 'displaystyle'),
+    (None, 'equalcolumns'),
+    (None, 'equalrows'),
+    (None, 'fence'),
+    (None, 'fontstyle'),
+    (None, 'fontweight'),
+    (None, 'frame'),
+    (None, 'height'),
+    (None, 'linethickness'),
+    (None, 'lspace'),
+    (None, 'mathbackground'),
+    (None, 'mathcolor'),
+    (None, 'mathvariant'),
+    (None, 'mathvariant'),
+    (None, 'maxsize'),
+    (None, 'minsize'),
+    (None, 'other'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowlines'),
+    (None, 'rowspacing'),
+    (None, 'rowspan'),
+    (None, 'rspace'),
+    (None, 'scriptlevel'),
+    (None, 'selection'),
+    (None, 'separator'),
+    (None, 'stretchy'),
+    (None, 'width'),
+    (None, 'width'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'type'),
+    # SVG attributes
+    (None, 'accent-height'),
+    (None, 'accumulate'),
+    (None, 'additive'),
+    (None, 'alphabetic'),
+    (None, 'arabic-form'),
+    (None, 'ascent'),
+    (None, 'attributeName'),
+    (None, 'attributeType'),
+    (None, 'baseProfile'),
+    (None, 'bbox'),
+    (None, 'begin'),
+    (None, 'by'),
+    (None, 'calcMode'),
+    (None, 'cap-height'),
+    (None, 'class'),
+    (None, 'clip-path'),
+    (None, 'color'),
+    (None, 'color-rendering'),
+    (None, 'content'),
+    (None, 'cx'),
+    (None, 'cy'),
+    (None, 'd'),
+    (None, 'dx'),
+    (None, 'dy'),
+    (None, 'descent'),
+    (None, 'display'),
+    (None, 'dur'),
+    (None, 'end'),
+    (None, 'fill'),
+    (None, 'fill-opacity'),
+    (None, 'fill-rule'),
+    (None, 'font-family'),
+    (None, 'font-size'),
+    (None, 'font-stretch'),
+    (None, 'font-style'),
+    (None, 'font-variant'),
+    (None, 'font-weight'),
+    (None, 'from'),
+    (None, 'fx'),
+    (None, 'fy'),
+    (None, 'g1'),
+    (None, 'g2'),
+    (None, 'glyph-name'),
+    (None, 'gradientUnits'),
+    (None, 'hanging'),
+    (None, 'height'),
+    (None, 'horiz-adv-x'),
+    (None, 'horiz-origin-x'),
+    (None, 'id'),
+    (None, 'ideographic'),
+    (None, 'k'),
+    (None, 'keyPoints'),
+    (None, 'keySplines'),
+    (None, 'keyTimes'),
+    (None, 'lang'),
+    (None, 'marker-end'),
+    (None, 'marker-mid'),
+    (None, 'marker-start'),
+    (None, 'markerHeight'),
+    (None, 'markerUnits'),
+    (None, 'markerWidth'),
+    (None, 'mathematical'),
+    (None, 'max'),
+    (None, 'min'),
+    (None, 'name'),
+    (None, 'offset'),
+    (None, 'opacity'),
+    (None, 'orient'),
+    (None, 'origin'),
+    (None, 'overline-position'),
+    (None, 'overline-thickness'),
+    (None, 'panose-1'),
+    (None, 'path'),
+    (None, 'pathLength'),
+    (None, 'points'),
+    (None, 'preserveAspectRatio'),
+    (None, 'r'),
+    (None, 'refX'),
+    (None, 'refY'),
+    (None, 'repeatCount'),
+    (None, 'repeatDur'),
+    (None, 'requiredExtensions'),
+    (None, 'requiredFeatures'),
+    (None, 'restart'),
+    (None, 'rotate'),
+    (None, 'rx'),
+    (None, 'ry'),
+    (None, 'slope'),
+    (None, 'stemh'),
+    (None, 'stemv'),
+    (None, 'stop-color'),
+    (None, 'stop-opacity'),
+    (None, 'strikethrough-position'),
+    (None, 'strikethrough-thickness'),
+    (None, 'stroke'),
+    (None, 'stroke-dasharray'),
+    (None, 'stroke-dashoffset'),
+    (None, 'stroke-linecap'),
+    (None, 'stroke-linejoin'),
+    (None, 'stroke-miterlimit'),
+    (None, 'stroke-opacity'),
+    (None, 'stroke-width'),
+    (None, 'systemLanguage'),
+    (None, 'target'),
+    (None, 'text-anchor'),
+    (None, 'to'),
+    (None, 'transform'),
+    (None, 'type'),
+    (None, 'u1'),
+    (None, 'u2'),
+    (None, 'underline-position'),
+    (None, 'underline-thickness'),
+    (None, 'unicode'),
+    (None, 'unicode-range'),
+    (None, 'units-per-em'),
+    (None, 'values'),
+    (None, 'version'),
+    (None, 'viewBox'),
+    (None, 'visibility'),
+    (None, 'width'),
+    (None, 'widths'),
+    (None, 'x'),
+    (None, 'x-height'),
+    (None, 'x1'),
+    (None, 'x2'),
+    (namespaces['xlink'], 'actuate'),
+    (namespaces['xlink'], 'arcrole'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'role'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'title'),
+    (namespaces['xlink'], 'type'),
+    (namespaces['xml'], 'base'),
+    (namespaces['xml'], 'lang'),
+    (namespaces['xml'], 'space'),
+    (None, 'y'),
+    (None, 'y1'),
+    (None, 'y2'),
+    (None, 'zoomAndPan'),
+))
+
+# (namespace, name) keys of attributes whose values are treated as URIs.
+# Filter.allowed_token parses the value of each such attribute and deletes
+# the attribute when its scheme is not in the allowed protocols (or when a
+# ``data:`` URI fails the content-type check).
+attr_val_is_uri = frozenset((
+    (None, 'href'),
+    (None, 'src'),
+    (None, 'cite'),
+    (None, 'action'),
+    (None, 'longdesc'),
+    (None, 'poster'),
+    (None, 'background'),
+    (None, 'datasrc'),
+    (None, 'dynsrc'),
+    (None, 'lowsrc'),
+    (None, 'ping'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xml'], 'base'),
+))
+
+# (namespace, name) keys of SVG attributes whose values may contain
+# ``url(...)`` references. Filter.allowed_token replaces any ``url(...)``
+# whose target does not start with ``#`` by a single space in these values.
+svg_attr_val_allows_ref = frozenset((
+    (None, 'clip-path'),
+    (None, 'color-profile'),
+    (None, 'cursor'),
+    (None, 'fill'),
+    (None, 'filter'),
+    (None, 'marker'),
+    (None, 'marker-start'),
+    (None, 'marker-mid'),
+    (None, 'marker-end'),
+    (None, 'mask'),
+    (None, 'stroke'),
+))
+
+# SVG elements on which only local (``#fragment``) xlink:href values are
+# meant to survive sanitizing.
+# NOTE(review): members are (None, element name) tuples, not bare strings,
+# so a membership test with a plain element-name string is never true;
+# any consumer has to test with the same tuple shape.
+svg_allow_local_href = frozenset((
+    (None, 'altGlyph'),
+    (None, 'animate'),
+    (None, 'animateColor'),
+    (None, 'animateMotion'),
+    (None, 'animateTransform'),
+    (None, 'cursor'),
+    (None, 'feImage'),
+    (None, 'filter'),
+    (None, 'linearGradient'),
+    (None, 'pattern'),
+    (None, 'radialGradient'),
+    (None, 'textpath'),
+    (None, 'tref'),
+    (None, 'set'),
+    (None, 'use')
+))
+
+# CSS property names that Filter.sanitize_css keeps unconditionally; the
+# lookup is done on the lower-cased property name.
+allowed_css_properties = frozenset((
+    'azimuth',
+    'background-color',
+    'border-bottom-color',
+    'border-collapse',
+    'border-color',
+    'border-left-color',
+    'border-right-color',
+    'border-top-color',
+    'clear',
+    'color',
+    'cursor',
+    'direction',
+    'display',
+    'elevation',
+    'float',
+    'font',
+    'font-family',
+    'font-size',
+    'font-style',
+    'font-variant',
+    'font-weight',
+    'height',
+    'letter-spacing',
+    'line-height',
+    'overflow',
+    'pause',
+    'pause-after',
+    'pause-before',
+    'pitch',
+    'pitch-range',
+    'richness',
+    'speak',
+    'speak-header',
+    'speak-numeral',
+    'speak-punctuation',
+    'speech-rate',
+    'stress',
+    'text-align',
+    'text-decoration',
+    'text-indent',
+    'unicode-bidi',
+    'vertical-align',
+    'voice-family',
+    'volume',
+    'white-space',
+    'width',
+))
+
+# Keywords accepted by Filter.sanitize_css inside the value of a
+# background*/border*/margin*/padding* declaration. Each whitespace-separated
+# word of such a value must be one of these (compared case-sensitively) or
+# match the colour/length pattern used there, otherwise the declaration is
+# dropped.
+allowed_css_keywords = frozenset((
+    'auto',
+    'aqua',
+    'black',
+    'block',
+    'blue',
+    'bold',
+    'both',
+    'bottom',
+    'brown',
+    'center',
+    'collapse',
+    'dashed',
+    'dotted',
+    'fuchsia',
+    'gray',
+    'green',
+    '!important',
+    'italic',
+    'left',
+    'lime',
+    'maroon',
+    'medium',
+    'none',
+    'navy',
+    'normal',
+    'nowrap',
+    'olive',
+    'pointer',
+    'purple',
+    'red',
+    'right',
+    'solid',
+    'silver',
+    'teal',
+    'top',
+    'transparent',
+    'underline',
+    'white',
+    'yellow',
+))
+
+# SVG presentation properties that Filter.sanitize_css keeps when they appear
+# in a style attribute (checked on the lower-cased property name, after the
+# CSS property and background/border/margin/padding checks).
+allowed_svg_properties = frozenset((
+    'fill',
+    'fill-opacity',
+    'fill-rule',
+    'stroke',
+    'stroke-width',
+    'stroke-linecap',
+    'stroke-linejoin',
+    'stroke-opacity',
+))
+
+# URI schemes that may appear in URI-valued attributes. Filter.allowed_token
+# compares the scheme of the lower-cased, whitespace-stripped value against
+# this set; values without a scheme are left alone.
+allowed_protocols = frozenset((
+    'ed2k',
+    'ftp',
+    'http',
+    'https',
+    'irc',
+    'mailto',
+    'news',
+    'gopher',
+    'nntp',
+    'telnet',
+    'webcal',
+    'xmpp',
+    'callto',
+    'feed',
+    'urn',
+    'aim',
+    'rsync',
+    'tag',
+    'ssh',
+    'sftp',
+    'rtsp',
+    'afs',
+    'data',
+))
+
+# Content types accepted for ``data:`` URIs; Filter.allowed_token removes a
+# ``data:`` URI attribute whose declared content type is not listed here.
+allowed_content_types = frozenset((
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/webp',
+    'image/bmp',
+    'text/plain',
+))
+
+
+# Pattern applied to the path part of a ``data:`` URI. It must start with a
+# ``<type>/<subtype>`` content type (captured as the named group
+# ``content_type``), optionally followed by ``;charset=...`` and/or
+# ``;base64`` in either order, then a comma and the payload. The pattern is
+# compiled with re.VERBOSE, so the whitespace and ``#`` comments inside the
+# literal are not part of the match.
+data_content_type = re.compile(r'''
+                                ^
+                                # Match a content type <application>/<type>
+                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                # Match any character set and encoding
+                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                                # Assume the rest is data
+                                ,.*
+                                $
+                                ''',
+                               re.VERBOSE)
+
+
+class Filter(base.Filter):
+    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
+    def __init__(self,
+                 source,
+                 allowed_elements=allowed_elements,
+                 allowed_attributes=allowed_attributes,
+                 allowed_css_properties=allowed_css_properties,
+                 allowed_css_keywords=allowed_css_keywords,
+                 allowed_svg_properties=allowed_svg_properties,
+                 allowed_protocols=allowed_protocols,
+                 allowed_content_types=allowed_content_types,
+                 attr_val_is_uri=attr_val_is_uri,
+                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
+                 svg_allow_local_href=svg_allow_local_href):
+        """Creates a Filter
+
+        :arg source: the token stream to sanitize; passed unchanged to
+            ``base.Filter.__init__``
+
+        :arg allowed_elements: set of elements to allow--everything else will
+            be escaped
+
+        :arg allowed_attributes: set of attributes to allow in
+            elements--everything else will be stripped
+
+        :arg allowed_css_properties: set of CSS properties to allow--everything
+            else will be stripped
+
+        :arg allowed_css_keywords: set of CSS keywords to allow--everything
+            else will be stripped
+
+        :arg allowed_svg_properties: set of SVG properties to allow--everything
+            else will be removed
+
+        :arg allowed_protocols: set of allowed protocols for URIs
+
+        :arg allowed_content_types: set of allowed content types for ``data`` URIs.
+
+        :arg attr_val_is_uri: set of attributes that have URI values--values
+            that have a scheme not listed in ``allowed_protocols`` are removed
+
+        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
+            references
+
+        :arg svg_allow_local_href: set of SVG elements that can have local
+            hrefs--``xlink:href`` values on these elements that do not start
+            with ``#`` are removed. Members may be ``(None, name)`` tuples
+            (the default) or bare element names.
+
+        """
+        super(Filter, self).__init__(source)
+        self.allowed_elements = allowed_elements
+        self.allowed_attributes = allowed_attributes
+        self.allowed_css_properties = allowed_css_properties
+        self.allowed_css_keywords = allowed_css_keywords
+        self.allowed_svg_properties = allowed_svg_properties
+        self.allowed_protocols = allowed_protocols
+        self.allowed_content_types = allowed_content_types
+        self.attr_val_is_uri = attr_val_is_uri
+        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+        self.svg_allow_local_href = svg_allow_local_href
+
     def __iter__(self):
+        """Yield each sanitized token, skipping tokens that sanitize to a falsy value."""
-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
             token = self.sanitize_token(token)
             if token:
                 yield token
+
+    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
+    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
+    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
+    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
+    # allowed.
+    #
+    #   sanitize_html('<script> do_nasty_stuff() </script>')
+    #    => &lt;script> do_nasty_stuff() &lt;/script>
+    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+    #    => <a>Click here for $100</a>
+    def sanitize_token(self, token):
+        """Sanitize a single token.
+
+        :arg token: token dict; tag tokens carry ``name`` and ``namespace``
+
+        :returns: the sanitized tag token (allowed tags), a ``Characters``
+            token holding the escaped markup (disallowed tags), ``None`` for
+            comments (which ``__iter__`` then drops), or the token unchanged
+            for every other token type
+
+        """
+        # accommodate filters which use token_type differently
+        token_type = token["type"]
+        if token_type in ("StartTag", "EndTag", "EmptyTag"):
+            name = token["name"]
+            namespace = token["namespace"]
+            # A tag without a namespace is looked up as an HTML element.
+            if ((namespace, name) in self.allowed_elements or
+                (namespace is None and
+                 (namespaces["html"], name) in self.allowed_elements)):
+                return self.allowed_token(token)
+            else:
+                return self.disallowed_token(token)
+        elif token_type == "Comment":
+            # Comments are removed from the stream entirely.
+            return None
+        else:
+            return token
+
+    def allowed_token(self, token):
+        """Strip forbidden attributes and unsafe attribute values from an allowed tag.
+
+        The token's ``data`` mapping (keyed by ``(namespace, name)``) is
+        modified in place and the same token is returned.
+
+        """
+        if "data" in token:
+            attrs = token["data"]
+            attr_names = set(attrs.keys())
+
+            # Remove forbidden attributes
+            for to_remove in (attr_names - self.allowed_attributes):
+                del token["data"][to_remove]
+                attr_names.remove(to_remove)
+
+            # Remove attributes with disallowed URL values
+            for attr in (attr_names & self.attr_val_is_uri):
+                assert attr in attrs
+                # I don't have a clue where this regexp comes from or why it matches those
+                # characters, nor why we call unescape. I just know it's always been here.
+                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
+                # this will do is remove *more* than it otherwise would.
+                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                try:
+                    uri = urlparse.urlparse(val_unescaped)
+                except ValueError:
+                    uri = None
+                    del attrs[attr]
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    # ``elif``: the attribute may already have been deleted
+                    # above (e.g. ``data`` excluded from allowed_protocols);
+                    # deleting it a second time would raise KeyError.
+                    elif uri.scheme == 'data':
+                        m = data_content_type.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
+            # Blank out url(...) references that are not local (#fragment).
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            # The default svg_allow_local_href holds (None, name) tuples, so a
+            # bare-name test alone never matches; accept both shapes.
+            name = token["name"]
+            xlink_href = (namespaces['xlink'], 'href')
+            if ((name in self.svg_allow_local_href or
+                 (None, name) in self.svg_allow_local_href) and
+                xlink_href in attrs and re.search(r'^\s*[^#\s].*',
+                                                  attrs[xlink_href])):
+                del attrs[xlink_href]
+            if (None, 'style') in attrs:
+                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
+            token["data"] = attrs
+        return token
+
+    def disallowed_token(self, token):
+        """Turn a disallowed tag token into a ``Characters`` token.
+
+        The tag is re-rendered as text (attributes included, values escaped)
+        and stored in ``token["data"]``; ``type`` becomes ``"Characters"`` and
+        ``name`` is removed. The token is modified in place and returned.
+
+        """
+        token_type = token["type"]
+        if token_type == "EndTag":
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            assert token_type in ("StartTag", "EmptyTag")
+            attrs = []
+            for (ns, name), v in token["data"].items():
+                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
+            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"] = token["data"][:-1] + "/>"
+
+        token["type"] = "Characters"
+
+        del token["name"]
+        return token
+
+    def sanitize_css(self, style):
+        """Return ``style`` reduced to the allowed declarations.
+
+        :arg style: the text of a ``style`` attribute
+
+        :returns: the kept declarations, each rendered as ``prop: value;``
+            and joined by single spaces, or ``''`` when the text fails
+            either sanity check
+
+        """
+        # disallow urls
+        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)
+
+        # gauntlet
+        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+            return ''
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
+                                                'padding']:
+                # Keep the declaration only if every word of the value is an
+                # allowed keyword or looks like a colour/length.
+                for keyword in value.split():
+                    if keyword not in self.allowed_css_keywords and \
+                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
+                        break
+                else:
+                    clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
@@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals

 import re

-from . import _base
+from . import base
 from ..constants import rcdataElements, spaceCharacters
 spaceCharacters = "".join(spaceCharacters)

 SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


-class Filter(_base.Filter):
-
+class Filter(base.Filter):
+    """Collapses whitespace except in pre, textarea, and script elements"""
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
                    and (preserve or token["name"] in self.spacePreserveElements):
@@ -1,271 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import re
-from xml.sax.saxutils import escape, unescape
-
-from .tokenizer import HTMLTokenizer
-from .constants import tokenTypes
-
-
-class HTMLSanitizerMixin(object):
-    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
-
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
-                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
-                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
-                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
-                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
-                           'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
-                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
-                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
-                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
-                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
-                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
-                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
-                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-
-    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
-                       'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
-                       'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
-                       'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
-                       'munderover', 'none']
-
-    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
-                    'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
-                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
-                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
-                    'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
-                    'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
-                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
-                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
-                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
-                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
-                             'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
-                             'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
-                             'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
-                             'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
-                             'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
-                             'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
-                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
-                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
-                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
-                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
-                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
-                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
-                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
-                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
-                             'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
-                             'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
-                             'width', 'wrap', 'xml:lang']
-
-    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
-                         'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
-                         'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
-                         'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
-                         'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
-                         'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
-                         'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
-                         'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
-                         'xlink:type', 'xmlns', 'xmlns:xlink']
-
-    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
-                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
-                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
-                      'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
-                      'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
-                      'fill-opacity', 'fill-rule', 'font-family', 'font-size',
-                      'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
-                      'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
-                      'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
-                      'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
-                      'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
-                      'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
-                      'opacity', 'orient', 'origin', 'overline-position',
-                      'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
-                      'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
-                      'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
-                      'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
-                      'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
-                      'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
-                      'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
-                      'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
-                      'transform', 'type', 'u1', 'u2', 'underline-position',
-                      'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
-                      'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
-                      'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
-                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
-                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
-                      'y1', 'y2', 'zoomAndPan']
-
-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
-                       'xlink:href', 'xml:base']
-
-    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
-                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
-                               'mask', 'stroke']
-
-    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
-                            'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
-                            'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
-                            'set', 'use']
-
-    acceptable_css_properties = ['azimuth', 'background-color',
-                                 'border-bottom-color', 'border-collapse', 'border-color',
-                                 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
-                                 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
-                                 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
-                                 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
-                                 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
-                                 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
-                                 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
-                                 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
-                                 'white-space', 'width']
-
-    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
-                               'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
-                               'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
-                               'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
-                               'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
-                               'transparent', 'underline', 'white', 'yellow']
-
-    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
-                                 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
-                                 'stroke-opacity']
-
-    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
-                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
-                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
-
-    # subclasses may define their own versions of these constants
-    allowed_elements = acceptable_elements + mathml_elements + svg_elements
-    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
-    allowed_css_properties = acceptable_css_properties
-    allowed_css_keywords = acceptable_css_keywords
-    allowed_svg_properties = acceptable_svg_properties
-    allowed_protocols = acceptable_protocols
-
-    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
-    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
-    # attributes are parsed, and a restricted set, # specified by
-    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
-    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
-    # in ALLOWED_PROTOCOLS are allowed.
-    #
-    #   sanitize_html('<script> do_nasty_stuff() </script>')
-    #    => &lt;script> do_nasty_stuff() &lt;/script>
-    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
-    #    => <a>Click here for $100</a>
-    def sanitize_token(self, token):
-
-        # accommodate filters which use token_type differently
-        token_type = token["type"]
-        if token_type in list(tokenTypes.keys()):
-            token_type = tokenTypes[token_type]
-
-        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
-                          tokenTypes["EmptyTag"]):
-            if token["name"] in self.allowed_elements:
-                return self.allowed_token(token, token_type)
-            else:
-                return self.disallowed_token(token, token_type)
-        elif token_type == tokenTypes["Comment"]:
-            pass
-        else:
-            return token
-
-    def allowed_token(self, token, token_type):
-        if "data" in token:
-            attrs = dict([(name, val) for name, val in
-                          token["data"][::-1]
-                          if name in self.allowed_attributes])
-            for attr in self.attr_val_is_uri:
-                if attr not in attrs:
-                    continue
-                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                       unescape(attrs[attr])).lower()
-                # remove replacement characters from unescaped characters
-                val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
-            for attr in self.svg_attr_val_allows_ref:
-                if attr in attrs:
-                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                         ' ',
-                                         unescape(attrs[attr]))
-            if (token["name"] in self.svg_allow_local_href and
-                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
-                                                    attrs['xlink:href'])):
-                del attrs['xlink:href']
-            if 'style' in attrs:
-                attrs['style'] = self.sanitize_css(attrs['style'])
-            token["data"] = [[name, val] for name, val in list(attrs.items())]
-        return token
-
-    def disallowed_token(self, token, token_type):
-        if token_type == tokenTypes["EndTag"]:
-            token["data"] = "</%s>" % token["name"]
-        elif token["data"]:
-            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
-            token["data"] = "<%s%s>" % (token["name"], attrs)
-        else:
-            token["data"] = "<%s>" % token["name"]
-        if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
-
-        if token["type"] in list(tokenTypes.keys()):
-            token["type"] = "Characters"
-        else:
-            token["type"] = tokenTypes["Characters"]
-
-        del token["name"]
-        return token
-
-    def sanitize_css(self, style):
-        # disallow urls
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # gauntlet
-        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
-            return ''
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
-            if not value:
-                continue
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
-                                                'padding']:
-                for keyword in value.split():
-                    if not keyword in self.acceptable_css_keywords and \
-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
-                        break
-                else:
-                    clean.append(prop + ': ' + value + ';')
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
-
-
-class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
-        # Change case matching defaults as we only output lowercase html anyway
-        # This solution doesn't seem ideal...
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName, parser=parser)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
@@ -0,0 +1,409 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+import re
+
+from codecs import register_error, xmlcharrefreplace_errors
+
+from .constants import voidElements, booleanAttributes, spaceCharacters
+from .constants import rcdataElements, entities, xmlEntities
+from . import treewalkers, _utils
+from xml.sax.saxutils import escape
+
+_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
+_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
+_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
+                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                   "\u3000]")
+
+
+_encode_entity_map = {}
+_is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+    # skip multi-character entities
+    if ((_is_ucs4 and len(v) > 1) or
+            (not _is_ucs4 and len(v) > 2)):
+        continue
+    if v != "&":
+        if len(v) == 2:
+            v = _utils.surrogatePairToCodepoint(v)
+        else:
+            v = ord(v)
+        if v not in _encode_entity_map or k.islower():
+            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
+            _encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+        res = []
+        codepoints = []
+        skip = False
+        for i, c in enumerate(exc.object[exc.start:exc.end]):
+            if skip:
+                skip = False
+                continue
+            index = i + exc.start
+            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+                skip = True
+            else:
+                codepoint = ord(c)
+            codepoints.append(codepoint)
+        for cp in codepoints:
+            e = _encode_entity_map.get(cp)
+            if e:
+                res.append("&")
+                res.append(e)
+                if not e.endswith(";"):
+                    res.append(";")
+            else:
+                res.append("&#x%s;" % (hex(cp)[2:]))
+        return ("".join(res), exc.end)
+    else:
+        return xmlcharrefreplace_errors(exc)
+
+
+register_error("htmlentityreplace", htmlentityreplace_errors)
+
+
+def serialize(input, tree="etree", encoding=None, **serializer_opts):
+    """Serializes the input token stream using the specified treewalker
+
+    :arg input: the token stream to serialize
+
+    :arg tree: the treewalker to use
+
+    :arg encoding: the encoding to use
+
+    :arg serializer_opts: any options to pass to the
+        :py:class:`html5lib.serializer.HTMLSerializer` that gets created
+
+    :returns: the tree serialized as a string
+
+    Example:
+
+    >>> from html5lib.html5parser import parse
+    >>> from html5lib.serializer import serialize
+    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
+    >>> serialize(token_stream, omit_optional_tags=False)
+    '<html><head></head><body><p>Hi!</p></body></html>'
+
+    """
+    # XXX: Should we cache this?
+    walker = treewalkers.getTreeWalker(tree)
+    s = HTMLSerializer(**serializer_opts)
+    return s.render(walker(input), encoding)
+
+
+class HTMLSerializer(object):
+
+    # attribute quoting options
+    quote_attr_values = "legacy"  # be secure by default
+    quote_char = '"'
+    use_best_quote_char = True
+
+    # tag syntax options
+    omit_optional_tags = True
+    minimize_boolean_attributes = True
+    use_trailing_solidus = False
+    space_before_trailing_solidus = True
+
+    # escaping options
+    escape_lt_in_attrs = False
+    escape_rcdata = False
+    resolve_entities = True
+
+    # miscellaneous options
+    alphabetical_attributes = False
+    inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+
+    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
+               "omit_optional_tags", "minimize_boolean_attributes",
+               "use_trailing_solidus", "space_before_trailing_solidus",
+               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
+               "alphabetical_attributes", "inject_meta_charset",
+               "strip_whitespace", "sanitize")
+
+    def __init__(self, **kwargs):
+        """Initialize HTMLSerializer
+
+        :arg inject_meta_charset: Whether or not to inject the meta charset.
+
+            Defaults to ``True``.
+
+        :arg quote_attr_values: Whether to quote attribute values that don't
+            require quoting per legacy browser behavior (``"legacy"``), when
+            required by the standard (``"spec"``), or always (``"always"``).
+
+            Defaults to ``"legacy"``.
+
+        :arg quote_char: Use given quote character for attribute quoting.
+
+            Defaults to ``"`` which will use double quotes unless attribute
+            value contains a double quote, in which case single quotes are
+            used.
+
+        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
+            values.
+
+            Defaults to ``False``.
+
+        :arg escape_rcdata: Whether to escape characters that need to be
+            escaped within normal elements within rcdata elements such as
+            style.
+
+            Defaults to ``False``.
+
+        :arg resolve_entities: Whether to resolve named character entities that
+            appear in the source tree. The XML predefined entities &lt; &gt;
+            &amp; &quot; &apos; are unaffected by this setting.
+
+            Defaults to ``True``.
+
+        :arg strip_whitespace: Whether to remove semantically meaningless
+            whitespace. (This compresses all whitespace to a single space
+            except within ``pre``.)
+
+            Defaults to ``False``.
+
+        :arg minimize_boolean_attributes: Shortens boolean attributes to give
+            just the attribute value, for example::
+
+              <input disabled="disabled">
+
+            becomes::
+
+              <input disabled>
+
+            Defaults to ``True``.
+
+        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
+            start tag of void elements (empty elements whose end tag is
+            forbidden). E.g. ``<hr/>``.
+
+            Defaults to ``False``.
+
+        :arg space_before_trailing_solidus: Places a space immediately before
+            the closing slash in a tag using a trailing solidus. E.g.
+            ``<hr />``. Requires ``use_trailing_solidus=True``.
+
+            Defaults to ``True``.
+
+        :arg sanitize: Strip all unsafe or unknown constructs from output.
+            See :py:class:`html5lib.filters.sanitizer.Filter`.
+
+            Defaults to ``False``.
+
+        :arg omit_optional_tags: Omit start/end tags that are optional.
+
+            Defaults to ``True``.
+
+        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
+
+            Defaults to ``False``.
+
+        """
+        unexpected_args = frozenset(kwargs) - frozenset(self.options)
+        if len(unexpected_args) > 0:
+            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
+        if 'quote_char' in kwargs:
+            self.use_best_quote_char = False
+        for attr in self.options:
+            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
+        self.errors = []
+        self.strict = False
+
+    def encode(self, string):
+        assert(isinstance(string, text_type))
+        if self.encoding:
+            return string.encode(self.encoding, "htmlentityreplace")
+        else:
+            return string
+
+    def encodeStrict(self, string):
+        assert(isinstance(string, text_type))
+        if self.encoding:
+            return string.encode(self.encoding, "strict")
+        else:
+            return string
+
+    def serialize(self, treewalker, encoding=None):
+        # pylint:disable=too-many-nested-blocks
+        self.encoding = encoding
+        in_cdata = False
+        self.errors = []
+
+        if encoding and self.inject_meta_charset:
+            from .filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
+        # Alphabetical attributes is here under the assumption that none of
+        # the later filters add or change order of attributes; it needs to be
+        # before the sanitizer so escaped elements come out correctly
+        if self.alphabetical_attributes:
+            from .filters.alphabeticalattributes import Filter
+            treewalker = Filter(treewalker)
+        # WhitespaceFilter should be used before OptionalTagFilter
+        # for maximum efficiently of this latter filter
+        if self.strip_whitespace:
+            from .filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from .filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
+        if self.omit_optional_tags:
+            from .filters.optionaltags import Filter
+            treewalker = Filter(treewalker)
+
+        for token in treewalker:
+            type = token["type"]
+            if type == "Doctype":
+                doctype = "<!DOCTYPE %s" % token["name"]
+
+                if token["publicId"]:
+                    doctype += ' PUBLIC "%s"' % token["publicId"]
+                elif token["systemId"]:
+                    doctype += " SYSTEM"
+                if token["systemId"]:
+                    if token["systemId"].find('"') >= 0:
+                        if token["systemId"].find("'") >= 0:
+                            self.serializeError("System identifer contains both single and double quote characters")
+                        quote_char = "'"
+                    else:
+                        quote_char = '"'
+                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
+
+                doctype += ">"
+                yield self.encodeStrict(doctype)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                if type == "SpaceCharacters" or in_cdata:
+                    if in_cdata and token["data"].find("</") >= 0:
+                        self.serializeError("Unexpected </ in CDATA")
+                    yield self.encode(token["data"])
+                else:
+                    yield self.encode(escape(token["data"]))
+
+            elif type in ("StartTag", "EmptyTag"):
+                name = token["name"]
+                yield self.encodeStrict("<%s" % name)
+                if name in rcdataElements and not self.escape_rcdata:
+                    in_cdata = True
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                for (_, attr_name), attr_value in token["data"].items():
+                    # TODO: Add namespace support here
+                    k = attr_name
+                    v = attr_value
+                    yield self.encodeStrict(' ')
+
+                    yield self.encodeStrict(k)
+                    if not self.minimize_boolean_attributes or \
+                        (k not in booleanAttributes.get(name, tuple()) and
+                         k not in booleanAttributes.get("", tuple())):
+                        yield self.encodeStrict("=")
+                        if self.quote_attr_values == "always" or len(v) == 0:
+                            quote_attr = True
+                        elif self.quote_attr_values == "spec":
+                            quote_attr = _quoteAttributeSpec.search(v) is not None
+                        elif self.quote_attr_values == "legacy":
+                            quote_attr = _quoteAttributeLegacy.search(v) is not None
+                        else:
+                            raise ValueError("quote_attr_values must be one of: "
+                                             "'always', 'spec', or 'legacy'")
+                        v = v.replace("&", "&amp;")
+                        if self.escape_lt_in_attrs:
+                            v = v.replace("<", "&lt;")
+                        if quote_attr:
+                            quote_char = self.quote_char
+                            if self.use_best_quote_char:
+                                if "'" in v and '"' not in v:
+                                    quote_char = '"'
+                                elif '"' in v and "'" not in v:
+                                    quote_char = "'"
+                            if quote_char == "'":
+                                v = v.replace("'", "&#39;")
+                            else:
+                                v = v.replace('"', "&quot;")
+                            yield self.encodeStrict(quote_char)
+                            yield self.encode(v)
+                            yield self.encodeStrict(quote_char)
+                        else:
+                            yield self.encode(v)
+                if name in voidElements and self.use_trailing_solidus:
+                    if self.space_before_trailing_solidus:
+                        yield self.encodeStrict(" /")
+                    else:
+                        yield self.encodeStrict("/")
+                yield self.encode(">")
+
+            elif type == "EndTag":
+                name = token["name"]
+                if name in rcdataElements:
+                    in_cdata = False
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                yield self.encodeStrict("</%s>" % name)
+
+            elif type == "Comment":
+                data = token["data"]
+                if data.find("--") >= 0:
+                    self.serializeError("Comment contains --")
+                yield self.encodeStrict("<!--%s-->" % token["data"])
+
+            elif type == "Entity":
+                name = token["name"]
+                key = name + ";"
+                if key not in entities:
+                    self.serializeError("Entity %s not recognized" % name)
+                if self.resolve_entities and key not in xmlEntities:
+                    data = entities[key]
+                else:
+                    data = "&%s;" % name
+                yield self.encodeStrict(data)
+
+            else:
+                self.serializeError(token["data"])
+
+    def render(self, treewalker, encoding=None):
+        """Serializes the stream from the treewalker into a string
+
+        :arg treewalker: the treewalker to serialize
+
+        :arg encoding: the string encoding to use
+
+        :returns: the serialized tree
+
+        Example:
+
+        >>> from html5lib import parse, getTreeWalker
+        >>> from html5lib.serializer import HTMLSerializer
+        >>> token_stream = parse('<html><body>Hi!</body></html>')
+        >>> walker = getTreeWalker('etree')
+        >>> serializer = HTMLSerializer(omit_optional_tags=False)
+        >>> serializer.render(walker(token_stream))
+        '<html><head></head><body>Hi!</body></html>'
+
+        """
+        if encoding:
+            return b"".join(list(self.serialize(treewalker, encoding)))
+        else:
+            return "".join(list(self.serialize(treewalker)))
+
+    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
+        # XXX The idea is to make data mandatory.
+        self.errors.append(data)
+        if self.strict:
+            raise SerializeError
+
+
+class SerializeError(Exception):
+    """Error in serialized tree"""
+    pass
@@ -1,16 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from .. import treewalkers
-
-from .htmlserializer import HTMLSerializer
-
-
-def serialize(input, tree="etree", format="html", encoding=None,
-              **serializer_opts):
-    # XXX: Should we cache this?
-    walker = treewalkers.getTreeWalker(tree)
-    if format == "html":
-        s = HTMLSerializer(**serializer_opts)
-    else:
-        raise ValueError("type must be html")
-    return s.render(walker(input), encoding)
@@ -1,320 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
-import gettext
-_ = gettext.gettext
-
-try:
-    from functools import reduce
-except ImportError:
-    pass
-
-from ..constants import voidElements, booleanAttributes, spaceCharacters
-from ..constants import rcdataElements, entities, xmlEntities
-from .. import utils
-from xml.sax.saxutils import escape
-
-spaceCharacters = "".join(spaceCharacters)
-
-try:
-    from codecs import register_error, xmlcharrefreplace_errors
-except ImportError:
-    unicode_encode_errors = "strict"
-else:
-    unicode_encode_errors = "htmlentityreplace"
-
-    encode_entity_map = {}
-    is_ucs4 = len("\U0010FFFF") == 1
-    for k, v in list(entities.items()):
-        # skip multi-character entities
-        if ((is_ucs4 and len(v) > 1) or
-                (not is_ucs4 and len(v) > 2)):
-            continue
-        if v != "&":
-            if len(v) == 2:
-                v = utils.surrogatePairToCodepoint(v)
-            else:
-                v = ord(v)
-            if not v in encode_entity_map or k.islower():
-                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
-                encode_entity_map[v] = k
-
-    def htmlentityreplace_errors(exc):
-        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
-            res = []
-            codepoints = []
-            skip = False
-            for i, c in enumerate(exc.object[exc.start:exc.end]):
-                if skip:
-                    skip = False
-                    continue
-                index = i + exc.start
-                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
-                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
-                    skip = True
-                else:
-                    codepoint = ord(c)
-                codepoints.append(codepoint)
-            for cp in codepoints:
-                e = encode_entity_map.get(cp)
-                if e:
-                    res.append("&")
-                    res.append(e)
-                    if not e.endswith(";"):
-                        res.append(";")
-                else:
-                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return ("".join(res), exc.end)
-        else:
-            return xmlcharrefreplace_errors(exc)
-
-    register_error(unicode_encode_errors, htmlentityreplace_errors)
-
-    del register_error
-
-
-class HTMLSerializer(object):
-
-    # attribute quoting options
-    quote_attr_values = False
-    quote_char = '"'
-    use_best_quote_char = True
-
-    # tag syntax options
-    omit_optional_tags = True
-    minimize_boolean_attributes = True
-    use_trailing_solidus = False
-    space_before_trailing_solidus = True
-
-    # escaping options
-    escape_lt_in_attrs = False
-    escape_rcdata = False
-    resolve_entities = True
-
-    # miscellaneous options
-    alphabetical_attributes = False
-    inject_meta_charset = True
-    strip_whitespace = False
-    sanitize = False
-
-    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
-               "omit_optional_tags", "minimize_boolean_attributes",
-               "use_trailing_solidus", "space_before_trailing_solidus",
-               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
-               "alphabetical_attributes", "inject_meta_charset",
-               "strip_whitespace", "sanitize")
-
-    def __init__(self, **kwargs):
-        """Initialize HTMLSerializer.
-
-        Keyword options (default given first unless specified) include:
-
-        inject_meta_charset=True|False
-          Whether it insert a meta element to define the character set of the
-          document.
-        quote_attr_values=True|False
-          Whether to quote attribute values that don't require quoting
-          per HTML5 parsing rules.
-        quote_char=u'"'|u"'"
-          Use given quote character for attribute quoting. Default is to
-          use double quote unless attribute value contains a double quote,
-          in which case single quotes are used instead.
-        escape_lt_in_attrs=False|True
-          Whether to escape < in attribute values.
-        escape_rcdata=False|True
-          Whether to escape characters that need to be escaped within normal
-          elements within rcdata elements such as style.
-        resolve_entities=True|False
-          Whether to resolve named character entities that appear in the
-          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
-          are unaffected by this setting.
-        strip_whitespace=False|True
-          Whether to remove semantically meaningless whitespace. (This
-          compresses all whitespace to a single space except within pre.)
-        minimize_boolean_attributes=True|False
-          Shortens boolean attributes to give just the attribute value,
-          for example <input disabled="disabled"> becomes <input disabled>.
-        use_trailing_solidus=False|True
-          Includes a close-tag slash at the end of the start tag of void
-          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
-        space_before_trailing_solidus=True|False
-          Places a space immediately before the closing slash in a tag
-          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
-        sanitize=False|True
-          Strip all unsafe or unknown constructs from output.
-          See `html5lib user documentation`_
-        omit_optional_tags=True|False
-          Omit start/end tags that are optional.
-        alphabetical_attributes=False|True
-          Reorder attributes to be in alphabetical order.
-
-        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
-        """
-        if 'quote_char' in kwargs:
-            self.use_best_quote_char = False
-        for attr in self.options:
-            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
-        self.errors = []
-        self.strict = False
-
-    def encode(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, unicode_encode_errors)
-        else:
-            return string
-
-    def encodeStrict(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, "strict")
-        else:
-            return string
-
-    def serialize(self, treewalker, encoding=None):
-        self.encoding = encoding
-        in_cdata = False
-        self.errors = []
-
-        if encoding and self.inject_meta_charset:
-            from ..filters.inject_meta_charset import Filter
-            treewalker = Filter(treewalker, encoding)
-        # WhitespaceFilter should be used before OptionalTagFilter
-        # for maximum efficiently of this latter filter
-        if self.strip_whitespace:
-            from ..filters.whitespace import Filter
-            treewalker = Filter(treewalker)
-        if self.sanitize:
-            from ..filters.sanitizer import Filter
-            treewalker = Filter(treewalker)
-        if self.omit_optional_tags:
-            from ..filters.optionaltags import Filter
-            treewalker = Filter(treewalker)
-        # Alphabetical attributes must be last, as other filters
-        # could add attributes and alter the order
-        if self.alphabetical_attributes:
-            from ..filters.alphabeticalattributes import Filter
-            treewalker = Filter(treewalker)
-
-        for token in treewalker:
-            type = token["type"]
-            if type == "Doctype":
-                doctype = "<!DOCTYPE %s" % token["name"]
-
-                if token["publicId"]:
-                    doctype += ' PUBLIC "%s"' % token["publicId"]
-                elif token["systemId"]:
-                    doctype += " SYSTEM"
-                if token["systemId"]:
-                    if token["systemId"].find('"') >= 0:
-                        if token["systemId"].find("'") >= 0:
-                            self.serializeError(_("System identifer contains both single and double quote characters"))
-                        quote_char = "'"
-                    else:
-                        quote_char = '"'
-                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
-
-                doctype += ">"
-                yield self.encodeStrict(doctype)
-
-            elif type in ("Characters", "SpaceCharacters"):
-                if type == "SpaceCharacters" or in_cdata:
-                    if in_cdata and token["data"].find("</") >= 0:
-                        self.serializeError(_("Unexpected </ in CDATA"))
-                    yield self.encode(token["data"])
-                else:
-                    yield self.encode(escape(token["data"]))
-
-            elif type in ("StartTag", "EmptyTag"):
-                name = token["name"]
-                yield self.encodeStrict("<%s" % name)
-                if name in rcdataElements and not self.escape_rcdata:
-                    in_cdata = True
-                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                for (attr_namespace, attr_name), attr_value in token["data"].items():
-                    # TODO: Add namespace support here
-                    k = attr_name
-                    v = attr_value
-                    yield self.encodeStrict(' ')
-
-                    yield self.encodeStrict(k)
-                    if not self.minimize_boolean_attributes or \
-                        (k not in booleanAttributes.get(name, tuple())
-                         and k not in booleanAttributes.get("", tuple())):
-                        yield self.encodeStrict("=")
-                        if self.quote_attr_values or not v:
-                            quote_attr = True
-                        else:
-                            quote_attr = reduce(lambda x, y: x or (y in v),
-                                                spaceCharacters + ">\"'=", False)
-                        v = v.replace("&", "&amp;")
-                        if self.escape_lt_in_attrs:
-                            v = v.replace("<", "&lt;")
-                        if quote_attr:
-                            quote_char = self.quote_char
-                            if self.use_best_quote_char:
-                                if "'" in v and '"' not in v:
-                                    quote_char = '"'
-                                elif '"' in v and "'" not in v:
-                                    quote_char = "'"
-                            if quote_char == "'":
-                                v = v.replace("'", "&#39;")
-                            else:
-                                v = v.replace('"', "&quot;")
-                            yield self.encodeStrict(quote_char)
-                            yield self.encode(v)
-                            yield self.encodeStrict(quote_char)
-                        else:
-                            yield self.encode(v)
-                if name in voidElements and self.use_trailing_solidus:
-                    if self.space_before_trailing_solidus:
-                        yield self.encodeStrict(" /")
-                    else:
-                        yield self.encodeStrict("/")
-                yield self.encode(">")
-
-            elif type == "EndTag":
-                name = token["name"]
-                if name in rcdataElements:
-                    in_cdata = False
-                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                yield self.encodeStrict("</%s>" % name)
-
-            elif type == "Comment":
-                data = token["data"]
-                if data.find("--") >= 0:
-                    self.serializeError(_("Comment contains --"))
-                yield self.encodeStrict("<!--%s-->" % token["data"])
-
-            elif type == "Entity":
-                name = token["name"]
-                key = name + ";"
-                if not key in entities:
-                    self.serializeError(_("Entity %s not recognized" % name))
-                if self.resolve_entities and key not in xmlEntities:
-                    data = entities[key]
-                else:
-                    data = "&%s;" % name
-                yield self.encodeStrict(data)
-
-            else:
-                self.serializeError(token["data"])
-
-    def render(self, treewalker, encoding=None):
-        if encoding:
-            return b"".join(list(self.serialize(treewalker, encoding)))
-        else:
-            return "".join(list(self.serialize(treewalker)))
-
-    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
-        # XXX The idea is to make data mandatory.
-        self.errors.append(data)
-        if self.strict:
-            raise SerializeError
-
-
-def SerializeError(Exception):
-    """Error in serialized tree"""
-    pass
@@ -0,0 +1,108 @@
+from __future__ import print_function
+import os.path
+import sys
+
+import pkg_resources
+import pytest
+
+from .tree_construction import TreeConstructionFile
+from .tokenizer import TokenizerFile
+from .sanitizer import SanitizerFile
+
+_dir = os.path.abspath(os.path.dirname(__file__))
+_root = os.path.join(_dir, "..", "..")
+_testdata = os.path.join(_dir, "testdata")
+_tree_construction = os.path.join(_testdata, "tree-construction")
+_tokenizer = os.path.join(_testdata, "tokenizer")
+_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
+
+
+def fail_if_missing_pytest_expect():
+    """Throws an exception halting pytest if pytest-expect isn't working
+
+    On failure a banner explaining the problem is printed to stderr and the
+    original ImportError is re-raised.
+    """
+    try:
+        # Imported only to probe that the plugin is importable; the name
+        # itself is not used.
+        from pytest_expect import expect  # noqa
+    except ImportError:
+        header = '*' * 78
+        print(
+            '\n' +
+            header + '\n' +
+            'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n' +
+            'installed. Please install them both before running pytest.\n' +
+            header + '\n',
+            file=sys.stderr
+        )
+        # Re-raise so the failure is not silently swallowed.
+        raise
+
+
+fail_if_missing_pytest_expect()
+
+
+def pytest_configure(config):
+    """Abort the test run early if the environment is not usable
+
+    Problems are collected in ``msgs``; if any were found, pytest is stopped
+    via ``pytest.exit`` with all messages joined by newlines.
+
+    The test data directory is always checked. The optional-requirements and
+    cElementTree checks only run when ``config.option.update_xfail`` is set.
+    """
+    msgs = []
+
+    if not os.path.exists(_testdata):
+        msg = "testdata not available! "
+        # Tailor the hint: a git checkout can fetch the data as a submodule,
+        # anything else gets a generic explanation.
+        if os.path.exists(os.path.join(_root, ".git")):
+            msg += ("Please run git submodule update --init --recursive " +
+                    "and then run tests again.")
+        else:
+            msg += ("The testdata doesn't appear to be included with this package, " +
+                    "so finding the right version will be hard. :(")
+        msgs.append(msg)
+
+    if config.option.update_xfail:
+        # Check for optional requirements
+        req_file = os.path.join(_root, "requirements-optional.txt")
+        if os.path.exists(req_file):
+            with open(req_file, "r") as fp:
+                for line in fp:
+                    # Skip blank lines, "-r" includes and "#" comments.
+                    # NOTE(review): startswith() is applied to the unstripped
+                    # line, so an indented "-r"/"#" line would not be skipped.
+                    if (line.strip() and
+                        not (line.startswith("-r") or
+                             line.startswith("#"))):
+                        # Split an optional environment marker (the part
+                        # after the first ";") off the requirement spec.
+                        if ";" in line:
+                            spec, marker = line.strip().split(";", 1)
+                        else:
+                            spec, marker = line.strip(), None
+                        req = pkg_resources.Requirement.parse(spec)
+                        # A marker that evaluates false is reported as a
+                        # problem as well, not silently skipped.
+                        if marker and not pkg_resources.evaluate_marker(marker):
+                            msgs.append("%s not available in this environment" % spec)
+                        else:
+                            try:
+                                installed = pkg_resources.working_set.find(req)
+                            except pkg_resources.VersionConflict:
+                                msgs.append("Outdated version of %s installed, need %s" % (req.name, spec))
+                            else:
+                                if not installed:
+                                    msgs.append("Need %s" % spec)
+
+        # Check cElementTree
+        import xml.etree.ElementTree as ElementTree
+
+        try:
+            import xml.etree.cElementTree as cElementTree
+        except ImportError:
+            msgs.append("cElementTree unable to be imported")
+        else:
+            # Same Element class means cElementTree is not a separate
+            # implementation here.
+            if cElementTree.Element is ElementTree.Element:
+                msgs.append("cElementTree is just an alias for ElementTree")
+
+    if msgs:
+        pytest.exit("\n".join(msgs))
+
+
+def pytest_collect_file(path, parent):
+    """Pick a custom collector for data files under the test data directories
+
+    :arg path: the file being considered; its ``dirname`` and ``ext``
+        attributes are read (presumably a ``py.path.local`` -- confirm
+        against the pytest version in use)
+
+    :arg parent: passed through unchanged to the collector constructor
+
+    :returns: a ``TreeConstructionFile``, ``TokenizerFile`` or
+        ``SanitizerFile`` when the file lives below the matching directory
+        and has the expected extension; otherwise ``None`` (implicitly)
+    """
+    # Build the set containing the file's directory and every ancestor.
+    # The loop stops once os.path.dirname() yields a directory already seen.
+    dir = os.path.abspath(path.dirname)
+    dir_and_parents = set()
+    while dir not in dir_and_parents:
+        dir_and_parents.add(dir)
+        dir = os.path.dirname(dir)
+
+    # Only the first matching data directory is considered (elif chain).
+    if _tree_construction in dir_and_parents:
+        if path.ext == ".dat":
+            return TreeConstructionFile(path, parent)
+    elif _tokenizer in dir_and_parents:
+        if path.ext == ".test":
+            return TokenizerFile(path, parent)
+    elif _sanitizer_testdata in dir_and_parents:
+        if path.ext == ".dat":
+            return SanitizerFile(path, parent)
@@ -1,41 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import sys
-import os
-
-if __name__ == '__main__':
-    # Allow us to import from the src directory
-    os.chdir(os.path.split(os.path.abspath(__file__))[0])
-    sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
-
-from html5lib.tokenizer import HTMLTokenizer
-
-
-class HTMLParser(object):
-    """ Fake parser to test tokenizer output """
-    def parse(self, stream, output=True):
-        tokenizer = HTMLTokenizer(stream)
-        for token in tokenizer:
-            if output:
-                print(token)
-
-if __name__ == "__main__":
-    x = HTMLParser()
-    if len(sys.argv) > 1:
-        if len(sys.argv) > 2:
-            import hotshot
-            import hotshot.stats
-            prof = hotshot.Profile('stats.prof')
-            prof.runcall(x.parse, sys.argv[1], False)
-            prof.close()
-            stats = hotshot.stats.load('stats.prof')
-            stats.strip_dirs()
-            stats.sort_stats('time')
-            stats.print_stats()
-        else:
-            x.parse(sys.argv[1])
-    else:
-        print("""Usage: python mockParser.py filename [stats]
-        If stats is specified the hotshots profiler will run and output the
-        stats instead.
-        """)
@@ -1,36 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-
-def f1():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x += y + z
-
-
-def f2():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = x + y + z
-
-
-def f3():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = "".join((x, y, z))
-
-
-def f4():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = "%s%s%s" % (x, y, z)
-
-import timeit
-for x in range(4):
-    statement = "f%s" % (x + 1)
-    t = timeit.Timer(statement, "from __main__ import " + statement)
-    r = t.repeat(3, 1000000)
-    print(r, min(r))
@@ -0,0 +1,50 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import codecs
+import json
+
+import pytest
+
+from html5lib import parseFragment, serialize
+
+
+class SanitizerFile(pytest.File):
+    """Collector turning one JSON sanitizer data file into test items"""
+
+    def collect(self):
+        """Yield one ``SanitizerTest`` per entry of the JSON file
+
+        The file at ``self.fspath`` is read as UTF-8 and parsed as JSON; each
+        entry becomes an item named after its index in the file.
+        """
+        with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
+            tests = json.load(fp)
+        for i, test in enumerate(tests):
+            yield SanitizerTest(str(i), self, test=test)
+
+
+class SanitizerTest(pytest.Item):
+    """A single sanitizer test case
+
+    ``test`` is a mapping; its ``"input"`` and ``"output"`` entries are read
+    by ``runtest``.
+    """
+
+    def __init__(self, name, parent, test):
+        super(SanitizerTest, self).__init__(name, parent)
+        self.obj = lambda: 1  # this is to hack around skipif needing a function!
+        self.test = test
+
+    def runtest(self):
+        """Parse the input fragment, serialize it sanitized, compare
+
+        :raises AssertionError: if the serialized result differs from the
+            expected output; the message shows input, expected and received
+        """
+        input = self.test["input"]
+        expected = self.test["output"]
+
+        parsed = parseFragment(input)
+        # Serializer options are fixed here; the expected output in the data
+        # file is compared against exactly this configuration.
+        serialized = serialize(parsed,
+                               sanitize=True,
+                               omit_optional_tags=False,
+                               use_trailing_solidus=True,
+                               space_before_trailing_solidus=False,
+                               quote_attr_values="always",
+                               quote_char="'",
+                               alphabetical_attributes=True)
+        errorMsg = "\n".join(["\n\nInput:", input,
+                              "\nExpected:", expected,
+                              "\nReceived:", serialized])
+        assert expected == serialized, errorMsg
+
+    def repr_failure(self, excinfo):
+        """Return a short failure representation for this item
+
+        The traceback on ``excinfo`` is replaced by one cut at this file and
+        filtered (presumably to hide pytest-internal frames -- confirm
+        against the pytest traceback API).
+        """
+        traceback = excinfo.traceback
+        ntraceback = traceback.cut(path=__file__)
+        excinfo.traceback = ntraceback.filter()
+
+        return excinfo.getrepr(funcargs=True,
+                               showlocals=False,
+                               style="short", tbfilter=False)
@@ -1,5 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals

+# pylint:disable=wrong-import-position
+
 import os
 import sys
 import codecs
@@ -13,44 +15,66 @@ sys.path.insert(0, os.path.abspath(os.path.join(base_path,
                                                os.path.pardir,
                                                os.path.pardir)))

-from html5lib import treebuilders
+from html5lib import treebuilders, treewalkers, treeadapters  # noqa
 del base_path

-# Build a dict of avaliable trees
-treeTypes = {"DOM": treebuilders.getTreeBuilder("dom")}
+# Build a dict of available trees
+treeTypes = {}

-# Try whatever etree implementations are avaliable from a list that are
-#"supposed" to work
-try:
-    import xml.etree.ElementTree as ElementTree
-    treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
-except ImportError:
-    try:
-        import elementtree.ElementTree as ElementTree
-        treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
-    except ImportError:
-        pass
+# DOM impls
+treeTypes["DOM"] = {
+    "builder": treebuilders.getTreeBuilder("dom"),
+    "walker": treewalkers.getTreeWalker("dom")
+}
+
+# ElementTree impls
+import xml.etree.ElementTree as ElementTree  # noqa
+treeTypes['ElementTree'] = {
+    "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
+    "walker": treewalkers.getTreeWalker("etree", ElementTree)
+}

 try:
-    import xml.etree.cElementTree as cElementTree
-    treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
+    import xml.etree.cElementTree as cElementTree  # noqa
 except ImportError:
-    try:
-        import cElementTree
-        treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
-    except ImportError:
-        pass
-
-try:
-    import lxml.etree as lxml  # flake8: noqa
-except ImportError:
-    pass
+    treeTypes['cElementTree'] = None
 else:
-    treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml")
+    # On Python 3.3 and above cElementTree is an alias for ElementTree; don't run the same tests twice.
+    if cElementTree.Element is ElementTree.Element:
+        treeTypes['cElementTree'] = None
+    else:
+        treeTypes['cElementTree'] = {
+            "builder": treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True),
+            "walker": treewalkers.getTreeWalker("etree", cElementTree)
+        }
+
+try:
+    import lxml.etree as lxml  # noqa
+except ImportError:
+    treeTypes['lxml'] = None
+else:
+    treeTypes['lxml'] = {
+        "builder": treebuilders.getTreeBuilder("lxml"),
+        "walker": treewalkers.getTreeWalker("lxml")
+    }
+
+# Genshi impls
+try:
+    import genshi  # noqa
+except ImportError:
+    treeTypes["genshi"] = None
+else:
+    treeTypes["genshi"] = {
+        "builder": treebuilders.getTreeBuilder("dom"),
+        "adapter": lambda tree: treeadapters.genshi.to_genshi(treewalkers.getTreeWalker("dom")(tree)),
+        "walker": treewalkers.getTreeWalker("genshi")
+    }
+
+# pylint:enable=wrong-import-position


-def get_data_files(subdirectory, files='*.dat'):
-    return glob.glob(os.path.join(test_dir, subdirectory, files))
+def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
+    """Return the sorted paths matching ``files`` in a data subdirectory
+
+    :arg subdirectory: directory name joined onto ``search_dir``
+
+    :arg files: glob pattern for the file names
+
+    :arg search_dir: base directory; the default is the value ``test_dir``
+        had when this function was defined
+
+    :returns: sorted list of matching paths
+    """
+    return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))


 class DefaultDict(dict):
@@ -71,9 +95,6 @@ class TestData(object):
        self.encoding = encoding
        self.newTestHeading = newTestHeading

-    def __del__(self):
-        self.f.close()
-
    def __iter__(self):
        data = DefaultDict(None)
        key = None
@@ -128,7 +149,7 @@ convertExpected = convert(2)
 def errorMessage(input, expected, actual):
    msg = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n" %
           (repr(input), repr(expected), repr(actual)))
-    if sys.version_info.major == 2:
+    if sys.version_info[0] == 2:
        msg = msg.encode("ascii", "backslashreplace")
    return msg

--- a/Show More
+++ b/Show More