Compare commits

..

188 Commits

Author SHA1 Message Date
pannal bed93bf928 RC5.2 info 2015-10-14 22:13:59 +02:00
pannal 7697ceffef RC 5.2 readme 2015-10-14 22:13:32 +02:00
panni 81dd24a9bd Merge branch 'detached' 2015-10-14 22:05:23 +02:00
panni 729d7d97c4 revert back from plex/localmedia/master to plex/localmedia/dist 2015-10-14 22:04:15 +02:00
pannal c7a4b3c0a4 README.md not so outdated anymore 2015-10-14 19:17:44 +02:00
pannal 3da044ada9 forgot Info.plist update 2015-10-14 19:01:32 +02:00
pannal 44bbc93dae Update README.md 2015-10-14 17:41:13 +02:00
pannal 54341a0afc RC5.1 2015-10-14 17:41:05 +02:00
pannal 599eab3e5b Merge pull request #40 from pannal/RC5
RC5.1
2015-10-14 17:33:44 +02:00
panni 9f9c875234 Merge remote-tracking branch 'origin' into RC5 2015-10-14 17:32:25 +02:00
panni 74c0ed80c5 make hearing impaired more configurable and clear 2015-10-14 17:32:06 +02:00
pannal 5ecb7aea5e update download links 2015-10-14 16:42:10 +02:00
pannal 829eacc4d6 RC5 2015-10-14 16:41:46 +02:00
pannal f7b3f924b4 Merge pull request #39 from pannal/RC5
RC5
2015-10-14 16:32:45 +02:00
panni e247bc0e59 add optional boost for addic7ed subtitles; partly fixes #8 2015-10-14 16:31:56 +02:00
panni 4158416183 hard bail-out if hearing_impaired didn't match 2015-10-14 16:30:33 +02:00
panni cf1181f2af add custom language field; fixes #27 2015-10-14 15:39:42 +02:00
panni a2d1335403 pass known video type info to guessit; fixes #38 2015-10-14 14:53:20 +02:00
panni 520cbb5189 patch subtitle repr to include download/page link; fixes #34 2015-10-14 14:37:44 +02:00
panni e8eeadb094 add colon and single quote to punctuation fix mixin; resolves #36 2015-10-14 13:57:27 +02:00
panni 92a2336dba Merge remote-tracking branch 'origin' into RC5 2015-10-14 13:56:06 +02:00
panni cbc75c8b85 update to newest LocalMediaExtended 2015-10-14 13:40:06 +02:00
panni 563973163e only pass the file name and three parent directories to guessit; should fix #38 2015-10-14 13:24:10 +02:00
panni e147a7a0ca use persistent Daemon mode; use correct bundle versioning; short: 1.0.9, build: 1.0.9.5 2015-10-14 13:16:18 +02:00
panni b494dc7bec cosmetic guessit update; add LICENSE and README 2015-10-14 12:49:10 +02:00
pannal 9ce4b02610 most likely fix punctuation issues with quotes in series names 2015-10-13 10:15:37 +02:00
pannal d0ff69d224 Update README.md 2015-10-11 04:17:56 +02:00
pannal cde09e0f56 add plex forum thread link 2015-10-11 04:17:39 +02:00
pannal 84409395d1 Update README.md 2015-10-11 03:36:40 +02:00
pannal e4e6bcfad2 Update README.md 2015-10-11 03:25:39 +02:00
panni 2103215e41 add dynamic animated logo from github 2015-10-11 03:24:17 +02:00
panni d086569f09 add correct plugin info; test animated subzero :) 2015-10-11 03:13:59 +02:00
panni 28064767ea update Info.plist 2015-10-11 02:42:53 +02:00
panni e996e4d4b6 replace default icon 2015-10-11 02:16:38 +02:00
pannal 422100f9fc Update README.md 2015-10-11 02:12:31 +02:00
pannal c9a7ffd778 Update README.md 2015-10-11 02:11:41 +02:00
pannal db009abf79 Merge pull request #30 from pannal/RC4
decouple from Subliminal.bundle
2015-10-11 02:07:24 +02:00
pannal c1cc7c98ef Update README.md 2015-10-11 02:06:31 +02:00
pannal a08b00d5c4 Update README.md 2015-10-11 02:06:17 +02:00
panni 16a22ab7b2 move more 2015-10-11 02:02:27 +02:00
panni da32ee2504 move moving 2015-10-11 02:01:36 +02:00
panni 54eaa9e695 move stuff 2015-10-11 02:00:11 +02:00
peter penis 28c1481a48 move to Sub-Zero; RC4; add LocalMediaExtended.bundle into SS 2015-10-11 01:57:48 +02:00
pannal cac340ad43 Update Info.plist 2015-10-11 01:53:05 +02:00
pannal d6994d9a60 Update README.md 2015-10-11 01:52:35 +02:00
pannal 90372ad30d Update DefaultPrefs.json 2015-10-10 14:43:12 +02:00
pannal 24fc22dbe6 Update DefaultPrefs.json 2015-10-10 14:42:39 +02:00
pannal 7b7adac774 Update README.md 2015-10-10 00:51:08 +02:00
pannal 7f0ff6ae2f Update README.md 2015-10-10 00:50:27 +02:00
pannal 1b3e58b326 Update README.md 2015-10-10 00:45:55 +02:00
pannal dc47fc60b8 Update README.md 2015-10-09 19:22:16 +02:00
pannal 6c588964a7 Update README.md 2015-10-09 02:42:20 +02:00
pannal f65b24094a Merge pull request #25 from pannal/rc3
pull RC3 into master
2015-10-09 02:36:57 +02:00
panni 6b807be0e6 opensubtitles: add optional credentials for VIPs; fixes #17 2015-10-09 02:35:33 +02:00
panni a794eb8310 providers: move punctuation fix into seperate mixins.py and use it 2015-10-09 02:08:43 +02:00
panni 8290c8a371 tvsubtitles: fix series with punctuation 2015-10-09 02:04:30 +02:00
panni 475152a7eb podnapisi: fix logging 2015-10-09 01:40:24 +02:00
panni 4e75e20ede add download retry option; fixes #24; move questionable only_one setting to the bottom 2015-10-09 01:28:56 +02:00
panni d36823c7ca better score logging; move patched providers to separate folder; better addic7ed punctuation handling in get_show_ids 2015-10-09 00:48:11 +02:00
panni 2a6b387112 addic7ed: fix series detection with punctuation; add missing self 2015-10-08 10:38:29 +02:00
panni a83822bff9 more verbose logging on subtitle download fail 2015-10-08 10:37:51 +02:00
panni 8e7538f6e6 fix broken import 2015-10-07 19:05:48 +02:00
panni 9cdb26f7cc forgot second clean_punctuation 2015-10-07 19:03:45 +02:00
panni 9659c913c4 Merge branch 'master' of github.com:pannal/Subliminal.bundle 2015-10-07 19:02:46 +02:00
panni c9506cb95e fix getting addic7ed show IDs for series with punctuation in their names 2015-10-07 19:02:33 +02:00
pannal 43e6ce3997 Update README.md 2015-10-07 05:13:36 +02:00
pannal dfd12edcb3 Update DefaultPrefs.json 2015-10-07 05:11:10 +02:00
pannal 154a8072f6 Update README.md 2015-10-07 04:07:59 +02:00
pannal 904abaf26b Update README.md 2015-10-07 02:58:32 +02:00
panni bea18a27ba set default TV score to 15; movie score to 30 2015-10-07 02:55:56 +02:00
pannal 2d998eab50 Update README.md 2015-10-07 02:47:40 +02:00
pannal a25a67572b Update README.md 2015-10-07 02:45:23 +02:00
pannal 1bdf6f9969 Merge pull request #22 from pannal/rc1-fix
RC1 fixes
2015-10-07 02:44:10 +02:00
panni 0b32892fa8 better existing subtitles debug logging 2015-10-07 02:42:14 +02:00
panni fea5b8a716 switch to tonswieb/enzyme 2015-10-07 02:06:47 +02:00
panni 90b3707409 update enzyme 2015-10-07 01:07:01 +02:00
panni 1c0224fbe7 skip empty folder creation if not subtitles found; should fix #20 2015-10-07 00:59:07 +02:00
pannal 626fcd1140 Update README.md 2015-09-24 02:57:23 +02:00
pannal b01c84b14c Update README.md 2015-09-24 02:55:53 +02:00
pannal 412492b4d1 Update README.md 2015-09-24 02:55:37 +02:00
panni 9a6f7a4316 forgot import, again 2015-09-24 02:44:30 +02:00
panni 660f887923 correct number casting; fixes #16 2015-09-24 02:34:34 +02:00
panni fe9c67ed91 forgot import 2015-09-24 02:13:20 +02:00
panni d3bbd05e4f subliminal: fix wrong usage of logger; fixes #15 2015-09-24 01:58:18 +02:00
panni 34585129aa Merge branch 'master' of github.com:pannal/Subliminal.bundle 2015-09-24 01:27:26 +02:00
panni 955cd4c173 allow only one subtitle optionally; fixes #3 2015-09-24 01:27:15 +02:00
pannal 4da63a8fd7 Update README.md 2015-09-23 14:40:42 +02:00
panni fa27789608 fixed typo 2015-09-23 14:31:55 +02:00
panni f9e9f35157 Merge branch 'deep_scan_subs'
Conflicts:
	Contents/Code/__init__.py
2015-09-23 14:29:21 +02:00
panni 4a6604f0ab custom folder now takes precedence; also scan subfolders for existing subtitles if configured; update custom folder settings description; remove direct subliminal.video patch and move it to subliminal_patch.patch_video 2015-09-23 14:26:21 +02:00
panni 971d1221da don't die on missing header; maybe fixes #13 2015-09-23 13:36:18 +02:00
panni ba69885477 fix saving subs to video folder without custom_path given; should fix #14 2015-09-23 12:46:07 +02:00
panni 8e23098037 add basic functionality to scan custom (sub-) folders for subtitles 2015-09-19 04:35:48 +02:00
pannal 8da7bf029c Update README.md 2015-09-18 03:48:34 +02:00
pannal e16e58cbfa Update README.md 2015-09-18 03:29:34 +02:00
pannal abb7cd3bfa Update README.md 2015-09-18 03:19:04 +02:00
pannal bfa06f3989 Update README.md 2015-09-18 03:16:37 +02:00
pannal c63529939d Merge pull request #11 from pannal/guessit-0.11.0
update guessit to 0.11.0
2015-09-18 03:16:20 +02:00
panni 2814f57e89 update guessit to 0.11.0 2015-09-18 03:14:21 +02:00
panni 70476883c6 Merge branch 'master' of github.com:pannal/Subliminal.bundle 2015-09-18 03:11:20 +02:00
panni b5ed209453 Revert "update guessit to 0.11.0"
This reverts commit be7687f15d.
2015-09-18 03:10:58 +02:00
panni be7687f15d update guessit to 0.11.0 2015-09-18 03:08:55 +02:00
pannal b7fb8e1e76 Update README.md 2015-09-18 02:56:40 +02:00
pannal 1a03720a7d Update README.md 2015-09-18 02:49:34 +02:00
pannal cb4099109a Update README.md 2015-09-18 02:49:19 +02:00
pannal 131504e7ee Merge pull request #10 from pannal/provider_fixes
Provider fixes/addons
2015-09-18 02:42:31 +02:00
pannal b0c7b480d6 Update README.md 2015-09-18 02:40:03 +02:00
panni e543c927cf add third optional language; update option description 2015-09-18 02:32:16 +02:00
panni 897b602d71 correct typo 2015-09-18 02:27:13 +02:00
panni d94421dcf3 add support for 'fa', Persian (Farsi) 2015-09-18 02:17:30 +02:00
panni e371b99dca add support for pt-br, Portuguese Brasil 2015-09-18 02:16:03 +02:00
panni 49d10e5ff7 remove leftover addic7ed score boost; add use_random_agents option to addic7ed 2015-09-18 02:08:01 +02:00
pannal d959f5b826 Update README.md 2015-09-18 01:07:47 +02:00
pannal 709f5cb605 Merge pull request #7 from pannal/provider_fixes
Provider fixes for newest subliminal
2015-09-18 01:06:48 +02:00
panni b11a051c23 patch language converted for addic7ed to support French (Canadian) 2015-09-18 00:57:54 +02:00
panni 1a77902079 move injection of language converters to subliminal_patch; don't discard provider simply because of LanguageReverseError 2015-09-18 00:43:33 +02:00
pannal 481dc2f3b4 Update README.md 2015-09-13 04:40:55 +02:00
panni 732aa91889 re-add language converters for addic7ed and tvsubtitles 2015-09-12 16:20:34 +02:00
panni 0df4c55548 update babelfish to 0.5.5-dev; remove leftover patch.py 2015-09-12 16:20:10 +02:00
panni 7c72ed41fb moved contents of patch.py into separate files; patch addic7ed provider 2015-09-12 16:04:39 +02:00
panni 83ace14faf patch addic7ed provider to use random user agents (again); honor selected providers again; more info on why a provider was discarded 2015-09-12 15:57:19 +02:00
panni 9b1c3538b3 Merge branch 'master' of github.com:pannal/Subliminal.bundle 2015-09-11 22:11:21 +02:00
panni 27a6e51cd3 bugfix; forgot six.py in the last release 2015-09-11 22:10:39 +02:00
pannal 86fad21cf0 Update README.md 2015-09-11 18:58:28 +02:00
pannal 5d081c3d65 Update README.md 2015-09-11 18:48:17 +02:00
pannal ca74c0af0a Merge pull request #1 from pannal/test
Merge test branch into master
2015-09-11 18:46:34 +02:00
panni 002ec90b09 patch subliminal to work inside Plex's sandbox; this now works with the newest subliminal version 2015-09-11 16:09:04 +02:00
panni 6f42199100 subliminal: don't ignore 'badly encoded' filenames 2015-09-10 23:58:51 +02:00
panni 87bb2493d1 adjust addicted score 2015-09-10 23:57:12 +02:00
panni 716a66e9fa update subliminal to current master 2015-09-10 19:40:41 +02:00
panni 88cc95239a update guessit 2015-06-26 15:03:06 +02:00
panni 924470d2c0 contribute to thetvdbdvdorder 2015-06-26 14:42:34 +02:00
panni 45d5200b89 adjust addicted parameters 2015-06-26 14:42:07 +02:00
panni 8f82554927 fix guessit's detection of release_group 2015-05-26 02:41:13 +02:00
panni 423688c352 boost addic7ed score 2015-05-25 21:17:27 +02:00
panni 8207223002 Merge remote-tracking branch 'source/master' 2015-05-25 20:05:32 +02:00
Bram Walet 3b3fdb34e3 Merge branch 'master' of https://github.com/bramwalet/Subliminal.bundle 2015-02-15 17:19:11 +01:00
Bram Walet ecce1fca9c Fix #12 2015-02-15 17:16:31 +01:00
Bram Walet 2011100251 updated guessit 0.8 -> 0.10.1 2015-02-15 17:04:59 +01:00
Bram Walet 14e42e57ea Update subliminal with latest 0.80-dev 2015-02-15 16:59:09 +01:00
bramwalet 834f18f3d5 Merge pull request #7 from pannal/fix-origin-1
fix wrong usage of subFolder setting
2014-08-01 09:13:23 +02:00
panni cf9a916e95 fix wrong usage of subFolder setting 2014-08-01 03:01:30 +02:00
panni a8a26ec642 fix wrong setting usage in saveSubtitlesToFile 2014-08-01 02:49:05 +02:00
panni e6c398589c Merge remote-tracking branch 'source/master' 2014-08-01 01:42:29 +02:00
bramwalet c5332644f1 Added installation and configuration instructions 2014-07-20 09:27:26 +02:00
bramwalet bd0c134ae0 Update README.md 2014-07-19 16:11:12 +02:00
bramwalet d3282648fd Update README.md 2014-07-19 16:10:51 +02:00
bramwalet 1156817c71 Update README.md 2014-07-19 16:08:05 +02:00
panni e1af48bbc2 Merge remote-tracking branch 'source/master' 2014-07-19 16:04:41 +02:00
bramwalet d6dd8379ab Create License 2014-07-19 15:54:32 +02:00
bramwalet bc73e559d1 Updated README 2014-07-19 15:53:45 +02:00
Bram Walet a6d8c9d5fc Added license files of libraries 2014-07-19 15:43:48 +02:00
Bram Walet a5d8a8b1d8 Added prefs for hearing impaired and minimum score, fixes #3 2014-07-19 15:25:24 +02:00
Bram Walet ae28116c59 Properties to influence what existing subtitles are scanned, bugfix for TV series 2014-07-19 15:06:21 +02:00
Bram Walet b03403cf72 Added preference to save subtitle to filesystem, default false. Fixes #5 2014-07-19 14:50:56 +02:00
Bram Walet f5736fcd3b Changed preferences names, prepared scan output for saving as metadata 2014-07-19 14:23:49 +02:00
Bram Walet 4193c245a5 Renamed and reordered Preferences 2014-07-19 14:00:33 +02:00
Bram Walet c649d5b5fd Refactored movie and tv scanning (remove duplicates) 2014-07-19 13:56:12 +02:00
Bram Walet 1fa70995a3 Merge branch 'pannal-master' 2014-07-19 13:15:52 +02:00
Bram Walet 3afee79415 Merge branch 'master' of https://github.com/pannal/Subliminal.bundle into pannal-master 2014-07-19 13:15:04 +02:00
panni 5e43c1936e remove obsolete folder 2014-07-19 13:11:50 +02:00
Bram Walet fcae524771 TV series subtitles are stored as metadata. 2014-07-19 12:11:27 +02:00
panni 098da50e23 need to bool() this...why? 2014-07-19 05:36:54 +02:00
panni 9b3544bff7 broken, fix None 2014-07-19 05:34:08 +02:00
panni 063cae161b move subfolder logic to function; add support for movies; add scan all default subfolders option 2014-07-19 05:11:13 +02:00
panni 825e073e08 wrong variable used 2014-07-19 04:45:15 +02:00
panni 77098e1dc3 basic custom subfolder support; needs more localmedia support 2014-07-19 04:27:15 +02:00
panni e2210b7624 add prefs for subfolder; use prefs 2014-07-19 04:15:07 +02:00
panni ffa9051d69 Merge remote-tracking branch 'source/master' 2014-07-19 03:43:06 +02:00
panni bad0dbfc71 reapply gitignore 2014-07-19 03:42:40 +02:00
panni 53b938b83d basic support for subtitles in subfolders; temp hardcoded 2014-07-19 03:36:14 +02:00
Bram Walet e6c0e5fe7a Removed deprecated constants 2014-07-18 17:05:26 +02:00
Bram Walet d8513e910d Merged origin/master to local branch 2014-07-18 17:00:41 +02:00
Bram Walet c2d984a908 Provider settings from preferences as dict 2014-07-18 16:57:52 +02:00
bramwalet 15b7f134be Fixes issue #2
Configure dogpile cache to be in memory. Tvsubtitles & Addic7ed providers will work.
2014-07-18 10:23:13 +02:00
bramwalet 3463718195 Merge pull request #1 from pannal/master
Fixed movie subtitles uninitialized variable error
2014-07-18 08:20:04 +02:00
panni 9cf1b759d7 Fixed movie subtitles uninitialized variable error 2014-07-18 00:34:51 +02:00
Bram Walet e8b9d6dd1f Removed .pyc files 2014-07-17 20:20:20 +02:00
Bram Walet 66280ded50 Fixed movie update 2014-07-17 20:18:59 +02:00
Bram Walet 55860e5f18 Implemented find best subtitles, enabling/disabling providers, provider settings
Enabled dogpile.cache to be in memory to make tvsubtitles and addic7ed work.
2014-07-17 20:08:03 +02:00
Bram Walet d7bc17a485 Implemented find best subtitles, enabling/disabling providers, provider settings 2014-07-16 22:26:18 +02:00
Bram Walet be3a291cbc Removed obsolete setuptools library, only pkg_resources was needed. 2014-07-14 21:46:25 +02:00
Bram Walet 4696bfe364 Scanning TV media file by subliminal and searching opensubtitles for subtitles 2014-07-14 21:38:37 +02:00
Bram Walet a81533d2cf Added stevedore to dependencies (guessit needs it) 2014-07-14 21:36:03 +02:00
unknown fc9a8dcf48 Added .settings to .gitignore 2014-07-13 12:10:56 +02:00
unknown 5e6d53fe63 Merge branch 'master' of https://github.com/bramwalet/Subliminal.bundle 2014-07-13 12:09:46 +02:00
unknown 0c68e8cf47 Initial commit, plugin boots (dependencies work) in Plex, but no subtitles are searched. 2014-07-13 12:08:02 +02:00
bramwalet d41a0cdda4 Initial commit 2014-07-13 12:02:05 +02:00
781 changed files with 233349 additions and 104349 deletions
-8
View File
@@ -1,8 +0,0 @@
[report]
exclude_lines =
pragma: no cover
raise NotImplementedError
def __repr__
if __name__ == .__main__.:
omit =
subliminal/cli.py
+11 -20
View File
@@ -1,7 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
@@ -9,12 +8,11 @@ __pycache__/
# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
@@ -24,12 +22,6 @@ var/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
@@ -38,27 +30,26 @@ pip-delete-this-directory.txt
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.settings
# Rope
.ropeproject
# Django stuff:
*.log
*.pot
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Pycharm
.idea
# Subliminal
tests/data/mkv/
-49
View File
@@ -1,49 +0,0 @@
sudo: false
language: python
python:
- "2.7"
- "3.3"
- "3.4"
- "3.5"
env:
- PARSER=native
- PARSER=lxml
addons:
apt:
packages:
- unrar
matrix:
include:
- python: "3.5"
env:
- PARSER=native
- VCR_RECORD_MODE=all
- PYTEST_ADDOPTS="-m integration"
allow_failures:
- python: "3.5"
env:
- PARSER=native
- VCR_RECORD_MODE=all
- PYTEST_ADDOPTS="-m integration"
cache:
directories:
- $HOME/.cache/pip
- tests/data/mkv
before_cache:
- rm -f $HOME/.cache/pip/log/debug.log
install:
- pip install -e .[test]
- if [ $PARSER = "lxml" ]; then pip install lxml; fi
- pip install coveralls
script: python setup.py test --addopts "--cov subliminal --verbose $PYTEST_ADDOPTS"
after_success: coveralls
-19
View File
@@ -1,19 +0,0 @@
Contributing
============
Issues
------
Issues are intended for bug reports and feature requests. For any bug report, please make sure to include the complete
stack trace and DEBUG-level logs, as well as the steps to reproduce the problem.
If you use the CLI, you can create a debug log file with `subliminal --debug [...] 2> debug.log`.
Pull Requests
-------------
You can contribute code and documentation with pull requests. Any code contribution must be unit tested, and the pull
request must be opened against the *develop* branch.
Translations
------------
Contribution to translations can be made on [subliminal's transifex page](https://www.transifex.com/subliminal/subliminal/)
Subliminal is configured to work with [transifex-client](http://docs.transifex.com/client/)
-272
View File
@@ -1,272 +0,0 @@
Changelog
---------
2.0.3
^^^^^
**release date:** 2016-06-10
* Fix clearing cache in CLI
2.0.2
^^^^^
**release date:** 2016-06-06
* Fix for dogpile.cache>=0.6.0
* Fix missing sphinx_rtd_theme dependency
2.0.1
^^^^^
**release date:** 2016-06-06
* Fix beautifulsoup4 minimal requirement
2.0.0
^^^^^
**release date:** 2016-06-04
* Add refiners to enrich videos with information from metadata, tvdb and omdb
* Add asynchronous provider search for faster searches
* Add registrable managers so subliminal can run without install
* Add archive support
* Add the ability to customize scoring logic
* Add an age argument to scan_videos for faster scanning
* Add legendas.tv provider
* Add shooter.cn provider
* Improve matching and scoring
* Improve documentation
* Split nautilus integration into its own project
1.1.1
^^^^^
**release date:** 2016-01-03
* Fix scanning videos on bad MKV files
1.1
^^^
**release date:** 2015-12-29
* Fix library usage example in README
* Fix for series name with special characters in addic7ed provider
* Fix id property in thesubdb provider
* Improve matching on titles
* Add support for nautilus context menu with translations
* Add support for searching subtitles in a separate directory
* Add subscenter provider
* Add support for python 3.5
1.0.1
^^^^^
**release date:** 2015-07-23
* Fix unicode issues in CLI (python 2 only)
* Fix score scaling in CLI (python 2 only)
* Improve error handling in CLI
* Color collect report in CLI
1.0
^^^
**release date:** 2015-07-22
* Many changes and fixes
* New test suite
* New documentation
* New CLI
* Added support for SubsCenter
0.7.5
^^^^^
**release date:** 2015-03-04
* Update requirements
* Remove BierDopje provider
* Add pre-guessed video optional argument in scan_video
* Improve hearing impaired support
* Fix TVSubtitles and Podnapisi providers
0.7.4
^^^^^
**release date:** 2014-01-27
* Fix requirements for guessit and babelfish
0.7.3
^^^^^
**release date:** 2013-11-22
* Fix windows compatibility
* Improve subtitle validation
* Improve embedded subtitle languages detection
* Improve unittests
0.7.2
^^^^^
**release date:** 2013-11-10
* Fix TVSubtitles for ambiguous series
* Add a CACHE_VERSION to force cache reloading on version change
* Set CLI default cache expiration time to 30 days
* Add podnapisi provider
* Support script for languages e.g. Latn, Cyrl
* Improve logging levels
* Fix subtitle validation in some rare cases
0.7.1
^^^^^
**release date:** 2013-11-06
* Improve CLI
* Add login support for Addic7ed
* Remove lxml dependency
* Many fixes
0.7.0
^^^^^
**release date:** 2013-10-29
**WARNING:** Complete rewrite of subliminal with backward incompatible changes
* Use enzyme to parse metadata of videos
* Use babelfish to handle languages
* Use dogpile.cache for caching
* Use charade to detect subtitle encoding
* Use pysrt for subtitle validation
* Use entry points for subtitle providers
* New subtitle score computation
* Hearing impaired subtitles support
* Drop async support
* Drop a few providers
* And much more...
0.6.4
^^^^^
**release date:** 2013-05-19
* Fix requirements due to enzyme 0.3
0.6.3
^^^^^
**release date:** 2013-01-17
* Fix requirements due to requests 1.0
0.6.2
^^^^^
**release date:** 2012-09-15
* Fix BierDopje
* Fix Addic7ed
* Fix SubsWiki
* Fix missing enzyme import
* Add Catalan and Galician languages to Addic7ed
* Add possible services in help message of the CLI
* Allow existing filenames to be passed without the ./ prefix
0.6.1
^^^^^
**release date:** 2012-06-24
* Fix subtitle release name in BierDopje
* Fix subtitles being downloaded multiple times
* Add Chinese support to TvSubtitles
* Fix encoding issues
* Fix single download subtitles without the force option
* Add Spanish (Latin America) exception to Addic7ed
* Fix group_by_video when a list entry has None as subtitles
* Add support for Galician language in Subtitulos
* Add an integrity check after subtitles download for Addic7ed
* Add error handling for if not strict in Language
* Fix TheSubDB hash method to return None if the file is too small
* Fix guessit.Language in Video.scan
* Fix language detection of subtitles
0.6.0
^^^^^
**release date:** 2012-06-16
**WARNING:** Backward incompatible changes
* Fix --workers option in CLI
* Use a dedicated module for languages
* Use beautifulsoup4
* Improve return types
* Add scan_filter option
* Add --age option in CLI
* Add TvSubtitles service
* Add Addic7ed service
0.5.1
^^^^^
**release date:** 2012-03-25
* Improve error handling of enzyme parsing
0.5
^^^
**release date:** 2012-03-25
**WARNING:** Backward incompatible changes
* Use more unicode
* New list_subtitles and download_subtitles methods
* New Pool object for asynchronous work
* Improve sort algorithm
* Better error handling
* Make sorting customizable
* Remove class Subliminal
* Remove permissions handling
0.4
^^^
**release date:** 2011-11-11
* Many fixes
* Better error handling
0.3
^^^
**release date:** 2011-08-18
* Fix a bug when series is not guessed by guessit
* Fix dependencies failure when installing package
* Fix encoding issues with logging
* Add a script to ease subtitles download
* Add possibility to choose mode of created files
* Add more checks before adjusting permissions
0.2
^^^
**release date:** 2011-07-11
* Fix plugin configuration
* Fix some encoding issues
* Remove extra logging
0.1
^^^
**release date:** *private release*
* Initial release
+13 -12
View File
@@ -1,20 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 Antoine Bertin
Copyright (c) 2014 Bram Walet
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,444 @@
#local media assets agent
import os, string, hashlib, base64, re, plistlib, unicodedata
import config
import helpers
import localmedia
import audiohelpers
import videohelpers
from mutagen import File
from mutagen.mp4 import MP4
from mutagen.id3 import ID3
from mutagen.flac import FLAC
from mutagen.flac import Picture
from mutagen.oggvorbis import OggVorbis
PERSONAL_MEDIA_IDENTIFIER = "com.plexapp.agents.none"
#####################################################################################################################
@expose
def ReadTags(f):
    """Return the tags of media file *f* as a plain dict.

    The file is opened with mutagen's ``File(f, easy=True)`` and the result
    is converted with ``dict()``.

    This is a best-effort helper: any exception raised while opening or
    converting the file is logged and an empty dict is returned instead of
    propagating the error to the caller.
    """
    try:
        return dict(File(f, easy=True))
    except Exception as e:
        # Include the failure reason, like the other handlers in this file do,
        # so the log entry is actionable.
        Log('Error reading tags from file: %s (%s)' % (f, str(e)))
        return {}
#####################################################################################################################
# Movie agent (subclass of Agent.Movies). Its update() asks the localmedia
# module for assets and subtitles and hands the metadata to a video helper.
# NOTE(review): leading indentation is missing in this copy of the file, so the
# nesting mentioned in the comments below is inferred -- confirm against the
# real source before relying on it.
class localMediaMovie(Agent.Movies):
name = 'Local Media Assets Extended (Movies)'
languages = [Locale.Language.NoLanguage]
primary_provider = False
persist_stored_files = False
contributes_to = ['com.plexapp.agents.imdb', 'com.plexapp.agents.none']
# Appends a single fixed result (id 'null', score 100); nothing is looked up here.
def search(self, results, media, lang):
results.Append(MetadataSearchResult(id = 'null', score = 100))
# Updates `metadata` for one movie using the directory of its first media part.
def update(self, metadata, media, lang):
# Clear out the title to ensure stale data doesn't clobber other agents' contributions.
metadata.title = None
# Only the first part of the first item determines the directory that is searched.
part = media.items[0].parts[0]
path = os.path.dirname(part.file)
# Look for local media.
# A failure in findAssets is logged and does not abort the rest of update().
try: localmedia.findAssets(metadata, media.title, [path], 'movie', media.items[0].parts)
except Exception, e:
Log('Error finding media for movie %s: %s' % (media.title, str(e)))
# Look for subtitles
# Note that the loop variable `part` rebinds the `part` assigned above.
for item in media.items:
for part in item.parts:
localmedia.findSubtitles(part)
# If there is an appropriate VideoHelper, use it.
# NOTE(review): cannot tell from this view whether this runs once per part
# (inside the loops) or once after them with the last `part` -- confirm.
video_helper = videohelpers.VideoHelpers(part.file)
if video_helper:
video_helper.process_metadata(metadata)
#####################################################################################################################
def FindUniqueSubdirs(dirs):
    """Return the given directories plus their parents and grandparents.

    dirs -- iterable of directory paths (a dict keyed by path also works,
            since only its keys are iterated).

    Returns a dict mapping every collected path to True, which removes
    duplicates. The empty string, which ``os.path.split`` yields once it
    runs out of path components, is never included.
    """
    final_dirs = {}
    for directory in dirs:
        final_dirs[directory] = True
        try:
            parent = os.path.split(directory)[0]
            grandparent = os.path.split(parent)[0]
        except (TypeError, AttributeError):
            # Not a path string: keep the entry itself but skip its ancestors,
            # preserving the original best-effort behaviour.
            continue
        final_dirs[parent] = True
        final_dirs[grandparent] = True

    # Relative single-component paths produce '' as parent; drop it.
    final_dirs.pop('', None)
    return final_dirs
# TV agent (subclass of Agent.TV_Shows). Its update() asks the localmedia
# module for episode, show and season assets and for episode subtitles.
# NOTE(review): leading indentation is missing in this copy of the file, so the
# nesting mentioned in the comments below is inferred -- confirm against the
# real source before relying on it.
class localMediaTV(Agent.TV_Shows):
name = 'Local Media Assets Extended (TV)'
languages = [Locale.Language.NoLanguage]
primary_provider = False
persist_stored_files = False
contributes_to = ['com.plexapp.agents.thetvdb', 'com.plexapp.agents.thetvdbdvdorder', 'com.plexapp.agents.none']
# Appends a single fixed result (id 'null', score 100); nothing is looked up here.
def search(self, results, media, lang):
results.Append(MetadataSearchResult(id = 'null', score = 100))
# Updates `metadata` for a show: episode assets first, then show and season
# assets, then subtitles for each episode part.
def update(self, metadata, media, lang):
# Clear out the title to ensure stale data doesn't clobber other agents' contributions.
metadata.title = None
# Look for media, collect directories.
# `dirs` is used as a set: keys are episode directories, values are always True.
dirs = {}
for s in media.seasons:
Log('Creating season %s', s)
metadata.seasons[s].index = int(s)
for e in media.seasons[s].episodes:
# Make sure metadata exists, and find sidecar media.
episodeMetadata = metadata.seasons[s].episodes[e]
# Only the first item of each episode is used for asset discovery.
episodeMedia = media.seasons[s].episodes[e].items[0]
dir = os.path.dirname(episodeMedia.parts[0].file)
dirs[dir] = True
try: localmedia.findAssets(episodeMetadata, media.title, [dir], 'episode', episodeMedia.parts)
# NOTE(review): the exception is bound to `e`, the same name as the episode
# loop variable above; it appears harmless because `e` is rebound on the next
# iteration, but a distinct name would be safer -- confirm.
except Exception, e:
Log('Error finding media for episode: %s' % str(e))
# Figure out the directories we should be looking in.
# Any failure here falls back to an empty list.
try: dirs = FindUniqueSubdirs(dirs)
except: dirs = []
# Look for show images.
# metadata.title was set to None at the top of this method, so these log
# calls receive None for the title unless something above reassigned it.
Log("Looking for show media for %s.", metadata.title)
try: localmedia.findAssets(metadata, media.title, dirs, 'show')
except: Log("Error finding show media.")
# Look for season images.
for s in metadata.seasons:
Log('Looking for season media for %s season %s.', metadata.title, s)
try: localmedia.findAssets(metadata.seasons[s], media.title, dirs, 'season')
except: Log("Error finding season media for season %s" % s)
# Look for subtitles for each episode.
for s in media.seasons:
# If we've got a date based season, ignore it for now, otherwise it'll collide with S/E folders/XML and PMS
# prefers date-based (why?)
# Seasons numbered 1900 or higher are skipped unless the guid starts with
# PERSONAL_MEDIA_IDENTIFIER.
if int(s) < 1900 or metadata.guid.startswith(PERSONAL_MEDIA_IDENTIFIER):
for e in media.seasons[s].episodes:
for i in media.seasons[s].episodes[e].items:
# Look for subtitles.
for part in i.parts:
localmedia.findSubtitles(part)
# If there is an appropriate VideoHelper, use it.
# NOTE(review): cannot tell from this view whether this runs once per part
# or once per item with the last `part` -- confirm.
video_helper = videohelpers.VideoHelpers(part.file)
if video_helper:
video_helper.process_metadata(metadata, episode = metadata.seasons[s].episodes[e])
# This branch is a deliberate no-op; the deletion below is commented out.
else:
# Whack it in case we wrote it.
#del metadata.seasons[s]
pass
#####################################################################################################################
# Shared base for the artist agents (localMediaArtistLegacy and
# localMediaArtistModern inherit from it): common attributes plus update().
# shouldFindExtras, getExtraTypeMap, findTrackExtra, findArtistExtras and
# getExtraSortOrder are defined outside this view.
# NOTE(review): leading indentation is missing in this copy of the file, so it
# is unclear how much of update() is guarded by `if shouldFindExtras():` --
# confirm against the real source.
class localMediaArtistCommon(object):
name = 'Local Media Assets Extended (Artists)'
languages = [Locale.Language.NoLanguage]
primary_provider = False
persist_stored_files = False
# Collects extras for an artist from its tracks and their directories, and
# sets the artist sort title from the first track's audio helper if available.
def update(self, metadata, media, lang):
# Clear out the title to ensure stale data doesn't clobber other agents' contributions.
metadata.title = None
if shouldFindExtras():
extra_type_map = getExtraTypeMap()
artist_file_dirs = []
# Filled in by findTrackExtra / findArtistExtras; its values are added to
# metadata.extras at the end.
artist_extras = {}
# First look for track extras.
# checked_tag makes sure the sort-title lookup below is attempted only once,
# for the first track visited.
checked_tag = False
for album in media.children:
for track in album.children:
# Despite its name, `part` is the unicodized file path of the track's first
# part, not a part object.
part = helpers.unicodize(track.items[0].parts[0].file)
findTrackExtra(part, extra_type_map, artist_extras)
artist_file_dirs.append(os.path.dirname(part))
# Look for artist sort field.
if checked_tag == False:
checked_tag = True
audio_helper = audiohelpers.AudioHelpers(part)
if audio_helper and hasattr(audio_helper, 'get_artist_sort_title'):
artist_sort_title = audio_helper.get_artist_sort_title()
if artist_sort_title and hasattr(metadata, 'title_sort'):
metadata.title_sort = artist_sort_title
# Now go through this artist's directories looking for additional extras.
# set() removes duplicate directories so each one is scanned once.
for artist_file_dir in set(artist_file_dirs):
findArtistExtras(helpers.unicodize(artist_file_dir), extra_type_map, artist_extras, media.title)
# Extras are added ordered by getExtraSortOrder()[type(extra)], then by title.
for extra in sorted(artist_extras.values(), key = lambda v: (getExtraSortOrder()[type(v)], v.title)):
metadata.extras.add(extra)
class localMediaArtistLegacy(localMediaArtistCommon, Agent.Artist):
    # Artist agent using the older search(results, media, lang) signature.
    contributes_to = [
        'com.plexapp.agents.discogs',
        'com.plexapp.agents.lastfm',
        'com.plexapp.agents.plexmusic',
        'com.plexapp.agents.none',
    ]

    def search(self, results, media, lang):
        """Append one placeholder match (id 'null', score 100) named after media.artist."""
        placeholder = MetadataSearchResult(id='null', name=media.artist, score=100)
        results.Append(placeholder)
class localMediaArtistModern(localMediaArtistCommon, Agent.Artist):
    # Artist agent using the version-2 search/update signatures.
    version = 2
    contributes_to = ['com.plexapp.agents.plexmusic']

    def search(self, results, tree, hints, lang='en', manual=False):
        """Add one placeholder artist result (id 'null', score 100) for hints.artist."""
        placeholder = SearchResult(id='null', type='artist', parentName=hints.artist, score=100)
        results.add(placeholder)

    def update(self, metadata, media, lang='en', child_guid=None):
        """Delegate to the shared update; child_guid is accepted but not used."""
        super(localMediaArtistModern, self).update(metadata, media, lang)
class localMediaAlbum(Agent.Album):
    # Album agent: contributes local posters, tag metadata and track extras.
    name = 'Local Media Assets Extended (Albums)'
    languages = [Locale.Language.NoLanguage]
    primary_provider = False
    persist_stored_files = False
    contributes_to = [
        'com.plexapp.agents.discogs',
        'com.plexapp.agents.lastfm',
        'com.plexapp.agents.plexmusic',
        'com.plexapp.agents.none',
    ]

    def search(self, results, media, lang):
        """Append one placeholder match (id 'null', score 100)."""
        placeholder = MetadataSearchResult(id='null', score=100)
        results.Append(placeholder)

    def update(self, metadata, media, lang):
        """Run updateAlbum, building the extra type map only when extras are enabled."""
        if shouldFindExtras():
            find_extras, extra_type_map = True, getExtraTypeMap()
        else:
            find_extras, extra_type_map = False, None
        updateAlbum(metadata, media, lang, find_extras, artist_extras=[], extra_type_map=extra_type_map)
def updateAlbum(metadata, media, lang, find_extras=False, artist_extras=None, extra_type_map=None):
    """Attach local posters, tag metadata and track video extras to an album.

    For every part of every track this looks in the part's directory for
    poster images, asks the matching audio helper (if there is one) for
    embedded art and sort titles, and optionally attaches a video extra to the
    track. Posters not seen during this pass are dropped at the end through
    ``metadata.posters.validate_keys``.

    Args:
        metadata: album metadata object that is updated in place.
        media: media object whose ``tracks`` are scanned.
        lang: not used by this function.
        find_extras: when true, look for a video extra next to each track.
        artist_extras: not used by this function; kept so existing callers
            that pass it keep working. Defaults to None rather than a shared
            mutable ``{}``.
        extra_type_map: name -> extra class mapping handed to findTrackExtra.
    """
    # Clear out the title to ensure stale data doesn't clobber other agents' contributions.
    metadata.title = None

    valid_posters = []

    for track in media.tracks:
        for item in media.tracks[track].items:
            for part in item.parts:
                filename = helpers.unicodize(part.file)
                path = os.path.dirname(filename)
                file_root = os.path.splitext(filename)[0]

                # Lower-cased name -> real directory entry, so the poster
                # lookup below is case-insensitive.
                path_files = dict((entry.lower(), entry) for entry in os.listdir(path))

                # Look for posters: the configured names plus the track's own
                # base name and the name of the containing directory.
                poster_files = config.POSTER_FILES + [os.path.basename(file_root), helpers.splitPath(path)[-1]]
                for ext in config.ART_EXTS:
                    for name in poster_files:
                        candidate = (name + '.' + ext).lower()
                        if candidate not in path_files:
                            continue
                        data = Core.storage.load(os.path.join(path, path_files[candidate]))
                        poster_name = hashlib.md5(data).hexdigest()
                        valid_posters.append(poster_name)
                        if poster_name not in metadata.posters:
                            metadata.posters[poster_name] = Proxy.Media(data)
                            Log('Local asset image added: ' + candidate + ', for file: ' + filename)
                        else:
                            Log('Skipping local poster since its already added')

                # If there is an appropriate AudioHelper, use it.
                audio_helper = audiohelpers.AudioHelpers(part.file)
                if audio_helper is not None:
                    try:
                        # A helper may hand back nothing when the tags could
                        # not be read; only extend the list with real results.
                        embedded_posters = audio_helper.process_metadata(metadata)
                        if embedded_posters:
                            valid_posters = valid_posters + embedded_posters

                        # Album sort title.
                        if hasattr(audio_helper, 'get_album_sort_title'):
                            album_sort_title = audio_helper.get_album_sort_title()
                            if album_sort_title and hasattr(metadata, 'title_sort'):
                                metadata.title_sort = album_sort_title

                        # Track sort title.
                        if hasattr(audio_helper, 'get_track_sort_title'):
                            track_sort_title = audio_helper.get_track_sort_title()
                            track_key = media.tracks[track].guid or track
                            if track_sort_title and hasattr(metadata.tracks[track_key], 'title_sort'):
                                metadata.tracks[track_key].title_sort = track_sort_title
                    except Exception as e:
                        # Tag handling stays best-effort (one broken file must
                        # not abort the album), but leave a trace in the log.
                        Log('Error processing audio tags for %s: %r' % (filename, e))

                # Look for a video extra for this track.
                if find_extras:
                    track_video = findTrackExtra(helpers.unicodize(part.file), extra_type_map)
                    if track_video is not None:
                        track_key = media.tracks[track].guid or track
                        metadata.tracks[track_key].extras.add(track_video)

    metadata.posters.validate_keys(valid_posters)
def findTrackExtra(file_path, extra_type_map, artist_extras=None):
    """Find video extras that belong to one audio track.

    Looks in the track's directory for video files whose name starts with the
    track's base name, following the format
    "track file name - pretty name (optional) - type (optional).ext".

    Args:
        file_path: path of the audio track.
        extra_type_map: type name -> extra class mapping; the last '-'
            separated component of a video name selects the class.
        artist_extras: optional dict that receives every video found, keyed by
            its file name. A fresh dict is used when omitted.

    Returns:
        The preferred regular or lyric music video for the track (ordered by
        getExtraSortOrder(), then title), or None if there is none. Videos of
        other types are only recorded in ``artist_extras``.
    """
    # A `{}` default would be one dict shared by every call that omits the
    # argument, silently accumulating entries for the life of the process.
    if artist_extras is None:
        artist_extras = {}

    directory = os.path.dirname(file_path)
    file_name = os.path.basename(file_path)
    file_root, file_ext = os.path.splitext(file_name)
    root_lower = file_root.lower()

    candidates = [f for f in os.listdir(directory)
                  if os.path.splitext(f)[1][1:].lower() in config.VIDEO_EXTS
                  and helpers.unicodize(f).lower().startswith(root_lower)]

    track_videos = []
    for video in candidates:
        video_path = os.path.join(directory, video)
        name_components = os.path.splitext(video)[0].split('-')

        # A recognised trailing component selects the extra type and is
        # removed from the title; anything else is a plain music video.
        extra_type = MusicVideoObject
        if len(name_components) > 1:
            type_component = re.sub(r'[ ._]+', '', name_components[-1].lower())
            if type_component in extra_type_map:
                extra_type = extra_type_map[type_component]
                name_components.pop(-1)

        # Use the video file name for the title unless we have a prettier one.
        pretty_title = '-'.join(name_components).strip()
        if len(pretty_title) > len(file_root):
            pretty_title = pretty_title.replace(file_root, '')
            if pretty_title.startswith(file_ext):
                pretty_title = pretty_title[len(file_ext):]
            pretty_title = re.sub(r'^[- ]+', '', pretty_title)

        track_video = extra_type(title=pretty_title, file=video_path)
        artist_extras[video] = track_video

        if extra_type in [MusicVideoObject, LyricMusicVideoObject]:
            Log('Found video %s for track: %s from file: %s' % (pretty_title, file_name, video_path))
            track_videos.append(track_video)
        else:
            Log('Skipping track video %s (only regular music videos allowed on tracks)' % video)

    if not track_videos:
        return None

    sort_order = getExtraSortOrder()
    # min() returns the first minimal element, i.e. the same item a stable
    # sort would place at index 0.
    return min(track_videos, key=lambda v: (sort_order[type(v)], v.title))
def _listVideoFiles(directory):
    # Names of the entries in `directory` that carry a known video extension.
    return [f for f in os.listdir(directory)
            if os.path.splitext(f)[1][1:].lower() in config.VIDEO_EXTS]


def _addArtistExtra(directory, file_name, extra_type_map, artist_extras, artist_name):
    # Parse one video and record it under its bare file name unless that name
    # is already present in `artist_extras`.
    if file_name in artist_extras:
        return
    Log('Found artist video: %s' % file_name)
    extra = parseArtistExtra(os.path.join(directory, file_name), extra_type_map, artist_name)
    if extra is not None:
        artist_extras[file_name] = extra


def findArtistExtras(path, extra_type_map, artist_extras, artist_name):
    """Collect artist-level video extras into ``artist_extras``.

    Sources, in order:
      1. video files directly inside ``path``;
      2. if the 'music_video_path' preference is set and exists: video files
         in it whose normalized name starts with the normalized artist name,
         and all video files inside a sub-directory whose normalized name
         equals the normalized artist name.

    Args:
        path: directory that holds the artist's tracks.
        extra_type_map: type name -> extra class mapping for parseArtistExtra.
        artist_extras: dict updated in place, keyed by bare file name; names
            already present are never replaced.
        artist_name: artist title; it is normalized here before use.
    """
    # parseArtistExtra compares normalizeArtist(<first name component>) with
    # the artist name it is given, so it needs the normalized form. Normalize
    # once up front; previously the videos in `path` were parsed with the raw
    # name, so the artist prefix was not removed unless the raw name happened
    # to equal its normalized form.
    artist_name = normalizeArtist(artist_name)

    # Look for other videos in this directory.
    for video in _listVideoFiles(path):
        _addArtistExtra(path, video, extra_type_map, artist_extras, artist_name)

    # Look for artist videos in the custom path if present.
    music_video_path = Prefs['music_video_path']
    if not music_video_path:
        return
    if not os.path.exists(music_video_path):
        Log('The specified local music video path doesn\'t exist: %s' % music_video_path)
        return

    local_files = [f for f in os.listdir(music_video_path)
                   if (os.path.splitext(f)[1][1:].lower() in config.VIDEO_EXTS
                       or os.path.isdir(os.path.join(music_video_path, f)))
                   and normalizeArtist(os.path.basename(f)).startswith(artist_name)
                   and f not in artist_extras]

    for local_file in local_files:
        local_path = os.path.join(music_video_path, local_file)

        # Go ahead and add files directly in the specific path matching the
        # "artist - title - type (optional).ext" convention.
        if os.path.isfile(local_path) and local_file not in artist_extras:
            _addArtistExtra(music_video_path, local_file, extra_type_map, artist_extras, artist_name)

        # Also add all the videos in the "local video root/artist" directory if we found one.
        elif os.path.isdir(local_path) and normalizeArtist(os.path.basename(local_file)) == artist_name:
            for artist_dir_file in _listVideoFiles(local_path):
                _addArtistExtra(local_path, artist_dir_file, extra_type_map, artist_extras, artist_name)
def parseArtistExtra(path, extra_type_map, artist_name):
    """Build an extra object from a video named "artist - title - type.ext".

    The last '-' separated component picks the extra class when it is a key of
    ``extra_type_map`` (default: MusicVideoObject); a leading component that
    normalizes to ``artist_name`` is dropped from the title. Returns None for
    concert videos when the server is older than 0.9.12.2.
    """
    base_name = os.path.splitext(os.path.basename(path))[0]
    pieces = base_name.split('-')

    # Set the type and whack the type component from the name if we found one.
    extra_type = MusicVideoObject
    if len(pieces) > 1:
        type_key = pieces[-1].lower().strip()
        if type_key in extra_type_map:
            extra_type = extra_type_map[type_key]
            del pieces[-1]

    # Only return concerts if we're new enough.
    is_concert = extra_type in [ConcertVideoObject]
    if is_concert and not Util.VersionAtLeast(Platform.ServerVersion, 0,9,12,2):
        Log('Found concert, but skipping, not new enough server.')
        return None

    # Whack the artist name if it's the first component and we have more than one.
    if len(pieces) > 1 and normalizeArtist(pieces[0]) == artist_name:
        pieces = pieces[1:]

    return extra_type(title='-'.join(pieces), file=helpers.unicodize(path))
def normalizeArtist(artist_name):
    """Return artist_name lower-cased with punctuation and spaces removed.

    Characters whose Unicode category starts with 'P' are dropped. If nothing
    is left, or anything goes wrong, the original value is returned unchanged.
    """
    try:
        text = helpers.unicodize(artist_name)
        kept = [ch for ch in text if not unicodedata.category(ch).startswith('P')]
        normalized = ''.join(kept).replace(' ', '').lower()
        return normalized if len(normalized) > 0 else artist_name
    except Exception as e:
        Log('Error normalizing artist: %s' % e)
        return artist_name
def shouldFindExtras():
    """Tell whether music video extras can be used.

    True only when ConcertVideoObject exists in this framework and the server
    is at least version 0.9.12.0; the reason is logged otherwise.
    """
    try:
        # Instantiating the class is just a probe: a NameError means the
        # framework does not provide it.
        ConcertVideoObject()
        if Util.VersionAtLeast(Platform.ServerVersion, 0,9,12,0):
            return True
        Log('Not adding extras: Server v0.9.12.0+ required')
        return False
    except NameError:
        Log('Not adding extras: Framework v2.6.2+ required')
        return False
def getExtraTypeMap():
    """Return the mapping from file-name type suffix to music extra class."""
    return dict(
        video=MusicVideoObject,
        live=LiveMusicVideoObject,
        lyrics=LyricMusicVideoObject,
        behindthescenes=BehindTheScenesObject,
        interview=InterviewObject,
        concert=ConcertVideoObject,
    )
def getExtraSortOrder():
    """Return extra class -> rank; lower ranks sort first."""
    ranked = [
        MusicVideoObject,
        LyricMusicVideoObject,
        ConcertVideoObject,
        LiveMusicVideoObject,
        BehindTheScenesObject,
        InterviewObject,
    ]
    return dict((extra_class, rank) for rank, extra_class in enumerate(ranked))
@@ -0,0 +1,286 @@
import os
import helpers
from mutagen import File as MFile
from mutagen.flac import Picture
class AudioHelper(object):
    """Base class of the per-format tag readers; it only stores the file path."""

    def __init__(self, filename):
        # Path of the audio file the concrete helper will read tags from.
        self.filename = filename
def AudioHelpers(filename):
    """Return a tag helper for ``filename`` or None.

    The file is opened with mutagen; the class name of the returned tag object
    decides which helper is used. None is returned when mutagen raises, when
    it returns None, or when no helper claims the tag type.
    """
    filename = helpers.unicodize(filename)
    try:
        tag = MFile(filename, None, True)
    except Exception as e:
        Log('Error getting file details for %s: %s' % (filename, e))
        return None

    if tag is None:
        return None

    for helper_class in (ID3AudioHelper, MP4AudioHelper, FLACAudioHelper, OGGAudioHelper):
        if helper_class.is_helper_for(type(tag).__name__):
            return helper_class(filename)
    return None
def parse_genres(genre):
    """Split a genre string on ';' if it contains one, otherwise on '/'."""
    separator = ';' if ';' in genre else '/'
    return genre.split(separator)
#####################################################################################################################
class ID3AudioHelper(AudioHelper):
    """Tag reader for files that carry ID3 tags."""

    # MIME type of an embedded APIC picture -> extension handed to Proxy.Media.
    # Any other MIME type yields an empty extension.
    _APIC_EXTENSIONS = {
        'image/jpeg': 'jpg',
        'image/jpg': 'jpg',
        'image/png': 'png',
        'image/gif': 'gif',
    }

    @classmethod
    def is_helper_for(cls, tagType):
        # All of these file types use ID3 tags like MP3
        return tagType in ('EasyID3', 'EasyMP3', 'EasyTrueAudio', 'ID3', 'MP3', 'TrueAudio', 'AIFF')

    def _get_frame(self, frame_id):
        """Return frame ``frame_id``, loading the tags first if necessary.

        Returns None when the file cannot be read, instead of failing with an
        AttributeError because ``self.tags`` was never set.
        """
        try:
            tags = getattr(self, 'tags', None)
            if tags is None:
                self.tags = tags = MFile(self.filename)
            return tags.get(frame_id)
        except Exception:
            return None

    def get_album_sort_title(self):
        """Return the TSOA frame, or None."""
        return self._get_frame('TSOA')

    def get_track_sort_title(self):
        """Return the TSOT frame, or None."""
        return self._get_frame('TSOT')

    def get_artist_sort_title(self):
        """Return the TSO2 frame, falling back to TSOP; None on any failure."""
        try:
            self.tags = MFile(self.filename)
            return self.tags.get('TSO2') or self.tags.get('TSOP')
        except Exception:
            return None

    def process_metadata(self, metadata):
        """Copy release year, genres and embedded art from the tags to ``metadata``.

        Returns:
            List of poster keys (MD5 hex digests) found in the file. The list
            is empty when the tags cannot be read, so callers can always
            concatenate the result.
        """
        Log('Reading ID3 tags from: ' + self.filename)
        try:
            self.tags = tags = MFile(self.filename)
            Log('Found tags: ' + str(tags.keys()))
        except Exception:
            Log('An error occurred while attempting to read ID3 tags from ' + self.filename)
            return []

        # Release Date
        try:
            year = tags.get('TDRC')
            if year is not None and len(year.text) > 0:
                metadata.originally_available_at = Datetime.ParseDate('01-01-' + str(year.text[0])).date()
        except Exception as e:
            Log('Exception reading TDRC (year): ' + str(e))

        # Genres
        try:
            genres = tags.get('TCON')
            if genres is not None and len(genres.text) > 0:
                metadata.genres.clear()
                for genre in genres.text:
                    for sub_genre in parse_genres(genre):
                        metadata.genres.add(sub_genre.strip())
        except Exception as e:
            Log('Exception reading TCON (genre): ' + str(e))

        # Posters
        valid_posters = []
        try:
            for frame in [f for f in tags if f.startswith('APIC:')]:
                picture = tags[frame]
                ext = self._APIC_EXTENSIONS.get(picture.mime, '')
                poster_name = hashlib.md5(picture.data).hexdigest()
                valid_posters.append(poster_name)
                if poster_name not in metadata.posters:
                    Log('Adding embedded APIC art: ' + poster_name)
                    metadata.posters[poster_name] = Proxy.Media(picture.data, ext = ext)
        except Exception as e:
            Log('Exception adding posters: ' + str(e))

        return valid_posters
#####################################################################################################################
class MP4AudioHelper(AudioHelper):
    """Tag reader for MP4 containers."""

    @classmethod
    def is_helper_for(cls, tagType):
        return tagType in ['MP4','EasyMP4']

    def _easy_tag(self, key):
        """Return the first value of ``key`` from the "easy" tag view, or None."""
        try:
            tags = MFile(self.filename, easy=True)
            return tags.get(key)[0]
        except Exception:
            # Missing tag, unreadable file, empty value list: all mean "no value".
            return None

    def get_track_sort_title(self):
        return self._easy_tag('titlesort')  # 'sonm'

    def get_album_sort_title(self):
        return self._easy_tag('albumsort')  # 'soal'

    def get_artist_sort_title(self):
        return self._easy_tag('artistsort')  # 'soar'

    def process_metadata(self, metadata):
        """Copy genres, release date and cover art from the tags to ``metadata``.

        Returns:
            List of poster keys (MD5 hex digests) found in the file. The list
            is empty when the file cannot be parsed, so callers can always
            concatenate the result.
        """
        Log('Reading MP4 tags from: ' + self.filename)
        try:
            tags = MFile(self.filename)
            Log('Found tags: ' + str(tags.keys()))
        except Exception:
            Log('An error occurred while attempting to parse the MP4 file: ' + self.filename)
            return []

        # Genres
        try:
            genres = tags.get('\xa9gen')
            if genres is not None and len(genres) > 0:
                metadata.genres.clear()
                for genre in genres:
                    for sub_genre in parse_genres(genre):
                        metadata.genres.add(sub_genre.strip())
        except Exception as e:
            Log('Exception reading \xa9gen (genre): ' + str(e))

        # Release Date
        try:
            release_date = tags.get('\xa9day')
            if release_date is not None and len(release_date) > 0:
                # Only the part before 'T' is handed to the date parser.
                metadata.originally_available_at = Datetime.ParseDate(release_date[0].split('T')[0])
        except Exception as e:
            Log('Exception reading \xa9day (release date)' + str(e))

        # Posters
        valid_posters = []
        try:
            covers = tags.get('covr')
            if covers is not None and len(covers) > 0:
                for cover in covers:
                    poster_name = hashlib.md5(cover).hexdigest()
                    valid_posters.append(poster_name)
                    if poster_name not in metadata.posters:
                        Log('Adding embedded cover art: ' + poster_name)
                        metadata.posters[poster_name] = Proxy.Media(cover)
        except Exception as e:
            Log('Exception adding posters: ' + str(e))

        return valid_posters
#####################################################################################################################
class FLACAudioHelper(AudioHelper):
    """Tag reader for FLAC files."""

    @classmethod
    def is_helper_for(cls, tagType):
        return tagType in ['FLAC']

    def process_metadata(self, metadata):
        """Copy genres, release date and embedded pictures to ``metadata``.

        Returns:
            List of poster keys (MD5 hex digests) found in the file. The list
            is empty when the file cannot be parsed, so callers can always
            concatenate the result.
        """
        Log('Reading FLAC tags from: ' + self.filename)
        try:
            tags = MFile(self.filename)
            Log('Found tags: ' + str(tags.keys()))
        except Exception:
            Log('An error occurred while attempting to parse the FLAC file: ' + self.filename)
            return []

        # Genres
        try:
            genres = tags.get('genre')
            if genres is not None and len(genres) > 0:
                metadata.genres.clear()
                for genre in genres:
                    for sub_genre in parse_genres(genre):
                        metadata.genres.add(sub_genre.strip())
        except Exception as e:
            Log('Exception reading genre: ' + str(e))

        # Release Date
        try:
            release_date = tags.get('date')
            if release_date is not None and len(release_date) > 0:
                metadata.originally_available_at = Datetime.ParseDate(release_date[0])
        except Exception as e:
            Log('Exception reading release date' + str(e))

        # Posters
        valid_posters = []
        try:
            covers = tags.pictures
            if covers is not None and len(covers) > 0:
                for cover in covers:
                    poster_name = hashlib.md5(cover.data).hexdigest()
                    valid_posters.append(poster_name)
                    if poster_name not in metadata.posters:
                        Log('Adding embedded cover art: ' + poster_name)
                        metadata.posters[poster_name] = Proxy.Media(cover.data)
        except Exception as e:
            Log('Exception adding posters: ' + str(e))

        return valid_posters
#####################################################################################################################
class OGGAudioHelper(AudioHelper):
    """Tag reader for Ogg Vorbis files."""

    @classmethod
    def is_helper_for(cls, tagType):
        return tagType in ['OggVorbis']

    def process_metadata(self, metadata):
        """Copy genres, release date and embedded pictures to ``metadata``.

        Returns:
            List of poster keys (MD5 hex digests) found in the file. The list
            is empty when the file cannot be parsed, so callers can always
            concatenate the result.
        """
        Log('Reading OGG tags from: ' + self.filename)
        try:
            tags = MFile(self.filename)
            Log('Found tags: ' + str(tags.keys()))
        except Exception:
            Log('An error occured while attempting to parse the OGG file: ' + self.filename)
            return []

        # Genres
        try:
            genres = tags.get('genre')
            if genres is not None and len(genres) > 0:
                metadata.genres.clear()
                for genre in genres:
                    for sub_genre in parse_genres(genre):
                        metadata.genres.add(sub_genre.strip())
        except Exception as e:
            Log('Exception reading genre: ' + str(e))

        # Release Date
        try:
            release_date = tags.get('date')
            if release_date is not None and len(release_date) > 0:
                metadata.originally_available_at = Datetime.ParseDate(release_date[0])
        except Exception as e:
            Log('Exception reading release date' + str(e))

        # Posters
        valid_posters = []
        try:
            covers = tags.get('metadata_block_picture')
            if covers is not None and len(covers) > 0:
                for cover in covers:
                    # Each value is base64 text that decodes to a picture
                    # block, which mutagen's Picture class parses.
                    poster = Picture(base64.standard_b64decode(cover))
                    poster_name = hashlib.md5(poster.data).hexdigest()
                    valid_posters.append(poster_name)
                    if poster_name not in metadata.posters:
                        Log('Adding embedded cover art: ' + poster_name)
                        metadata.posters[poster_name] = Proxy.Media(poster.data)
        except Exception as e:
            Log('Exception adding posters: ' + str(e))

        return valid_posters
@@ -0,0 +1,11 @@
# File extensions, written without the leading dot. Callers compare them
# against the lower-cased extension of a file name.
IMAGE_EXTS = ['jpg', 'png', 'jpeg', 'tbn']
ART_EXTS = ['jpg', 'jpeg', 'png', 'tbn']
AUDIO_EXTS = ['mp3']
SUBTITLE_EXTS = [
    'utf', 'utf8', 'utf-8', 'srt', 'smi', 'rt', 'ssa', 'aqt', 'jss', 'ass',
    'idx', 'sub', 'txt', 'psb',
]
VIDEO_EXTS = [
    '3g2', '3gp', 'asf', 'asx', 'avc', 'avi', 'avs', 'bivx', 'bup', 'divx',
    'dv', 'dvr-ms', 'evo', 'fli', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv',
    'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'nsv', 'nuv', 'ogm', 'ogv', 'tp',
    'pva', 'qt', 'rm', 'rmvb', 'sdp', 'svq3', 'strm', 'ts', 'ty', 'vdr',
    'viv', 'vob', 'vp3', 'wmv', 'wpl', 'wtv', 'xsp', 'xvid', 'webm',
]

# Base names (no extension) of files treated as posters / background art.
POSTER_FILES = ['poster', 'default', 'cover', 'movie', 'folder']
ART_FILES = ['fanart', 'art', 'background', 'backdrop']
@@ -0,0 +1,34 @@
import unicodedata
# Unicode control characters can appear in ID3v2 tags but are not legal in XML.
#
# RE_UNICODE_CONTROL is a regex source string (not a compiled pattern) made of
# two parts joined by '|':
#   1. a character class covering U+0000-U+0008, U+000B-U+000C, U+000E-U+001F
#      and U+FFFE-U+FFFF (tab U+0009, LF U+000A and CR U+000D fall outside
#      these ranges);
#   2. four alternatives built from the surrogate ranges below: a character
#      from the first range followed by one outside the second range, a
#      character outside the first range followed by one from the second
#      range, a first-range character at the end of the string, and a
#      second-range character at the start of the string.
# NOTE(review): the two-character alternatives also consume the neighbouring
# character, so substituting matches with '' removes that character as well -
# confirm this is intended.
# '%' binds tighter than '+', so only the last literal is a format string; its
# twelve %s slots are filled by the twelve unichr() values in the tuple.
RE_UNICODE_CONTROL = u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' + \
u'|' + \
u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' % \
(
# Each row is (first-range start, first-range end, second-range start,
# second-range end): U+D800-U+DBFF are the high surrogates, U+DC00-U+DFFF the
# low surrogates.
unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff)
)
# A platform independent way to split paths which might come in with different separators.
def splitPath(str):
    """Split on backslashes if the path contains any, otherwise on forward slashes."""
    separator = '\\' if '\\' in str else '/'
    return str.split(separator)
def unicodize(s):
    """Return ``s`` as NFC-normalized unicode with control characters removed.

    Byte strings are decoded as UTF-8. Text that is already unicode is
    normalized directly. Failures are logged and the best value obtained so
    far is returned, so this never raises for bad input.

    Args:
        s: byte string or unicode string (typically a file name or path).

    Returns:
        The cleaned unicode string, or ``s`` itself if it could not be
        converted.
    """
    filename = s
    try:
        # Calling .decode() on a unicode object makes Python 2 ASCII-encode it
        # first, which fails for any non-ASCII character and used to skip the
        # NFC normalization for exactly the names that need it.
        if not isinstance(s, unicode):
            s = s.decode('utf-8')
        filename = unicodedata.normalize('NFC', s)
    except Exception:
        # %-formatting instead of '+' so that logging cannot itself raise when
        # the input is not a string (e.g. None).
        Log('Failed to unicodize: %s' % (filename,))
    try:
        filename = re.sub(RE_UNICODE_CONTROL, '', filename)
    except Exception:
        Log('Couldn\'t strip control characters: %s' % (filename,))
    return filename
def cleanFilename(filename):
    """Return filename UTF-8 encoded, with every punctuation and whitespace
    character replaced by a space, then stripped and lower-cased."""
    separators = string.punctuation + string.whitespace
    to_spaces = string.maketrans(separators, ' ' * len(separators))
    encoded = filename.encode('utf-8')
    return string.translate(encoded, to_spaces).strip().lower()
@@ -0,0 +1,338 @@
import os, unicodedata
import config
import helpers
import subtitlehelpers
#####################################################################################################################
def findAssets(metadata, media_title, paths, type, parts=[]):
# Scan the given directories for local artwork / theme files (and, for movies,
# local video extras) and attach what is found to `metadata`.
#
#   metadata    - object whose posters / banners / art / themes / thumbs /
#                 extras containers are filled, depending on `type`.
#   media_title - used in log output and as the title of trailer extras.
#   paths       - directories to scan.
#   type        - 'season', 'show', 'episode' or 'movie'; selects the file-name
#                 patterns searched for (see search_tuples below).
#   parts       - media parts; the first supplies the "root file" name, the
#                 others are treated as stacked parts. Only read here, never
#                 modified.
#
# Nothing is returned; all results are written to `metadata`.
#
# Regular expressions used (case-insensitively, against the full path) to
# recognise samples and trailers when counting media files.
ignore_samples = ['[-\._ ]sample', 'sample[-\._ ]']
ignore_trailers = ['-trailer\.']
# Do a quick check to make sure we've got the extra types available in this framework version,
# and that the server is new enough to support them.
#
try:
# Instantiation is only a probe; `t` is not used afterwards.
t = InterviewObject()
if Util.VersionAtLeast(Platform.ServerVersion, 0,9,9,13):
find_extras = True
else:
find_extras = False
Log('Not adding extras: Server v0.9.9.13+ required')
except NameError, e:
Log('Not adding extras: Framework v2.5.0+ required')
find_extras = False
if find_extras:
# File/directory name suffix -> extra class.
extra_type_map = {'trailer' : TrailerObject,
'deleted' : DeletedSceneObject,
'behindthescenes' : BehindTheScenesObject,
'interview' : InterviewObject,
'scene' : SceneOrSampleObject}
# We start by building a dictionary of files to their absolute paths. We also need to know
# the number of media files that are actually present, in case the found local media asset
# is limited to a single instance per media file.
#
# path_files: lower-cased file name -> full path.
# multi_parts: name prefixes (up to and including the year) already counted.
path_files = {}
multi_parts = []
total_media_files = 0
# Base name (without extension) of the first part, or None when no parts were given.
root_file = getRootFile(helpers.unicodize(parts[0].file)) if parts else None
for path in paths:
path = helpers.unicodize(path)
for file_path in sorted(os.listdir(path)):
# When using os.listdir with a unicode path, it will always return a string using the
# NFD form. However, we internally are using the form NFC and therefore need to convert
# it to allow correct regex / comparisons to be performed.
#
file_path = helpers.unicodize(file_path)
full_path = os.path.join(path,file_path)
if os.path.isfile(full_path):
path_files[file_path.lower()] = full_path
# Only count real and distinct (not stacked) video files.
# should_count starts out true and is cleared by the first rule that rejects the file.
(root, ext) = os.path.splitext(file_path)
should_count = True
# Check for valid video file extension.
if ext.lower()[1:] not in config.VIDEO_EXTS:
should_count = False
# Don't count sample files if they're smaller than 300MB.
if should_count:
for rx in ignore_samples:
if re.search(rx, full_path, re.IGNORECASE) and os.path.getsize(full_path) < 300 * 1024 * 1024:
Log('%s looks like a sample, won\'t contribute to total media file count.' % file_path)
should_count = False
# Don't count trailer files.
if should_count:
for rx in ignore_trailers:
if re.search(rx, full_path, re.IGNORECASE):
Log('%s looks like a trailer, won\'t contribute to total media file count.' % file_path)
should_count = False
# Don't count dot files.
if should_count:
if root.lower().startswith('.'):
Log('%s won\'t contribute to total media file count.' % file_path)
should_count = False
# Don't count multi-part files (stack everything up to and including the year).
if should_count:
# A four-digit number starting with 1 or 2, delimited by brackets, dots or dashes.
year = re.search(r'([\(\[\.\-])([1-2][0-9]{3})([\.\-\)\]_,+])', file_path)
if year:
multi_part = file_path[0:year.end()]
if multi_part in multi_parts:
should_count = False
Log('%s looks like part of a multi-version set, won\'t contribute to total media file count.' % file_path)
else:
multi_parts.append(multi_part)
# Don't count stacked parts.
if should_count:
if full_path in [p.file for p in parts[1:]]:
should_count = False
Log('%s looks like a stacked part, won\'t contribute to total media file count.' % file_path)
# Don't count things that follow the "-extra" naming convention.
if should_count and find_extras:
for key in extra_type_map.keys():
if root.endswith('-' + key):
Log('%s looks like a %s extra, won\'t contribute to total media file count.' % (file_path, key))
should_count = False
# Don't count things that follow specific trailer naming conventions.
if should_count:
if root == 'trailer' or root.startswith('movie-trailer'):
Log('%s looks like a trailer, won\'t contribute to total media file count.' % (file_path))
should_count = False
if should_count:
total_media_files += 1
# Local extras are only looked for on movies, and only when exactly one media file was counted.
# NOTE(review): `path` below is the loop variable of `for path in paths`; if this
# section runs after that loop has finished, only the last directory is searched - confirm.
if find_extras and type == 'movie':
extras = []
# Strips non-word characters and spaces from directory names before comparing them to extra types.
re_strip = Regex('[\W ]+')
if total_media_files != 1:
Log('Found %d media files in this directory, skipping local extras search: %s' % (total_media_files, path))
else:
# Look for extras in named directories.
Log('Looking for local extras in path: ' + path)
for root, dirs, files in os.walk(path):
for d in dirs:
for key in extra_type_map.keys():
if re_strip.sub('', d.lower()).startswith(key):
for f in os.listdir(os.path.join(root, d)):
(fn, ext) = os.path.splitext(f)
if not fn.startswith('.') and ext[1:] in config.VIDEO_EXTS:
# On Windows, os.walk() likes to prepend the "extended-length path prefix" to root.
# This causes issues later on when this path is converted to the file:// URL for
# serialization and later consumption by PMS, so clean it up here.
#
root = re.sub(r'^\\\\\?\\', '', root)
Log('Found %s extra: %s' % (key, f))
extras.append({'type' : key, 'title' : helpers.unicodize(fn), 'file' : os.path.join(root, d, f)})
continue
# Look for filenames following the "-extra" convention and a couple of other special cases.
for f in os.listdir(path):
(fn, ext) = os.path.splitext(f)
# Files named exactly 'trailer' or starting with 'movie-trailer'.
if (fn == 'trailer' or fn.startswith('movie-trailer')) and not fn.startswith('.') and ext[1:] in config.VIDEO_EXTS:
Log('Found trailer extra, renaming with title: ' + media_title)
# NOTE(review): `key` is not assigned in this branch; it still holds whatever an
# earlier `for key in ...` loop left behind, so a trailer can be filed under a
# different extra type. 'trailer' looks like the intended value - confirm.
extras.append({'type' : key, 'title' : media_title, 'file' : os.path.join(path, f)})
# Files following the "-extra" convention.
else:
for key in extra_type_map.keys():
if not fn.startswith('.') and fn.endswith('-' + key) and ext[1:] in config.VIDEO_EXTS:
Log('Found %s extra: %s' % (key, f))
# The title is everything before the last '-' component, re-joined with spaces.
title = ' '.join(fn.split('-')[:-1])
extras.append({'type' : key, 'title' : helpers.unicodize(title), 'file' : os.path.join(path, f)})
# Make sure extras are sorted alphabetically and by type.
# Two passes with a stable sort: by title first, then by the position of the type in type_order.
type_order = ['trailer', 'behindthescenes', 'interview', 'deleted', 'scene', 'sample']
extras.sort(key=lambda e: e['title'])
extras.sort(key=lambda e: type_order.index(e['type']))
for extra in extras:
metadata.extras.add(extra_type_map[extra['type']](title=extra['title'], file=extra['file']))
Log('Added %d extras' % len(metadata.extras))
Log('Looking for %s media (%s) in %d paths (root file: %s) with %d media files.', type, media_title, len(paths), root_file, total_media_files)
Log('Paths: %s', ", ".join([ helpers.unicodize(p) for p in paths ]))
# Figure out what regexs to use.
# Each entry is [file-name pattern, target container, allowed extensions, limited];
# `limited` entries are subject to the media-file-count check further down.
search_tuples = []
if type == 'season':
search_tuples += [['season-?0?%s[-a-z]?(-poster)?' % metadata.index, metadata.posters, config.IMAGE_EXTS, False]]
search_tuples += [['season-?0?%s-banner[-a-z]?' % metadata.index, metadata.banners, config.IMAGE_EXTS, False]]
if int(metadata.index) == 0: # Season zero, also look for Frodo-compliant 'specials' artwork.
search_tuples += [['season-specials-poster', metadata.posters, config.IMAGE_EXTS, False]]
search_tuples += [['season-specials-banner', metadata.banners, config.IMAGE_EXTS, False]]
elif type == 'show':
search_tuples += [['(show|poster|folder)-?[0-9]?', metadata.posters, config.IMAGE_EXTS, False]]
search_tuples += [['banner-?[0-9]?', metadata.banners, config.IMAGE_EXTS, False]]
search_tuples += [['(fanart|art|background|backdrop)-?[0-9]?', metadata.art, config.IMAGE_EXTS, False]]
search_tuples += [['theme-?[0-9]?', metadata.themes, config.AUDIO_EXTS, False]]
elif type == 'episode':
search_tuples += [[re.escape(root_file) + '(-|-thumb)?[0-9]?', metadata.thumbs, config.IMAGE_EXTS, False]]
elif type == 'movie':
search_tuples += [['(poster|default|cover|movie|folder|' + re.escape(root_file) + ')-?[0-9]?', metadata.posters, config.IMAGE_EXTS, True]]
search_tuples += [['(fanart|art|background|backdrop|' + re.escape(root_file) + '-fanart' + ')-?[0-9]?', metadata.art, config.IMAGE_EXTS, True]]
for (pattern, media_list, extensions, limited) in search_tuples:
# Hashes of the files that matched this pattern.
valid_keys = []
sort_index = 1
# Candidates are visited in order of their lower-cased name without extension.
file_path_keys = sorted(path_files.keys(), key = lambda x: os.path.splitext(x)[0])
for file_path in file_path_keys:
for ext in extensions:
# NOTE(review): the '.' between pattern and extension is an unescaped regex dot and
# re.match only anchors at the start, so names with trailing text also match - confirm.
if re.match('%s.%s' % (pattern, ext), file_path, re.IGNORECASE):
# Use a pattern if it's unlimited, or if there's only one media file.
# NOTE(review): root_file is None when no parts were passed; the last test would then
# fail on `.lower()` if it is reached - confirm callers always pass parts here.
if (limited and total_media_files == 1) or (not limited) or (file_path.find(root_file.lower()) == 0):
# Read data and hash it.
data = Core.storage.load(path_files[file_path])
media_hash = hashlib.md5(data).hexdigest()
# See if we need to add it.
valid_keys.append(media_hash)
if media_hash not in media_list:
media_list[media_hash] = Proxy.Media(data, sort_order = sort_index)
sort_index = sort_index + 1
Log(' Local asset added: %s (%s)', path_files[file_path], media_hash)
else:
Log('Skipping file %s because there are %d media files.', file_path, total_media_files)
Log('Found %d valid things for pattern %s (ext: %s)', len(valid_keys), pattern, str(extensions))
# Hand the container the hashes that matched this pattern.
media_list.validate_keys(valid_keys)
def getRootFile(filename):
    """Return the base name of ``filename`` without directory or extension.

    The former 'video_ts' branch only recomputed a local ``path`` that was
    never used, so it has been removed; the result is the same for every
    input, including files inside a VIDEO_TS directory.
    """
    basename = os.path.basename(filename)
    return os.path.splitext(basename)[0]
#####################################################################################################################
def findSubtitles(part):
# Find subtitle files that belong to the media part `part` and register them
# on `part.subtitles`, then prune entries that were not found this time.
#
# Directories searched: the part's own directory, subtitle sub-folders chosen
# through the "scanAll" / "subFolder" / "subFolderCustom" preferences (only
# those that exist), and a global "Subtitles" folder under
# Core.app_support_path if it exists.
#
# Nothing is returned.
#
# lang_sub_map: language -> list of subtitle entries found in this run.
lang_sub_map = {}
part_filename = helpers.unicodize(part.file)
part_basename = os.path.splitext(os.path.basename(part_filename))[0]
paths = [ os.path.dirname(part_filename) ]
# Check for local subtitles subdirectory
sub_dirs_default = ["sub", "subs", "subtitle", "subtitles"]
sub_dir_base = paths[0]
sub_dir_list = []
if Prefs["scanAll"]:
# not only use the subtitle sub-folders we know, but also search for capitalized versions of them
for sub_dir in sub_dirs_default + [s.capitalize() for s in sub_dirs_default]:
sub_dir_list.append(os.path.join(sub_dir_base, sub_dir))
else:
if Prefs["subFolder"] != "current folder":
# got selected subfolder
sub_dir_list.append(os.path.join(sub_dir_base, Prefs["subFolder"]))
sub_dir_custom = Prefs["subFolderCustom"].strip() if bool(Prefs["subFolderCustom"]) else None
if sub_dir_custom:
# got custom subfolder
# NOTE(review): only a leading "/" is treated as absolute, so a Windows path such as
# "C:\subs" would be joined onto the media directory - confirm whether os.path.isabs is wanted.
if sub_dir_custom.startswith("/"):
# absolute folder
sub_dir_list.append(sub_dir_custom)
else:
# relative folder
sub_dir_list.append(os.path.join(sub_dir_base, sub_dir_custom))
# Only candidate folders that actually exist are searched.
for sub_dir in sub_dir_list:
if os.path.isdir(sub_dir):
paths.append(sub_dir)
# Check for a global subtitle location
global_subtitle_folder = os.path.join(Core.app_support_path, 'Subtitles')
if os.path.exists(global_subtitle_folder):
paths.append(global_subtitle_folder)
# We start by building a dictionary of files to their absolute paths. We also need to know
# the number of media files that are actually present, in case the found local media asset
# is limited to a single instance per media file.
#
# file_paths: lower-cased file name -> full path; a later directory replaces an
# earlier entry with the same lower-cased name.
file_paths = {}
total_media_files = 0
for path in paths:
path = helpers.unicodize(path)
for file_path_listing in os.listdir(path):
# When using os.listdir with a unicode path, it will always return a string using the
# NFD form. However, we internally are using the form NFC and therefore need to convert
# it to allow correct regex / comparisons to be performed.
#
file_path_listing = helpers.unicodize(file_path_listing)
if os.path.isfile(os.path.join(path, file_path_listing)):
file_paths[file_path_listing.lower()] = os.path.join(path, file_path_listing)
# If we've found an actual media file, we should record it.
(root, ext) = os.path.splitext(file_path_listing)
if ext.lower()[1:] in config.VIDEO_EXTS:
total_media_files += 1
Log('Looking for subtitle media in %d paths with %d media files.', len(paths), total_media_files)
Log('Paths: %s', ", ".join([ helpers.unicodize(p) for p in paths ]))
for file_path in file_paths.values():
# A candidate matches the part when its base name equals the part's base name, either
# directly or after dropping one more dot-separated suffix (local_basename2).
local_basename = helpers.unicodize(os.path.splitext(os.path.basename(file_path))[0])
local_basename2 = local_basename.rsplit('.', 1)[0]
filename_matches_part = local_basename == part_basename or local_basename2 == part_basename
# If the file is located within the global subtitle folder and it's name doesn't match exactly
# then we should simply ignore it.
#
if file_path.count(global_subtitle_folder) and not filename_matches_part:
continue
# If we have more than one media file within the folder and located filename doesn't match
# exactly then we should simply ignore it.
#
if total_media_files > 1 and not filename_matches_part:
continue
# SubtitleHelpers returns None when no helper handles this file.
subtitle_helper = subtitlehelpers.SubtitleHelpers(file_path)
if subtitle_helper != None:
local_lang_map = subtitle_helper.process_subtitles(part)
for new_language, subtitles in local_lang_map.items():
# Add the possible new language along with the located subtitles so that we can validate them
# at the end...
#
if not lang_sub_map.has_key(new_language):
lang_sub_map[new_language] = []
lang_sub_map[new_language] = lang_sub_map[new_language] + subtitles
# Now whack subtitles that don't exist anymore.
for language in lang_sub_map.keys():
part.subtitles[language].validate_keys(lang_sub_map[language])
# Now whack the languages that don't exist anymore.
# NOTE(review): an empty dict is passed here while the call above passes a list - presumably
# any empty container works; confirm against validate_keys.
for language in list(set(part.subtitles.keys()) - set(lang_sub_map.keys())):
part.subtitles[language].validate_keys({})
@@ -0,0 +1,127 @@
import re, unicodedata
import config
import helpers
class SubtitleHelper(object):
    """Common base for the subtitle helpers; it only records the file it wraps."""

    def __init__(self, filename):
        # Path of the subtitle file the concrete helper will inspect.
        self.filename = filename
def SubtitleHelpers(filename):
    """Build the helper responsible for `filename`.

    The helper classes are probed in order (VobSub first, then the default
    helper); the first one whose is_helper_for() accepts the file is
    instantiated. Returns None when no helper claims the file.
    """
    filename = helpers.unicodize(filename)
    candidates = (VobSubSubtitleHelper, DefaultSubtitleHelper)
    # Lazy generator: probing stops at the first helper that accepts the file.
    chosen = next((helper_cls for helper_cls in candidates
                   if helper_cls.is_helper_for(filename)), None)
    if chosen is None:
        return None
    return chosen(filename)
#####################################################################################################################
class VobSubSubtitleHelper(SubtitleHelper):
    """Helper for VobSub subtitles: an .idx index file paired with a .sub data file."""

    @classmethod
    def is_helper_for(cls, filename):
        """Return True for an .idx/.sub file that has an .idx file next to it."""
        (file, file_extension) = os.path.splitext(filename)

        # We only support idx (and maybe sub)
        if file_extension.lower() not in ['.idx', '.sub']:
            return False

        # If we've been given a sub, we only support it if there exists a matching idx file
        return os.path.exists(file + '.idx')

    def process_subtitles(self, part):
        """Register every language stream listed in the .idx file on `part`.

        Returns a dict mapping each language code found to a list holding the
        idx basename once per stream. An empty dict is returned when the
        helper was given the .sub file, when the .sub companion is missing,
        or when the idx content does not look like a VobSub index.
        """
        lang_sub_map = {}

        # We don't directly process the sub file, only the idx. The comparison is
        # case-insensitive to agree with is_helper_for(), which also accepts '.SUB'.
        (file, ext) = os.path.splitext(self.filename)
        if ext.lower() == '.sub':
            return lang_sub_map

        # If we have an idx file, we need to confirm there is an identically named sub file.
        sub_filename = file + ".sub"
        if not os.path.exists(sub_filename):
            return lang_sub_map

        Log('Attempting to parse VobSub file: ' + self.filename)
        idx = Core.storage.load(self.filename)
        if idx.count('VobSub index file') == 0:
            Log('The idx file does not appear to be a VobSub, skipping...')
            return lang_sub_map

        languages = {}
        language_index = 0
        basename = os.path.basename(self.filename)

        # Each "id: xx" line declares one stream; the running counter is passed on as its index.
        for language in re.findall('\nid: ([A-Za-z]{2})', idx):
            Log('Found .idx subtitle file: ' + self.filename + ' language: ' + language + ' stream index: ' + str(language_index))
            languages.setdefault(language, []).append(
                Proxy.LocalFile(self.filename, index = str(language_index), format = "vobsub"))
            language_index += 1
            lang_sub_map.setdefault(language, []).append(basename)

        for language, subs in languages.items():
            part.subtitles[language][basename] = subs

        return lang_sub_map
#####################################################################################################################
class DefaultSubtitleHelper(SubtitleHelper):
    """Helper for plain sidecar subtitle files whose extension is in config.SUBTITLE_EXTS."""

    @classmethod
    def is_helper_for(cls, filename):
        """Return True when the lower-cased extension is listed in config.SUBTITLE_EXTS."""
        (file, file_extension) = os.path.splitext(filename)
        return file_extension.lower()[1:] in config.SUBTITLE_EXTS

    def process_subtitles(self, part):
        """Register this subtitle file on `part` under the language taken from its name.

        Returns {language: [basename]}, or an empty dict when a .txt/.sub
        file cannot be read or its format is not recognised.
        """
        lang_sub_map = {}

        basename = os.path.basename(self.filename)
        (file, ext) = os.path.splitext(self.filename)

        # Remove the initial '.' and normalise the case: is_helper_for() accepts the
        # extension case-insensitively, so 'Movie.SRT' must be handled like 'Movie.srt'.
        ext = ext[1:].lower()

        # Attempt to extract the language from the filename (e.g. Avatar (2009).eng)
        language = ""
        language_match = re.match(r".+\.([^\.]+)$", file)
        if language_match and len(language_match.groups()) == 1:
            language = language_match.groups()[0]
        language = Locale.Language.Match(language)

        codec = None
        sub_format = None
        if ext in ['txt', 'sub']:
            try:
                file_contents = Core.storage.load(self.filename)
                lines = [line.strip() for line in file_contents.splitlines(True)]
                # Only the second line is inspected; a file with fewer than two lines
                # raises IndexError and is reported through the handler below.
                if re.match(r'^\{[0-9]+\}\{[0-9]*\}', lines[1]):
                    sub_format = 'microdvd'
                elif re.match(r'^[0-9]{1,2}:[0-9]{2}:[0-9]{2}[:=,]', lines[1]):
                    sub_format = 'txt'
                elif '[SUBTITLE]' in lines[1]:
                    sub_format = 'subviewer'
                else:
                    Log("The subtitle file does not have a known format, skipping... : " + self.filename)
                    return lang_sub_map
            except Exception:
                Log("An error occurred while attempting to parse the subtitle file, skipping... : " + self.filename)
                return lang_sub_map

        if codec is None and ext in ['ass', 'ssa', 'smi', 'srt', 'psb']:
            codec = ext.replace('ass', 'ssa')
            if sub_format is None:
                sub_format = codec

        Log('Found subtitle file: ' + self.filename + ' language: ' + language + ' codec: ' + str(codec) + ' format: ' + str(sub_format))
        part.subtitles[language][basename] = Proxy.LocalFile(self.filename, codec = codec, format = sub_format)

        lang_sub_map[language] = [ basename ]
        return lang_sub_map
@@ -0,0 +1,170 @@
import os
import helpers
from mutagen import File
from mutagen.mp4 import MP4
class VideoHelper(object):
    """Common base for the video helpers; it only records the file it wraps."""

    def __init__(self, filename):
        # Path of the video file the concrete helper will read tags from.
        self.filename = filename
def VideoHelpers(filename):
    """Build the video helper responsible for `filename`.

    The extension of the file's basename is offered to each helper class;
    the first class that accepts it is instantiated with the full
    (unicodized) path. Returns None when no helper accepts the extension.
    """
    filename = helpers.unicodize(filename)
    ext = os.path.splitext(os.path.basename(filename))[1]
    for helper_cls in (MP4VideoHelper,):
        if helper_cls.is_helper_for(ext):
            return helper_cls(filename)
    return None
#####################################################################################################################
class MP4VideoHelper(VideoHelper):
    """Copies tags embedded in .mp4/.m4v/.mov files onto Plex metadata objects."""

    @classmethod
    def is_helper_for(cls, file_extension):
        """Return True for the extensions this helper reads ('.mp4', '.m4v', '.mov')."""
        return file_extension.lower() in ['.mp4', '.m4v', '.mov']

    def process_metadata(self, metadata, episode = None):
        """Read the file's tags and apply them to `metadata` (or to `episode` if given).

        Every tag is handled best-effort: a missing or malformed tag leaves
        the corresponding field untouched. Genres are always written to
        `metadata`, even when an episode is supplied.
        """
        # Imported locally: this module's import block does not bring plistlib into
        # scope, and the NameError used to be swallowed by the handler around the
        # iTunMOVI parse, so directors/writers/cast/studio from it were never used.
        import plistlib

        if episode is None:
            item = metadata
        else:
            item = episode

        Log('Reading MP4 tags')
        try:
            tags = File(self.filename, options=[MP4])
        except Exception as e:
            Log('An error occurred while attempting to parse the MP4 file: ' + self.filename)
            Log(str(e))
            return

        if tags is None:
            Log('Not reading tags from %s because it doesn\'t look like an MP4 file.' % self.filename)
            return

        # Coverart
        try:
            picture = Proxy.Media(str(tags["covr"][0]))
            # If we're dealing with an actual episode, it uses thumbs rather than posters.
            if episode is not None:
                item.thumbs['atom_coverart'] = picture
            else:
                item.posters['atom_coverart'] = picture
        except Exception:
            pass  # no usable cover art tag

        # Title
        try:
            item.title = tags["\xa9nam"][0]
        except Exception:
            pass

        # Sort Title
        try:
            item.title_sort = tags["sonm"][0]
        except Exception:
            pass

        # Summary (long or short)
        try:
            try:
                summary = tags["ldes"][0]
            except Exception:
                summary = tags["desc"][0]
            item.summary = summary
        except Exception:
            pass

        # Genres: the tag may separate entries with ':', ',' or '/' (checked in that order).
        try:
            if "\xa9gen" in tags:
                genres = tags["\xa9gen"][0]
            else:
                genres = tags["gnre"][0]
            if len(genres) > 0:
                if ':' in genres:
                    genre_list = genres.split(':')
                elif ',' in genres:
                    genre_list = genres.split(',')
                else:
                    genre_list = genres.split('/')
                metadata.genres.clear()
                for genre in genre_list:
                    metadata.genres.add(genre.strip())
        except Exception:
            pass

        # Release Date & Year
        try:
            releaseDate = tags["\xa9day"][0]
            releaseDate = releaseDate.split('T')[0]
            parsedDate = Datetime.ParseDate(releaseDate)
            item.originally_available_at = parsedDate.date()
            item.year = parsedDate.year
        except Exception:
            pass

        # Content Rating: second '|'-separated field of the iTunEXTC tag.
        try:
            rating = tags["----:com.apple.iTunes:iTunEXTC"][0].split('|')[1]
            if len(rating) > 0:
                item.content_rating = rating
        except Exception:
            pass

        # Look for iTunes-style metadata, use regular tags otherwise
        try:
            pl = plistlib.readPlistFromString(str(tags["----:com.apple.iTunes:iTunMOVI"][0]))
        except Exception:
            pl = None

        # Directors
        try:
            if pl and 'directors' in pl and pl['directors']:
                item.directors.clear()
                for director in pl['directors']:
                    item.directors.add(director['name'])
        except Exception:
            pass

        # Writers
        try:
            if pl and 'screenwriters' in pl and pl['screenwriters']:
                item.writers.clear()
                for writer in pl['screenwriters']:
                    item.writers.add(writer['name'])
        except Exception:
            pass

        # Cast: prefer the plist, fall back to the comma separated artist tag.
        try:
            if pl and 'cast' in pl and pl['cast']:
                item.roles.clear()
                for actor in pl['cast']:
                    role = item.roles.new()
                    role.actor = actor['name']
            else:
                artists = tags["\xa9ART"][0]
                if len(artists) > 0:
                    artist_list = artists.split(',')
                    item.roles.clear()
                    for artist in artist_list:
                        role = item.roles.new()
                        role.actor = artist.strip()
        except Exception:
            pass

        # Studio: prefer the plist, fall back to the copyright tag.
        try:
            if pl and 'studio' in pl and pl['studio']:
                item.studio = pl['studio']
            else:
                try:
                    copyright = tags["cprt"][0]
                    if len(copyright) > 0:
                        item.studio = copyright
                except Exception:
                    pass
        except Exception:
            pass

        # Collection: the album tag, split on '/'.
        try:
            albums = tags["\xa9alb"][0]
            if len(albums) > 0:
                album_list = albums.split('/')
                item.collections.clear()
                for album in album_list:
                    item.collections.add(album.strip())
        except Exception:
            pass
@@ -0,0 +1,27 @@
[
{
"id": "scanAll",
"label": "Scan for subtitles in all default folders (sub, subs, subtitle, subtitles) + custom if specified",
"type": "bool",
"default": "true"
},
{
"id": "subFolder",
"label": "Subtitle Folder (\"current folder\" is the folder the current media file lives in)",
"type": "enum",
"values": ["current folder", "sub", "subs", "subtitle", "subtitles"],
"default": "current folder"
},
{
"id": "subFolderCustom",
"label": "Custom Subtitle folder (computes to real paths; use for example \"bla\" as a subfolder of the current media file folder - can use real paths as well)",
"type": "text",
"default": ""
},
{
"id": "music_video_path",
"label": "Local music video path",
"type": "text",
"default": ""
}
]
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleIdentifier</key>
<string>com.plexapp.agents.localmediaextended</string>
<key>PlexFrameworkVersion</key>
<string>2</string>
<key>PlexPluginClass</key>
<string>Agent</string>
<key>PlexPluginCodePolicy</key>
<string>Elevated</string>
</dict>
</plist>
+315
View File
@@ -0,0 +1,315 @@
# ID3.py version 1.0
# Module for manipulating ID3 informational tags in MP3 audio files
# $Id: ID3.py,v 1.1 2002/09/10 21:04:52 elan Exp $
# Written 2 May 1999 by Ben Gertzfield <che@debian.org>
# This work is released under the GNU GPL, version 2 or later.
# Modified 10 June 1999 by Arne Zellentin <arne@unix-ag.org> to
# fix bug with overwriting last 128 bytes of a file without an
# ID3 tag
# Patches from Jim Speth <speth@end.com> and someone whose email
# I've forgotten at the moment (huge apologies, I didn't save the
# entire mail, just the patch!) for so-called ID3 v1.1 support,
# which makes the last two bytes of the comment field signify a
# track number. If the first byte is null but the second byte
# is not, the second byte is assumed to signify a track number.
# Also thanks to Jim for the simple function to remove nulls and
# whitespace from the ends of ID3 tags. I'd like to add a boolean
# flag defaulting to false to the ID3() constructor signifying whether
# or not to remove whitespace, just in case old code depended on the
# old behavior for some reason, but that'd make any code that wanted
# to use the stripping behavior not work with old ID3.py. Bleh.
# This is the first thing I've ever written in Python, so bear with
# me if it looks terrible. In a few years I'll probably look back at
# this and laugh and laugh..
# Constructor:
#
# ID3(filename)
# Opens filename and tries to parse its ID3 header. If the ID3 header
# is invalid or the file access failed, raises InvalidTagError.
#
# When object is deconstructed, if any of the class data (below) have
# been changed, opens the file again read-write and writes out the
# new header. If the header is to be deleted, truncates the last
# 128 bytes of the file.
#
# Note that if ID3 cannot write the tag out to the file upon
# deconstruction, InvalidTagError will be raised and ignored
# (as we are in __del__, and exceptions just give warnings when
# raised in __del__.)
# Class Data of Interest:
#
# Note that all ID3 fields, unless otherwise specified, are a maximum of
# 30 characters in length. If a field is set to a string longer than
# the maximum, it will be truncated when it's written to disk.
#
# ID3.title
# Title of the song.
# ID3.artist
# Artist/creator of the song.
# ID3.album
# Title of the album the song is from.
# ID3.year
# Year the song was released. Maximum of 4 characters (Y10K bug!)
# ID3.genre
# Genre of the song. Integer value from 0 to 255. Genre specification
# comes from (sorry) WinAMP. http://mp3.musichall.cz/id3master/faq.htm
# has a list of current genres; I spell-checked this list against
# WinAMP's by running strings(1) on the file Winamp/Plugins/in_mp3.dll
# and made a few corrections.
# ID3.comment
# Comment about the song.
# ID3.track
# Track number of the song. None if undefined.
#
# ID3.genres
# List of all genres. ID3.genre above is used to index into this
# list. ID3.genres is current as of WinAMP 1.92.
# Methods of Interest:
#
# write()
# If the class data above have changed, opens the file given
# to the constructor read-write and writes out the new header.
# If the header is flagged for deletion (see delete() below)
# truncates the last 128 bytes of the file to remove the header.
#
# NOTE: write() is called from ID3's deconstructor, so it's technically
# unnecessary to call it. However, write() can raise an InvalidTagError,
# which can't be caught during deconstruction, so generally it's
# nicer to call it when writing is desired.
#
# delete()
# Flags the ID3 tag for deletion upon destruction of the object
#
# find_genre(genre_string)
# Searches for the numerical value of the given genre string in the
# ID3.genres table. The search is performed case-insensitively. Returns
# the index into ID3.genres, or -1 if the genre is not found.
#
import string
import re
def lengthen(string, num_spaces):
    """Return `string` cut to at most `num_spaces` characters and NUL-padded to that width."""
    return string[:num_spaces].ljust(num_spaces, '\0')
# We would normally use string.rstrip(), but that doesn't remove \0 characters.
def strip_padding(s):
    """Clean a raw ID3v1 field: re-encode to UTF-8 and drop padding.

    The field is treated as ISO-8859-1 and re-encoded as UTF-8, trailing
    whitespace/NUL characters are removed, and everything from the first
    remaining NUL up to the end of that line is cut.
    """
    try:
        # Change \222 to ' (must be strange keyboard). This has to happen on the
        # decoded text: after UTF-8 encoding the character is the two bytes
        # \xc2\x92, and replacing only the second one leaves a stray \xc2 behind.
        s = s.decode('iso-8859-1').replace(u"\x92", u"'").encode('utf-8')
    except Exception:
        # Input that cannot be decoded is kept as it is; only the quote is mapped.
        s = s.replace("\222", "'")

    while len(s) > 0 and s[-1] in (string.whitespace + "\0"):
        s = s[:-1]

    # Get rid of everything after \0...the above code doesn't always
    # work.
    s = re.sub("\000.*", "", s)

    return s
class InvalidTagError(Exception):
    """Raised when an ID3 tag cannot be read from, or written to, a file.

    Derives from Exception so it can be raised and caught like any other
    error; the message stays available as `msg`.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg
class ID3:
    """Reader/writer for the 128-byte ID3v1 / v1.1 tag at the end of a file.

    Assigning to title, artist, album, year, comment, track or genre marks
    the object as modified (see __setattr__); write(), which also runs from
    __del__, then stores the tag back into the file.
    """

    genres = [
        "Blues", "Classic Rock", "Country", "Dance", "Disco", "Funk",
        "Grunge", "Hip-Hop", "Jazz", "Metal", "New Age", "Oldies", "Other",
        "Pop", "R&B", "Rap", "Reggae", "Rock", "Techno", "Industrial",
        "Alternative", "Ska", "Death Metal", "Pranks", "Soundtrack",
        "Euro-Techno", "Ambient", "Trip-Hop", "Vocal", "Jazz+Funk", "Fusion",
        "Trance", "Classical", "Instrumental", "Acid", "House", "Game",
        "Sound Clip", "Gospel", "Noise", "Alt. Rock", "Bass", "Soul",
        "Punk", "Space", "Meditative", "Instrum. Pop", "Instrum. Rock",
        "Ethnic", "Gothic", "Darkwave", "Techno-Indust.", "Electronic",
        "Pop-Folk", "Eurodance", "Dream", "Southern Rock", "Comedy",
        "Cult", "Gangsta", "Top 40", "Christian Rap", "Pop/Funk", "Jungle",
        "Native American", "Cabaret", "New Wave", "Psychadelic", "Rave",
        "Showtunes", "Trailer", "Lo-Fi", "Tribal", "Acid Punk", "Acid Jazz",
        "Polka", "Retro", "Musical", "Rock & Roll", "Hard Rock", "Folk",
        "Folk/Rock", "National Folk", "Swing", "Fusion", "Bebob", "Latin",
        "Revival", "Celtic", "Bluegrass", "Avantgarde", "Gothic Rock",
        "Progress. Rock", "Psychadel. Rock", "Symphonic Rock", "Slow Rock",
        "Big Band", "Chorus", "Easy Listening", "Acoustic", "Humour",
        "Speech", "Chanson", "Opera", "Chamber Music", "Sonata", "Symphony",
        "Booty Bass", "Primus", "Porn Groove", "Satire", "Slow Jam",
        "Club", "Tango", "Samba", "Folklore", "Ballad", "Power Ballad",
        "Rhythmic Soul", "Freestyle", "Duet", "Punk Rock", "Drum Solo",
        "A Capella", "Euro-House", "Dance Hall", "Goa", "Drum & Bass",
        "Club-House", "Hardcore", "Terror", "Indie", "BritPop", "Negerpunk",
        "Polsk Punk", "Beat", "Christian Gangsta Rap", "Heavy Metal",
        "Black Metal", "Crossover", "Contemporary Christian", "Christian Rock",
        "Merengue", "Salsa", "Thrash Metal", "Anime", "Jpop", "Synthpop"
        ]

    def __init__(self, filename):
        """Open `filename` and load its ID3v1 tag, if it has one.

        Raises InvalidTagError when the file cannot be opened, is too short
        to seek 128 bytes back from its end, or cannot be read.
        """
        self.filename = filename
        self.delete_tag = 0

        self.zero()
        # zero() went through __setattr__, which flags the object as modified and
        # tagged; reset both now that the defaults are in place.
        self.modified = 0
        self.has_tag = 0
        self.had_tag = 0

        try:
            self.file = open(filename, 'rb')
        except IOError as msg:
            self.modified = 0
            raise InvalidTagError("Can't open %s: %s" % (filename, msg))

        try:
            try:
                self.file.seek(-128, 2)
            except IOError as msg:
                self.modified = 0
                raise InvalidTagError("Can't open %s: %s" % (filename, msg))

            try:
                if self.file.read(3) == 'TAG':
                    self.has_tag = 1
                    self.had_tag = 1
                    self.title = self.file.read(30)
                    self.artist = self.file.read(30)
                    self.album = self.file.read(30)
                    self.year = self.file.read(4)
                    self.comment = self.file.read(30)

                    # ID3 v1.1: a NUL followed by a non-NUL byte at the end of the
                    # comment field means the last byte is the track number.
                    if ord(self.comment[-2]) == 0 and ord(self.comment[-1]) != 0:
                        self.track = ord(self.comment[-1])
                        self.comment = self.comment[:-2]
                    else:
                        self.track = None

                    self.genre = ord(self.file.read(1))

                    self.title = strip_padding(self.title)
                    self.artist = strip_padding(self.artist)
                    self.album = strip_padding(self.album)
                    self.year = strip_padding(self.year)
                    self.comment = strip_padding(self.comment)
            except IOError as msg:
                self.modified = 0
                raise InvalidTagError("Invalid ID3 tag in %s: %s" % (filename, msg))
        finally:
            # Release the handle on every path, including "no tag" and errors.
            self.file.close()

        # Loading the fields flagged the object as modified; it is not.
        self.modified = 0

    def delete(self):
        """Clear all fields and flag the tag for removal on the next write()."""
        self.zero()
        self.delete_tag = 1
        self.has_tag = 0

    def zero(self):
        """Reset every tag field to its empty default."""
        self.title = ''
        self.artist = ''
        self.album = ''
        self.year = ''
        self.comment = ''
        self.track = None
        self.genre = 0

    def find_genre(self, genre_to_find):
        """Return the index of `genre_to_find` in ID3.genres (case-insensitive), or -1."""
        find_me = genre_to_find.lower()
        for index, genre in enumerate(self.genres):
            if genre.lower() == find_me:
                return index
        return -1

    def write(self):
        """Write the tag back to the file if any field changed.

        Truncates the file at the old tag when the tag was deleted, otherwise
        overwrites the old tag or appends a new one. Raises InvalidTagError
        on I/O failure; `modified` is cleared only on success.
        """
        if not self.modified:
            return

        try:
            self.file = open(self.filename, 'rb+')
            try:
                if self.had_tag:
                    self.file.seek(-128, 2)
                else:
                    self.file.seek(0, 2)  # a new tag is appended at the end

                if self.delete_tag and self.had_tag:
                    self.file.truncate()
                    self.had_tag = 0
                elif self.has_tag:
                    # The file is already positioned by the seek above.
                    self.file.write('TAG')
                    self.file.write(lengthen(self.title, 30))
                    self.file.write(lengthen(self.artist, 30))
                    self.file.write(lengthen(self.album, 30))
                    self.file.write(lengthen(self.year, 4))

                    comment = lengthen(self.comment, 30)

                    # A track number outside one byte cannot be stored.
                    if self.track is not None and not (0 <= self.track <= 255):
                        self.track = None

                    if self.track is not None:
                        comment = comment[:-2] + "\0" + chr(self.track)

                    self.file.write(comment)

                    # A missing or out-of-range genre is stored as 255.
                    if self.genre is None or not (0 <= self.genre <= 255):
                        self.genre = 255
                    self.file.write(chr(self.genre))
                    self.had_tag = 1
            finally:
                self.file.close()
        except IOError as msg:
            raise InvalidTagError("Cannot write modified ID3 tag to %s: %s" % (self.filename, msg))
        else:
            self.modified = 0

    def __del__(self):
        self.write()

    def __str__(self):
        if self.has_tag:
            # Index 0 ("Blues") is a valid genre, so the lower bound is inclusive.
            if self.genre is not None and 0 <= self.genre < len(self.genres):
                genre = self.genres[self.genre]
            else:
                genre = 'Unknown'

            if self.track is not None:
                track = str(self.track)
            else:
                track = 'Unknown'

            return "File : %s\nTitle : %-30.30s Artist: %-30.30s\nAlbum : %-30.30s Track : %s Year: %-4.4s\nComment: %-30.30s Genre : %s (%i)" % (self.filename, self.title, self.artist, self.album, track, self.year, self.comment, genre, self.genre)
        else:
            return "%s: No ID3 tag." % self.filename

    # intercept setting of attributes to set self.modified
    def __setattr__(self, name, value):
        if name in ['title', 'artist', 'album', 'year', 'comment',
                    'track', 'genre']:
            self.__dict__['modified'] = 1
            self.__dict__['has_tag'] = 1
        self.__dict__[name] = value
import sys
# Command-line use: print the artist, album and title read from the file
# named by the first argument.
if __name__ == '__main__':
    id3 = ID3(sys.argv[1])
    print id3.artist, id3.album, id3.title
@@ -0,0 +1,267 @@
#!/usr/bin/env python
# id3v2.py Version 0.1 (work still in progress)
# $Header: /mnt/haven/Source/Cleaners/ID3v2.py,v 1.4 2004/01/23 08:02:28 elan Exp $
#
# This takes a list of mp3 filenames and spits out an alternative
# filename based on the files id3v2 tag.
# There is another script id3.py which uses the v1 tag.
# I'll integrate it in when I get the time. Shouldn't be too hard
# but I'm often stupidly optimistic like this.
#
# This script only reads id3v2 tags.
# I suppose next enhancement might be to write id3v2 tags.
# and maybe some interaction with CDDB.
# Hmmmm, might need to next write some code to calculate the CDDB id.
#
# Would be nice if u sent me any enhancements/suggestions.
# mailto:calcium@altavista.net
# http://www.ozemail.com.au/~calcium
#
# -----------------
# TODO
# -----------------
# Handle extended headers properly
# Ability to create id3v2 tags.
# Make it more robust.
#
# -----------------
# The documentation.
# -----------------
# I doubt this will be of much use to anyone apart from curiosity value.
# I guess if u want to enhance it to handle additional tags, u'll need
# to write a function called "processXXXX" where XXXX is the frameId.
#
# I also suspect u'll need to have the id3v2 spec to make sense of
# some of the code.
# See http://www.id3.org
# See http://www.python.org
# See http://www.jython.org
# That's it.
#
# Ciao,
# Chai in Melbourne, Australia.
#
import sys
import string
import struct
from UnicodeHelper import fixEncoding
_encodings = ['iso8859-1', 'utf-16', 'utf-16be', 'utf-8']
#
# This gets the id3v2 tag from the file specified.
#
class ID3v2:
    """Minimal read-only ID3v2 parser.

    The constructor reads the tag at the start of `filename` and hands each
    frame to the matching process<FrameId> method; frames without such a
    method are skipped. `language` is passed through to fixEncoding().
    """

    def __init__(self, filename, language=None):
        self.artist = ''
        self.album = ''
        self.title = ''
        self.year = ''
        self.filename = filename
        self.ok = 0
        self.track = None
        self.TPE2 = None
        self.disk = None  # None unless album has multiple disks
        self.language = language
        # Defined up-front so getVersion()/getFlags() also work for files
        # that do not start with an ID3v2 header.
        self.version = None
        self.flags = None

        f = open(self.filename, 'rb')
        try:
            self._parse(f, filename)
        finally:
            # Close the file on every exit path, including the early returns.
            f.close()

    def _parse(self, f, filename):
        """Read the tag header from `f`, then dispatch every frame to its handler."""
        # The header
        self.header = f.read(3)
        if self.header != "ID3":
            return

        # The version is the next 2 bytes; only the first one is kept.
        self.version = struct.unpack('>bb', f.read(2))[0]

        # The flags. See the id3 v2 spec for details. Am ignoring it.
        self.flags = f.read(1)

        # I guess I shouldn't ignore the flags but couldn't find any test data.
        if ord(self.flags) != 0:
            print("Hey! There is an extended header present in %s" % filename)

        # The id3 Tag Size.
        b1, b2, b3, b4 = struct.unpack('>bbbb', f.read(4))
        id3Size = self.syncSafeInt(b1, b2, b3, b4)

        # TODO: extended headers are not parsed.

        # Reading in the id3 frames
        while True:
            # Assume that the id3size specified in the header is correct.
            if f.tell() >= id3Size:
                break

            if self.version == 2:
                self.frameId = f.read(3)
                # Unsigned bytes: a signed unpack turns any size byte >= 0x80
                # into a negative number and corrupts the frame size.
                size = struct.unpack('>BBB', f.read(3))
                self.frameSize = [size[0]*256*256 + size[1]*256 + size[2]]
            else:
                self.frameId = f.read(4)
                self.frameSize = struct.unpack('>l', f.read(4))

            # in case the id3 size is wrong, break anyway.
            if self.frameSize[0] == 0:
                break

            if self.version > 2:
                # read the frame header flags
                self.frameFlags = f.read(2)
            else:
                self.frameFlags = 0

            blkSize = self.frameSize[0]
            if blkSize < 0:
                break

            if blkSize > 1000000:
                print("Too many bytes (%d) in '%s', aborting read" % (blkSize, filename))
                return

            try:
                self.data = f.read(blkSize)
            except Exception:
                print("Error reading %d bytes in %s." % (blkSize, filename))
                break

            # Look the handler up by name instead of exec'ing a statement built
            # from bytes read out of the file.
            frame_name = self.frameId.replace(' ', '')
            pStr = "self.process" + frame_name \
                + "( self.frameId, self.frameFlags, self.data )"
            if not frame_name.isalnum():
                # Not a plausible frame id: stop parsing.
                print("Warning: strange ID3v2 tag in %s" % filename)
                print(pStr)
                break

            try:
                handler = getattr(self, "process" + frame_name)
                handler(self.frameId, self.frameFlags, self.data)
                self.ok = 1
            except AttributeError:
                # No handler implemented for this frame type.
                continue
            except Exception:
                print("Warning: strange ID3v2 tag in %s" % filename)
                print(pStr)
                break

    #
    # Gets the filename
    #
    def getFilename(self):
        return self.filename

    #
    # A guess as to whether file interrogation succeeded
    #
    def isOK(self):
        return self.ok

    #
    # Gets the version (None when the file has no ID3v2 header)
    #
    def getVersion(self):
        return self.version

    #
    # Gets the flags (None when the file has no ID3v2 header)
    #
    def getFlags(self):
        return self.flags

    #
    # Sets the album name
    #
    def processTALB(self, theString, theFlags, theValue):
        self.album = fixEncoding(theValue, self.language)

    def processTAL(self, theString, theFlags, theValue):
        self.processTALB(theString, theFlags, theValue)

    def getAlbum(self):
        return self.album

    #
    # Sets the artist name
    #
    def processTPE1(self, theString, theFlags, theValue):
        self.artist = fixEncoding(theValue, self.language)

    def processTP1(self, s, f, v):
        self.processTPE1(s, f, v)

    #
    # Sets the TPE2
    #
    def processTPE2(self, theString, theFlags, theValue):
        self.TPE2 = fixEncoding(theValue, self.language)

    #
    # Sets the disk; "1/1" is ignored so self.disk stays None.
    #
    def processTPOS(self, theString, theFlags, theValue):
        TPOS = fixEncoding(theValue, self.language)
        if TPOS == '1/1':
            return
        try:
            self.disk = int(TPOS.split('/')[0])
        except ValueError:
            pass  # not a number: leave self.disk untouched

    def getArtist(self):
        return self.artist

    #
    # Sets the year.
    #
    def processTYER(self, theString, theFlags, theValue):
        self.year = fixEncoding(theValue, self.language)

    def processTYE(self, s, f, v):
        self.processTYER(s, f, v)

    #
    # Sets the track (the part before an optional "/total")
    #
    def processTRCK(self, s, f, v):
        track = fixEncoding(v, self.language)
        slash = track.find('/')
        if slash != -1:
            track = track[0:slash]
        self.track = int(track)

    def processTRK(self, s, f, v):
        self.processTRCK(s, f, v)

    #
    # Sets the title track name
    #
    def processTIT2(self, theString, theFlags, theValue):
        self.title = fixEncoding(theValue, self.language)

    def processTT2(self, theString, theFlags, theValue):
        self.processTIT2(theString, theFlags, theValue)

    def getSong(self):
        # Used to return the undefined name `title.song`, which raised NameError.
        return self.title

    def syncSafeInt(self, b1, b2, b3, b4):
        """Combine four bytes, seven bits apart, into one integer (b1 most significant)."""
        return (b4 & 0xff) \
            + ((b3 & 0xff) << 7) \
            + ((b2 & 0xff) << 14) \
            + ((b1 & 0xff) << 21)
import sys
# Command-line use: print the artist, album and title read from the file
# named by the first argument.
if __name__ == '__main__':
    id3 = ID3v2(sys.argv[1])
    print id3.artist, id3.album, id3.title
@@ -0,0 +1,19 @@
import string
_encodings = ['iso8859-1', 'utf-16', 'utf-16be', 'utf-8']
def fixEncoding(theString, language=None):
encoding = ord(theString[0])
if 0 <= encoding < len(_encodings):
# If we're dealing with a particular language, we might want to try another code page.
if encoding == 0 and language == 'ko':
value = theString[1:].decode('cp949').encode('utf-8')
else:
value = theString[1:].decode(_encodings[encoding]).encode("utf-8")
else:
value = theString
if value:
value = value.strip('\0')
return value
@@ -0,0 +1,11 @@
Metadata-Version: 1.0
Name: mp4file
Version: 0.2
Summary: Library for rudimentary parsing of mp4 atoms, especially metadata atoms.
Home-page: UNKNOWN
Author: Bill Napier
Author-email: napier@pobox.com
License: PSF
Description: UNKNOWN
Keywords: mp4 atom quicktime
Platform: UNKNOWN
@@ -0,0 +1,232 @@
'''
Created on Dec 6, 2009
@author: napier
'''
#import logging
import os
import struct
from atomsearch import find_path, findall_path
#log = logging.getLogger("mp4file")
class EndOFFile(Exception):
    """Raised by the read helpers when the file ends before enough bytes were read."""
    # The former `__init_` method (one trailing underscore) was never invoked as
    # the constructor; Exception's own __init__ is all that is needed.
def _read_unpacked(file, fmt):
    """Read exactly struct.calcsize(fmt) bytes from `file` and unpack a single value.

    Raises EndOFFile when fewer bytes than required are available.
    """
    wanted = struct.calcsize(fmt)
    data = file.read(wanted)
    if data is None or len(data) != wanted:
        raise EndOFFile()
    return struct.unpack(fmt, data)[0]


def read64(file):
    """Read a big-endian unsigned 64-bit integer from `file`."""
    return _read_unpacked(file, ">Q")


def read32(file):
    """Read a big-endian unsigned 32-bit integer from `file`."""
    return _read_unpacked(file, ">I")


def read16(file):
    """Read a big-endian unsigned 16-bit integer from `file`."""
    return _read_unpacked(file, ">H")


def read8(file):
    """Read an unsigned 8-bit integer from `file`."""
    return _read_unpacked(file, ">B")
def type_to_str(data):
    """Render a 32-bit type code as four characters, most significant byte first."""
    octets = [(data >> shift) & 0xff for shift in (24, 16, 8, 0)]
    return ''.join([chr(octet) for octet in octets])
def parse_atom(file):
    """Parse the atom that starts at the current position of `file`.

    Reads the 32-bit size and the 32-bit type; a size of 1 means the real
    size follows as a 64-bit value. Returns the object built by
    create_atom(), or None when the file ends during parsing.
    """
    try:
        start = file.tell()
        atom_size = read32(file)
        atom_type = type_to_str(read32(file))
        if atom_size == 1:
            atom_size = read64(file)
        # Kept inside the try block: atom constructors read from the file too
        # and may run into the end of it.
        return create_atom(atom_size, atom_type, start, file)
    except EndOFFile:
        return None
# Maps an atom type code to the readable name create_atom() passes on as the
# atom's `name`.
# NOTE(review): 'cptr' looks like a transposition of the copyright code 'cprt'
# and 'purcahsedate' like a misspelling of 'purchasedate'; confirm against the
# callers before changing either, since lookups by name may rely on them.
ATOM_TYPE_MAP = { '\xa9too': 'encoder',
                  '\xa9nam': 'title',
                  '\xa9alb': 'album',
                  '\xa9ART': 'artist',
                  '\xa9art': 'artist',
                  '\xa9cmt': 'comment',
                  '\xa9gen': 'genre',
                  'gnre': 'genre',
                  '\xa9day': 'year',
                  'trkn': 'tracknum',
                  'disk': 'disknum',
                  '\xa9wrt': 'composer',
                  'tmpo': 'bpm',
                  'cptr': 'copyright',
                  'cpil': 'compilation',
                  'covr': 'coverart',
                  'rtng': 'rating',
                  '\xa9grp': 'grouping',
                  'pcst': 'podcast',
                  'catg': 'category',
                  'keyw': 'keyword',
                  'purl': 'podcasturl',
                  'egid': 'episodeguid',
                  'desc': 'description',
                  'ldes': 'long_description',
                  '\xa9lyr': 'lyrics',
                  'tvnn': 'tvnetwork',
                  'tvsh': 'tvshow',
                  'tven': 'tvepisodenum',
                  'tvsn': 'tvseason',
                  'tves': 'tvepisode',
                  'purd': 'purcahsedate',
                  'pgap': 'gapless',
                  }
# There are a lot of atoms with children. No need to create
# special classes for all of them: create_atom() builds a generic
# AtomWithChildren for every type listed here.
# NOTE(review): 'cptr' mirrors the key used in ATOM_TYPE_MAP; if that one is a
# typo for 'cprt', this entry needs the same correction.
ATOM_WITH_CHILDREN = [ 'stik', 'moov', 'trak',
                       'udta', 'ilst', '\xa9too',
                       '\xa9nam', '\xa9alb', '\xa9ART', '\xa9art',
                       '\xa9cmt', '\xa9gen', 'gnre',
                       '\xa9day', 'trkn', 'disk',
                       '\xa9wrt', 'tmpo', 'cptr',
                       'cpil', 'covr', 'rtng',
                       '\xa9grp', 'pcst', 'catg',
                       'keyw', 'purl', 'egid',
                       'desc', 'ldes', '\xa9lyr', 'tvnn',
                       'tvsh', 'tven', 'tvsn',
                       'tves', 'purd', 'pgap',
                       ]
def create_atom(size, type, offset, file):
    """Instantiate the most specific atom class available for `type`.

    Types listed in ATOM_WITH_CHILDREN become AtomWithChildren. Otherwise a
    module-level Atom subclass named after the (remapped or lower-cased)
    type is used when one exists, and the generic Atom when not.
    """
    # Possibly remap atom types that aren't valid python variable names
    clz = ATOM_TYPE_MAP.get(type, type.lower())

    if type in ATOM_WITH_CHILDREN:
        return AtomWithChildren(size, type, clz, offset, file)

    # The name comes from the file being parsed, so resolve it with a plain
    # lookup restricted to Atom subclasses rather than eval()'ing it.
    candidate = globals().get(clz)
    try:
        is_atom_class = issubclass(candidate, Atom)
    except TypeError:
        # Not a class at all (no such name, or some other module-level object).
        is_atom_class = False

    if is_atom_class:
        try:
            return candidate(size, type, clz, offset, file)
        except (NameError, TypeError):
            # Same fallback the eval-based version had: use the generic Atom.
            pass

    return Atom(size, type, clz, offset, file)
def parse_atoms(file, maxFileOffset):
    """Parse consecutive atoms from `file` until `maxFileOffset` is reached.

    Stops early when an atom cannot be parsed or reports a size of 0.
    Returns the list of parsed atoms.
    """
    found = []
    while True:
        if file.tell() >= maxFileOffset:
            break
        current = parse_atom(file)
        if not current or current.size == 0:
            break
        found.append(current)
        # Jump to the end of the atom, wherever its constructor left the file.
        file.seek(current.offset + current.size, os.SEEK_SET)
    return found
class Atom(object):
    """Generic atom: size, type, name, file offset, plus child atoms and attributes."""

    def __init__(self, size, type, name, offset, file):
        self.size, self.type, self.name = size, type, name
        self.offset, self.file = offset, file
        self.children = []
        self.attrs = {}

    def _set_attr(self, key, value):
        """Store an attribute value under `key`."""
        self.attrs[key] = value

    def _set_children(self, children):
        """Adopt `children`, pointing each child's `parent` back at this atom."""
        for kid in children:
            kid.parent = self
        self.children = children

    def get_attribute(self, key):
        """Return the attribute stored under `key`; raises KeyError if absent."""
        return self.attrs[key]

    def get_atoms(self):
        """Return the list of direct child atoms."""
        return self.children

    def find(self, path):
        """Delegate to find_path() with this atom as the starting point."""
        return find_path(self, path)

    def findall(self, path):
        """Delegate to findall_path() with this atom as the starting point."""
        return findall_path(self, path)
class AtomWithChildren(Atom):
    """Atom whose payload is parsed as a sequence of child atoms."""

    def __init__(self, size, type, name, offset, file):
        super(AtomWithChildren, self).__init__(size, type, name, offset, file)
        # Children are parsed from the current file position up to this atom's end.
        kids = parse_atoms(file, offset + size)
        self._set_children(kids)
class ftyp(Atom):
    """'ftyp' atom: exposes the 'major_version' and 'minor_version' attributes."""

    def __init__(self, size, type, name, offset, file):
        super(ftyp, self).__init__(size, type, name, offset, file)
        # First word is a four-character code, second word a plain integer.
        major = type_to_str(read32(file))
        self._set_attr('major_version', major)
        minor = read32(file)
        self._set_attr('minor_version', minor)
class meta(Atom):
    """'meta' atom: skips one 32-bit word, then parses child atoms."""

    def __init__(self, size, type, name, offset, file):
        super(meta, self).__init__(size, type, name, offset, file)
        # meta has an extra null after the atom header. consume it here
        read32(file)
        kids = parse_atoms(file, offset + size)
        self._set_children(kids)
class data(Atom):
    """'data' atom: decodes the payload by its type code and stores it as attribute "data".

    After construction `self.type` holds the numeric payload type (the low
    24 bits of the word that follows the atom header) rather than the atom
    type string passed in.
    """

    def __init__(self, size, type, name, offset, file):
        super(data, self).__init__(size, type, name, offset, file)
        # Mask off the version field
        self.type = read32(file) & 0xFFFFFF
        kind = self.type

        if kind == 1:
            # Text payload.
            self._set_attr("data", self.parse_string())
        elif kind in (21, 0):
            # Another random null padding
            read32(self.file)
            number = read32(self.file)
            # If only the top byte of the word is populated (top byte set,
            # low byte clear), reduce the value to that byte.
            if (number & 0xff000000) != 0 and (number & 0xff) == 0:
                number = (number & 0xff000000) >> 24
            self._set_attr("data", number)
        elif kind in (13, 14):
            # Another random null padding
            read32(self.file)
            # Raw payload: everything after the 16 bytes consumed so far.
            self._set_attr("data", self.file.read(self.size - 16))
        elif kind == 22:
            # uint8.
            read32(self.file)
            self._set_attr("data", read8(self.file))
        else:
            print("UNKNOWN TYPE %s" % (self.type,))

    def parse_string(self):
        """Read the rest of the atom as UTF-8 text, ignoring undecodable bytes."""
        # consume extra null?
        read32(self.file)
        remaining = self.size - 16
        raw = self.file.read(remaining)
        return raw.decode("utf-8", "ignore")
@@ -0,0 +1,59 @@
'''
Created on Dec 26, 2009
@author: napier
'''
import re
def path_compare(path, pattern):
    """Return True when `path` matches `pattern`.

    `pattern` is a '/'-separated path in which '*' stands for exactly one
    path component and '//' for any number of intermediate components. All
    other characters are literal and the whole path has to match, so '*/gc'
    no longer matches '.../gcx' and '.' no longer matches any character.
    """
    # Handle the simple case
    if '*' not in pattern and '//' not in pattern:
        return path == pattern

    # Convert pattern into regexp, escaping everything that is not a wildcard.
    tokens = [token for token in re.split(r'(//|\*)', pattern) if token]
    pieces = []
    for position, token in enumerate(tokens):
        if token == '*':
            pieces.append('[^/]+')
        elif token == '//':
            if position + 1 < len(tokens):
                # Zero or more whole components between the two sides.
                pieces.append('/(?:.*/)?')
            else:
                # A trailing '//' matches anything below the prefix.
                pieces.append('/.+')
        else:
            pieces.append(re.escape(token))

    return re.match(''.join(pieces) + '$', path) is not None
def find_path(atom, findpath):
    """Return the first descendant of `atom` matching `findpath`, or None.

    '.' selects `atom` itself. A path that does not start with '.' is
    treated as relative to `atom` ('./' is prepended).
    """
    if findpath == '.':
        return atom
    if findpath[0] != '.':
        findpath = './' + findpath
    for kid in atom.children:
        hit = find_path_helper(kid, findpath, '.', '.')
        if hit:
            return hit
    return None
def find_path_helper(atom, findpath,
                     typepath, namepath,
                     all=False):
    """Depth-first search below `atom` for atoms whose path matches `findpath`.

    `typepath` and `namepath` are the paths of the parent, built from atom
    types and atom names respectively; an atom matches when either of its
    own paths matches. With `all` false the first match (or None) is
    returned; with `all` true a list of matches is returned. The search
    does not descend below an atom that matched.
    """
    typepath = typepath + '/' + str(atom.type)
    namepath = namepath + '/' + atom.name

    if path_compare(typepath, findpath) or path_compare(namepath, findpath):
        return [atom] if all else atom

    if all:
        collected = []
        for kid in atom.children:
            collected += find_path_helper(kid, findpath, typepath, namepath, all)
        return collected

    for kid in atom.children:
        hit = find_path_helper(kid, findpath, typepath, namepath, all)
        if hit:
            return hit
    return None
def findall_path(atom, findpath):
    """Return a list of every atom below ``atom`` matching ``findpath``.

    ``findpath`` of ``'.'`` yields a one-element list holding ``atom``
    itself, so the result is always a list.  A path that does not start
    with ``'.'`` is treated as relative and gets ``'./'`` prepended.
    """
    if findpath == '.':
        # Keep the return type uniform: callers always get a list.
        return [atom]
    # startswith() also copes with an empty path, where indexing
    # findpath[0] used to raise IndexError.
    if not findpath.startswith('.'):
        findpath = './' + findpath
    all_res = []
    for child in atom.children:
        all_res += find_path_helper(child, findpath, '.', '.', True)
    return all_res
@@ -0,0 +1,54 @@
'''
Created on Dec 26, 2009
@author: napier
'''
from atomsearch import find_path, findall_path
import unittest
class FakeAtom(object):
    """Minimal stand-in for an atom: a name, a type and a child list."""

    def __init__(self, name, type):
        self.name, self.type = name, type
        # Each instance gets its own (initially empty) child list.
        self.children = list()

    def get_atoms(self):
        """Return the list of child atoms."""
        return self.children
class Test(unittest.TestCase):
    """Tests for find_path / findall_path on a small fake atom tree.

    Tree built in setUp (name/type):

        root/root
          child1/child1
            grandchild/gc
            granchild/gc
          child2/child2
    """

    def setUp(self):
        self.mp4 = FakeAtom('root', 'root')
        self.mp4.children = [FakeAtom('child1', 'child1'),
                             FakeAtom('child2', 'child2')]
        child1 = self.mp4.children[0]
        child1.children = [FakeAtom('grandchild', 'gc'),
                           FakeAtom('granchild', 'gc')]

    def testFindSelf(self):
        root = find_path(self.mp4, '.')
        self.assertEqual(root.type, 'root')

    def testFindStar(self):
        gc1 = find_path(self.mp4, '*/gc')
        self.assertNotEqual(None, gc1)

    def testFindGc1(self):
        # The same atom is reachable by type path, by name path, and by
        # a relative path without the leading './'.
        gc1 = find_path(self.mp4, './child1/gc')
        self.assertNotEqual(None, gc1)
        gc2 = find_path(self.mp4, './child1/grandchild')
        self.assertNotEqual(None, gc2)
        gc3 = find_path(self.mp4, 'child1/grandchild')
        self.assertNotEqual(None, gc3)
        self.assertEqual(gc1.type, gc2.type)
        self.assertEqual(gc2.type, gc3.type)

    def testFindall(self):
        res = findall_path(self.mp4, './child1/gc')
        self.assertEqual(2, len(res))
        found = findall_path(self.mp4, './/gc')
        self.assertEqual(2, len(found))


if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
@@ -0,0 +1,23 @@
'''
Created on Dec 6, 2009
@author: napier
'''
from atom import parse_atoms, AtomWithChildren
#import logging
import os
#log = logging.getLogger("mp4file")
def getFileSize(file):
    """Return the size of ``file`` in bytes.

    The size is found by seeking to the end.  Note the side effect: the
    file is left positioned at offset 0 afterwards, not at the position
    it had on entry.
    """
    file.seek(0, os.SEEK_END)
    size_in_bytes = file.tell()
    # Rewind to the very start of the file.
    file.seek(0, os.SEEK_SET)
    return size_in_bytes
class Mp4File(AtomWithChildren):
    """Root atom representing a whole MP4 file opened from disk."""

    def __init__(self, filename):
        # The handle is deliberately left open: it is handed to
        # parse_atoms and to the base class below.
        handle = open(filename, "rb")
        self.atoms = parse_atoms(handle, getFileSize(handle))
        # Measured again after parsing; getFileSize also rewinds the
        # handle to offset 0 as a side effect.
        total_size = getFileSize(handle)
        AtomWithChildren.__init__(self, total_size, '', '', 0, handle)
@@ -0,0 +1,278 @@
# mutagen aims to be an all purpose media tagging library
# Copyright (C) 2005 Michael Urman
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""Mutagen aims to be an all purpose multimedia tagging library.
::
import mutagen.[format]
metadata = mutagen.[format].Open(filename)
`metadata` acts like a dictionary of tags in the file. Tags are generally a
list of string-like values, but may have additional methods available
depending on tag or format. They may also be entirely different objects
for certain keys, again depending on format.
"""
version = (1, 24)
"""Version tuple."""
version_string = ".".join(map(str, version))
"""Version string."""
import warnings
import mutagen._util
class Metadata(object):
    """An abstract dict-like object.

    Metadata is the base class for many of the tag objects in Mutagen.
    """

    def __init__(self, *args, **kwargs):
        # Without any arguments nothing is loaded.
        if not (args or kwargs):
            return
        self.load(*args, **kwargs)

    def load(self, *args, **kwargs):
        """Load tag data; must be provided by subclasses."""
        raise NotImplementedError

    def save(self, filename=None):
        """Save changes to a file."""
        raise NotImplementedError

    def delete(self, filename=None):
        """Remove tags from a file."""
        raise NotImplementedError
class FileType(mutagen._util.DictMixin):
    """An abstract object wrapping tags and audio stream information.

    Attributes:

    * info -- stream information (length, bitrate, sample rate)
    * tags -- metadata tags, if any

    Each file format has different potential tags and stream
    information.

    FileTypes implement an interface very similar to Metadata; the
    dict interface, save, load, and delete calls on a FileType call
    the appropriate methods on its tag data.
    """

    info = None
    tags = None
    filename = None
    _mimes = ["application/octet-stream"]

    def __init__(self, filename=None, *args, **kwargs):
        if filename is not None:
            self.load(filename, *args, **kwargs)
            return
        warnings.warn("FileType constructor requires a filename",
                      DeprecationWarning)

    def load(self, filename, *args, **kwargs):
        """Load the file; must be provided by subclasses."""
        raise NotImplementedError

    def __getitem__(self, key):
        """Look up a metadata tag key.

        If the file has no tags at all, a KeyError is raised.
        """
        if self.tags is not None:
            return self.tags[key]
        raise KeyError(key)

    def __setitem__(self, key, value):
        """Set a metadata tag.

        If the file has no tags, an appropriate format is added (but
        not written until save is called).
        """
        if self.tags is None:
            self.add_tags()
        self.tags[key] = value

    def __delitem__(self, key):
        """Delete a metadata tag key.

        If the file has no tags at all, a KeyError is raised.
        """
        if self.tags is not None:
            del(self.tags[key])
            return
        raise KeyError(key)

    def keys(self):
        """Return a list of keys in the metadata tag.

        If the file has no tags at all, an empty list is returned.
        """
        if self.tags is not None:
            return self.tags.keys()
        return []

    def delete(self, filename=None):
        """Remove tags from a file.

        Does nothing (and returns None) when the file has no tags.
        """
        if self.tags is None:
            return None
        if filename is not None:
            warnings.warn(
                "delete(filename=...) is deprecated, reload the file",
                DeprecationWarning)
        else:
            filename = self.filename
        return self.tags.delete(filename)

    def save(self, filename=None, **kwargs):
        """Save metadata tags.

        Raises ValueError when the file has no tags.
        """
        if filename is not None:
            warnings.warn(
                "save(filename=...) is deprecated, reload the file",
                DeprecationWarning)
        else:
            filename = self.filename
        if self.tags is None:
            raise ValueError("no tags in file")
        return self.tags.save(filename, **kwargs)

    def pprint(self):
        """Print stream information and comment key=value pairs."""
        stream = "%s (%s)" % (self.info.pprint(), self.mime[0])
        try:
            tags = self.tags.pprint()
        except AttributeError:
            # No tags object (or one without pprint): stream info only.
            return stream
        if tags:
            return stream + ("\n" + tags)
        return stream + ""

    def add_tags(self):
        """Adds new tags to the file.

        Raises if tags already exist.
        """
        raise NotImplementedError

    @property
    def mime(self):
        """A list of mime types"""
        collected = []
        # Walk the class hierarchy so subclasses' types come first.
        for klass in type(self).__mro__:
            for candidate in getattr(klass, '_mimes', []):
                if candidate in collected:
                    continue
                collected.append(candidate)
        return collected

    @staticmethod
    def score(filename, fileobj, header):
        """Rate how likely the file is of this type; subclasses implement."""
        raise NotImplementedError
class StreamInfo(object):
    """Abstract stream information object.

    Provides attributes for length, bitrate, sample rate etc.

    See the implementations for details.
    """

    def pprint(self):
        """Print stream information"""
        raise NotImplementedError()
def File(filename, options=None, easy=False):
    """Guess the type of the file and try to open it.

    The file type is decided by several things, such as the first 128
    bytes (which usually contains a file type identifier), the
    filename extension, and the presence of existing tags.

    If no appropriate type could be found, None is returned.

    :param options: Sequence of :class:`FileType` implementations, defaults to
                    all included ones.

    :param easy: If the easy wrappers should be returned if available.
                 For example :class:`EasyMP3 <mp3.EasyMP3>` instead
                 of :class:`MP3 <mp3.MP3>`.
    """
    if options is None:
        # Import order is kept exactly as it always was.
        from mutagen.asf import ASF
        from mutagen.apev2 import APEv2File
        from mutagen.flac import FLAC
        if easy:
            from mutagen.easyid3 import EasyID3FileType as ID3FileType
        else:
            from mutagen.id3 import ID3FileType
        if easy:
            from mutagen.mp3 import EasyMP3 as MP3
        else:
            from mutagen.mp3 import MP3
        from mutagen.oggflac import OggFLAC
        from mutagen.oggspeex import OggSpeex
        from mutagen.oggtheora import OggTheora
        from mutagen.oggvorbis import OggVorbis
        from mutagen.oggopus import OggOpus
        if easy:
            from mutagen.trueaudio import EasyTrueAudio as TrueAudio
        else:
            from mutagen.trueaudio import TrueAudio
        from mutagen.wavpack import WavPack
        if easy:
            from mutagen.easymp4 import EasyMP4 as MP4
        else:
            from mutagen.mp4 import MP4
        from mutagen.musepack import Musepack
        from mutagen.monkeysaudio import MonkeysAudio
        from mutagen.optimfrog import OptimFROG
        from mutagen.aiff import AIFF
        options = [MP3, TrueAudio, OggTheora, OggSpeex, OggVorbis, OggFLAC,
                   FLAC, AIFF, APEv2File, MP4, ID3FileType, WavPack,
                   Musepack, MonkeysAudio, OptimFROG, ASF, OggOpus]

    if not options:
        return None

    with open(filename, "rb") as fileobj:
        header = fileobj.read(128)
        # Sort by name after score. Otherwise import order affects
        # Kind sort order, which affects treatment of things with
        # equals scores.
        scored = [(Kind.score(filename, fileobj, header), Kind.__name__)
                  for Kind in options]

    ranked = sorted(zip(scored, options))
    # The best candidate sorts last.
    (score, name), Kind = ranked[-1]
    if score > 0:
        return Kind(filename)
    return None
@@ -0,0 +1,82 @@
# Copyright 2013 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
import sys
# Python 2 / Python 3 compatibility names.  Both branches define the same
# set of names so the rest of the package can import them unconditionally.
PY2 = sys.version_info[0] == 2
PY3 = not PY2

if PY2:
    from StringIO import StringIO
    BytesIO = StringIO
    from cStringIO import StringIO as cBytesIO

    long_ = long
    integer_types = (int, long)
    string_types = (str, unicode)
    text_type = unicode

    # Re-export the builtins under the same names as the PY3 branch.
    xrange = xrange
    cmp = cmp
    chr_ = chr

    def endswith(text, end):
        return text.endswith(end)

    iteritems = lambda d: d.iteritems()
    itervalues = lambda d: d.itervalues()
    iterkeys = lambda d: d.iterkeys()

    iterbytes = lambda b: iter(b)

    # The three-argument raise is a syntax error on Python 3, so it is
    # hidden inside exec().
    exec("def reraise(tp, value, tb):\n raise tp, value, tb")

    def swap_to_string(cls):
        """Class decorator: expose __str__ as __unicode__ and __bytes__
        as __str__ (when the class defines them)."""
        if hasattr(cls, '__str__'):
            cls.__unicode__ = cls.__str__

        if hasattr(cls, '__bytes__'):
            cls.__str__ = cls.__bytes__

        return cls

elif PY3:
    from io import StringIO
    StringIO = StringIO
    from io import BytesIO
    cBytesIO = BytesIO

    long_ = int
    integer_types = (int,)
    string_types = (str,)
    text_type = str

    xrange = range
    cmp = lambda a, b: (a > b) - (a < b)
    chr_ = lambda x: bytes([x])

    def endswith(text, end):
        # useful for paths which can be both, str and bytes
        if isinstance(text, str):
            if not isinstance(end, str):
                end = end.decode("ascii")
        else:
            if not isinstance(end, bytes):
                end = end.encode("ascii")
        return text.endswith(end)

    iteritems = lambda d: iter(d.items())
    itervalues = lambda d: iter(d.values())
    iterkeys = lambda d: iter(d.keys())

    iterbytes = lambda b: (bytes([v]) for v in b)

    def reraise(tp, value, tb):
        """Raise ``value`` as an exception of type ``tp`` with traceback ``tb``.

        Mirrors the Python 2 ``raise tp, value, tb`` statement: when
        ``value`` already is an instance of ``tp`` it is raised as-is,
        otherwise ``tp(value)`` is constructed and raised.
        """
        if isinstance(value, tp):
            # Re-wrapping would nest the exception inside itself, or fail
            # outright for types whose constructor needs several arguments.
            raise value.with_traceback(tb)
        raise tp(value).with_traceback(tb)

    def swap_to_string(cls):
        """Class decorator; a no-op on Python 3."""
        return cls
@@ -0,0 +1,197 @@
"""Constants used by Mutagen."""
GENRES = [
u"Blues",
u"Classic Rock",
u"Country",
u"Dance",
u"Disco",
u"Funk",
u"Grunge",
u"Hip-Hop",
u"Jazz",
u"Metal",
u"New Age",
u"Oldies",
u"Other",
u"Pop",
u"R&B",
u"Rap",
u"Reggae",
u"Rock",
u"Techno",
u"Industrial",
u"Alternative",
u"Ska",
u"Death Metal",
u"Pranks",
u"Soundtrack",
u"Euro-Techno",
u"Ambient",
u"Trip-Hop",
u"Vocal",
u"Jazz+Funk",
u"Fusion",
u"Trance",
u"Classical",
u"Instrumental",
u"Acid",
u"House",
u"Game",
u"Sound Clip",
u"Gospel",
u"Noise",
u"Alt. Rock",
u"Bass",
u"Soul",
u"Punk",
u"Space",
u"Meditative",
u"Instrumental Pop",
u"Instrumental Rock",
u"Ethnic",
u"Gothic",
u"Darkwave",
u"Techno-Industrial",
u"Electronic",
u"Pop-Folk",
u"Eurodance",
u"Dream",
u"Southern Rock",
u"Comedy",
u"Cult",
u"Gangsta Rap",
u"Top 40",
u"Christian Rap",
u"Pop/Funk",
u"Jungle",
u"Native American",
u"Cabaret",
u"New Wave",
u"Psychedelic",
u"Rave",
u"Showtunes",
u"Trailer",
u"Lo-Fi",
u"Tribal",
u"Acid Punk",
u"Acid Jazz",
u"Polka",
u"Retro",
u"Musical",
u"Rock & Roll",
u"Hard Rock",
u"Folk",
u"Folk-Rock",
u"National Folk",
u"Swing",
u"Fast-Fusion",
u"Bebop",
u"Latin",
u"Revival",
u"Celtic",
u"Bluegrass",
u"Avantgarde",
u"Gothic Rock",
u"Progressive Rock",
u"Psychedelic Rock",
u"Symphonic Rock",
u"Slow Rock",
u"Big Band",
u"Chorus",
u"Easy Listening",
u"Acoustic",
u"Humour",
u"Speech",
u"Chanson",
u"Opera",
u"Chamber Music",
u"Sonata",
u"Symphony",
u"Booty Bass",
u"Primus",
u"Porn Groove",
u"Satire",
u"Slow Jam",
u"Club",
u"Tango",
u"Samba",
u"Folklore",
u"Ballad",
u"Power Ballad",
u"Rhythmic Soul",
u"Freestyle",
u"Duet",
u"Punk Rock",
u"Drum Solo",
u"A Cappella",
u"Euro-House",
u"Dance Hall",
u"Goa",
u"Drum & Bass",
u"Club-House",
u"Hardcore",
u"Terror",
u"Indie",
u"BritPop",
u"Afro-Punk",
u"Polsk Punk",
u"Beat",
u"Christian Gangsta Rap",
u"Heavy Metal",
u"Black Metal",
u"Crossover",
u"Contemporary Christian",
u"Christian Rock",
u"Merengue",
u"Salsa",
u"Thrash Metal",
u"Anime",
u"JPop",
u"Synthpop",
u"Abstract",
u"Art Rock",
u"Baroque",
u"Bhangra",
u"Big Beat",
u"Breakbeat",
u"Chillout",
u"Downtempo",
u"Dub",
u"EBM",
u"Eclectic",
u"Electro",
u"Electroclash",
u"Emo",
u"Experimental",
u"Garage",
u"Global",
u"IDM",
u"Illbient",
u"Industro-Goth",
u"Jam Band",
u"Krautrock",
u"Leftfield",
u"Lounge",
u"Math Rock",
u"New Romantic",
u"Nu-Breakz",
u"Post-Punk",
u"Post-Rock",
u"Psytrance",
u"Shoegaze",
u"Space Rock",
u"Trop Rock",
u"World Music",
u"Neoclassical",
u"Audiobook",
u"Audio Theatre",
u"Neue Deutsche Welle",
u"Podcast",
u"Indie Rock",
u"G-Funk",
u"Dubstep",
u"Garage Rock",
u"Psybient",
]
"""The ID3v1 genre list."""
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,496 @@
# Copyright (C) 2005 Michael Urman
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
import struct
from struct import unpack, pack
from warnings import warn
from ._compat import text_type, chr_, PY3, swap_to_string, string_types
from mutagen._id3util import ID3JunkFrameError, ID3Warning, BitPaddedInt
from mutagen._util import total_ordering, decode_terminated
class Spec(object):
    """Base class of the field specs; stores only the field's name."""

    def __init__(self, name):
        self.name = name

    def __hash__(self):
        # Specs are explicitly unhashable.
        raise TypeError("Spec objects are unhashable")

    def _validate23(self, frame, value, **kwargs):
        """Return a possibly modified value which, if written,
        results in valid id3v2.3 data.

        The base implementation hands ``value`` back unchanged.
        """
        return value
class ByteSpec(Spec):
    """Spec for a single byte, represented as an integer."""

    def read(self, frame, data):
        """Return (first byte as int, remaining data)."""
        head = bytearray(data)[0]
        rest = data[1:]
        return head, rest

    def write(self, frame, value):
        """Return ``value`` converted to a one-byte string via ``chr_``."""
        return chr_(value)

    def validate(self, frame, value):
        """Return ``value`` unchanged; ``None`` is accepted as-is.

        Any other value is passed through ``chr_`` purely so that an
        unconvertible value raises here.
        """
        if value is None:
            return value
        chr_(value)
        return value
class IntegerSpec(Spec):
    """Spec for an integer that occupies all of the remaining data."""

    def read(self, frame, data):
        """Return (integer decoded from all of ``data``, empty remainder).

        The remainder is ``b''`` so that it has the same (bytes) type as
        the remainders returned by the other specs.
        """
        return int(BitPaddedInt(data, bits=8)), b''

    def write(self, frame, value):
        """Return ``value`` encoded with 8 bits per byte and no fixed width."""
        return BitPaddedInt.to_str(value, bits=8, width=-1)

    def validate(self, frame, value):
        """Return ``value`` unchanged; no checking is done."""
        return value
class SizedIntegerSpec(Spec):
    """Spec for an integer stored in a fixed number of bytes."""

    def __init__(self, name, size):
        super(SizedIntegerSpec, self).__init__(name)
        self.__sz = size

    def read(self, frame, data):
        """Return (integer from the first ``size`` bytes, remaining data)."""
        width = self.__sz
        number = int(BitPaddedInt(data[:width], bits=8))
        return number, data[width:]

    def write(self, frame, value):
        """Return ``value`` encoded into exactly ``size`` bytes."""
        return BitPaddedInt.to_str(value, bits=8, width=self.__sz)

    def validate(self, frame, value):
        """Return ``value`` unchanged; no checking is done."""
        return value
class EncodingSpec(ByteSpec):
    """Byte spec holding a text-encoding number (0-3 pass validation)."""

    def read(self, frame, data):
        """Return (encoding, remaining data).

        A first byte of 16 or more is not taken as an encoding: encoding 0
        is returned and the byte is put back in front of the data.
        """
        enc, rest = super(EncodingSpec, self).read(frame, data)
        if enc >= 16:
            return 0, chr_(enc) + rest
        return enc, rest

    def validate(self, frame, value):
        """Return ``value`` if it is None or in 0..3, else raise ValueError."""
        if value is None or 0 <= value <= 3:
            return value
        raise ValueError('Invalid Encoding: %r' % value)

    def _validate23(self, frame, value, **kwargs):
        # only 0, 1 are valid in v2.3, default to utf-16
        return min(1, value)
class StringSpec(Spec):
    """Spec for a byte string of fixed length ``length``."""

    def __init__(self, name, length):
        super(StringSpec, self).__init__(name)
        self.len = length

    def read(self, frame, data):
        """Return (first ``len`` bytes, remaining data)."""
        size = self.len
        return data[:size], data[size:]

    def write(self, frame, value):
        """Return ``value`` as exactly ``len`` bytes.

        ``None`` becomes all NUL bytes; other values are NUL-padded or
        truncated to the fixed length.
        """
        padding = b'\x00' * self.len
        if value is None:
            return padding
        return (bytes(value) + padding)[:self.len]

    def validate(self, frame, value):
        """Return ``value`` as bytes of the exact length (None passes).

        Non-bytes values are ASCII-encoded first.  Raises ValueError when
        the length does not match.
        """
        if value is None:
            return None
        if not isinstance(value, bytes):
            value = value.encode("ascii")
        if len(value) != self.len:
            raise ValueError(
                'Invalid StringSpec[%d] data: %r' % (self.len, value))
        return value
class BinaryDataSpec(Spec):
    """Spec for raw binary data taking up the rest of the frame."""

    def read(self, frame, data):
        """Return (all of ``data``, empty remainder)."""
        return data, b''

    def write(self, frame, value):
        """Return ``value`` as bytes; ``None`` becomes ``b""``.

        Non-bytes values are converted to text and ASCII-encoded.
        """
        if value is None:
            return b""
        if not isinstance(value, bytes):
            return text_type(value).encode("ascii")
        return value

    def validate(self, frame, value):
        """Return ``value`` as bytes.

        Non-bytes values are converted to text and ASCII-encoded.
        """
        if not isinstance(value, bytes):
            return text_type(value).encode("ascii")
        return value
class EncodedTextSpec(Spec):
    """Spec for terminated text in the encoding given by ``frame.encoding``."""

    # Okay, seriously. This is private and defined explicitly and
    # completely by the ID3 specification. You can't just add
    # encodings here however you want.
    #
    # Indexed by frame.encoding: (codec name, terminator bytes).
    _encodings = (
        ('latin1', b'\x00'),
        ('utf16', b'\x00\x00'),
        ('utf_16_be', b'\x00\x00'),
        ('utf8', b'\x00')
    )

    def read(self, frame, data):
        """Return (decoded text, remaining data).

        Raises ID3JunkFrameError when the data cannot be decoded.
        """
        codec, terminator = self._encodings[frame.encoding]
        try:
            # allow missing termination
            return decode_terminated(data, codec, strict=False)
        except ValueError:
            # utf-16 termination with missing BOM, or single NULL
            leading = data[:len(terminator)]
            if not leading.strip(b"\x00"):
                return u"", data[len(terminator):]

            # utf-16 data with single NULL, see issue 169
            try:
                return decode_terminated(data + b"\x00", codec)
            except ValueError:
                raise ID3JunkFrameError

    def write(self, frame, value):
        """Return ``value`` encoded for the frame, terminator appended."""
        codec, terminator = self._encodings[frame.encoding]
        encoded = value.encode(codec)
        return encoded + terminator

    def validate(self, frame, value):
        """Return ``value`` converted to text."""
        return text_type(value)
class MultiSpec(Spec):
    """Spec for a repeated sequence of one or more sub-specs.

    With a single sub-spec the value is a flat list of items; with several
    it is a list of records, one entry per sub-spec in each record.
    The optional ``sep`` keyword lets ``validate`` split a plain string.
    """

    def __init__(self, name, *specs, **kw):
        super(MultiSpec, self).__init__(name)
        self.specs = specs
        self.sep = kw.get('sep')

    def read(self, frame, data):
        """Read records until ``data`` is used up; return (values, data)."""
        single = len(self.specs) == 1
        values = []
        while data:
            record = []
            for spec in self.specs:
                item, data = spec.read(frame, data)
                record.append(item)
            values.append(record[0] if single else record)
        return values, data

    def write(self, frame, value):
        """Return the concatenation of every item written by its sub-spec."""
        if len(self.specs) == 1:
            only = self.specs[0]
            chunks = [only.write(frame, item) for item in value]
        else:
            chunks = [spec.write(frame, item)
                      for record in value
                      for item, spec in zip(record, self.specs)]
        return b''.join(chunks)

    def validate(self, frame, value):
        """Return the validated list; ``None`` becomes ``[]``.

        Raises ValueError when the value is not a list (after any
        separator split).
        """
        if value is None:
            return []
        if self.sep and isinstance(value, string_types):
            value = value.split(self.sep)
        if not isinstance(value, list):
            raise ValueError('Invalid MultiSpec data: %r' % value)
        if len(self.specs) == 1:
            only = self.specs[0]
            return [only.validate(frame, item) for item in value]
        return [
            [spec.validate(frame, item)
             for (item, spec) in zip(record, self.specs)]
            for record in value]

    def _validate23(self, frame, value, **kwargs):
        """Return ``value`` adjusted for id3v2.3 via the sub-specs.

        For a single text sub-spec a ``sep`` keyword joins all items into
        one validated string.
        """
        if len(self.specs) != 1:
            return [[spec._validate23(frame, item, **kwargs)
                     for (item, spec) in zip(record, self.specs)]
                    for record in value]

        spec = self.specs[0]

        # Merge single text spec multispecs only.
        # (TimeStampSpec being the exception, but it's not a valid v2.3 frame)
        is_text = isinstance(spec, EncodedTextSpec)
        if not is_text or isinstance(spec, TimeStampSpec):
            return value

        value = [spec._validate23(frame, item, **kwargs) for item in value]
        sep = kwargs.get("sep")
        if sep is None:
            return value
        return [spec.validate(frame, kwargs["sep"].join(value))]
class EncodedNumericTextSpec(EncodedTextSpec):
    """Subclass of EncodedTextSpec that adds no behaviour of its own."""
class EncodedNumericPartTextSpec(EncodedTextSpec):
    """Subclass of EncodedTextSpec that adds no behaviour of its own."""
class Latin1TextSpec(EncodedTextSpec):
    """Text spec that always uses latin1, whatever the frame encoding."""

    def read(self, frame, data):
        """Return (text before the first NUL, data after that NUL).

        Without any NUL byte all of ``data`` is the text and the
        remainder is empty.
        """
        # partition() yields an empty tail when no NUL is present.
        text, _nul, rest = data.partition(b'\x00')
        return text.decode('latin1'), rest

    def write(self, data, value):
        """Return ``value`` latin1-encoded with a NUL terminator."""
        encoded = value.encode('latin1')
        return encoded + b'\x00'

    def validate(self, frame, value):
        """Return ``value`` converted to text."""
        return text_type(value)
@swap_to_string
@total_ordering
class ID3TimeStamp(object):
    """A time stamp in ID3v2 format.

    This is a restricted form of the ISO 8601 standard; time stamps
    take the form of:
        YYYY-MM-DD HH:MM:SS
    Or some partial form (YYYY-MM-DD HH, YYYY, etc.).

    The 'text' attribute contains the raw text data of the time stamp.
    """

    import re

    def __init__(self, text):
        """Build from another ID3TimeStamp or from text.

        On Python 2 a byte string is decoded as UTF-8; on Python 3 any
        non-text value raises TypeError.
        """
        if isinstance(text, ID3TimeStamp):
            text = text.text
        elif not isinstance(text, text_type):
            if PY3:
                raise TypeError("not a str")
            text = text.decode("utf-8")

        self.text = text

    # Per-field format and the separator that follows each field; the
    # final 'x' is a dummy that get_text always strips.
    __formats = ['%04d'] + ['%02d'] * 5
    __seps = ['-', '-', ' ', ':', ':', 'x']

    def get_text(self):
        """Return the stamp as text, stopping at the first unset field."""
        parts = [self.year, self.month, self.day,
                 self.hour, self.minute, self.second]
        pieces = []
        for i, part in enumerate(parts):
            if part is None:
                break
            pieces.append(self.__formats[i] % part + self.__seps[i])
        # Drop the trailing separator.
        return u''.join(pieces)[:-1]

    def set_text(self, text, splitre=re.compile(r'[-T:/.]|\s+')):
        """Parse ``text`` into year/month/day/hour/minute/second.

        Fields that are missing or not integers are set to None.
        """
        # Padding with separators guarantees at least six pieces.
        fields = splitre.split(text + ':::::')[:6]
        names = ('year', 'month', 'day', 'hour', 'minute', 'second')
        for name, raw in zip(names, fields):
            try:
                v = int(raw)
            except ValueError:
                v = None
            setattr(self, name, v)

    text = property(get_text, set_text, doc="ID3v2.4 date and time.")

    def __str__(self):
        return self.text

    def __bytes__(self):
        return self.text.encode("utf-8")

    def __repr__(self):
        return repr(self.text)

    def __eq__(self, other):
        return self.text == other.text

    def __lt__(self, other):
        return self.text < other.text

    __hash__ = object.__hash__

    def encode(self, *args):
        """Return the text form encoded with ``str.encode(*args)``."""
        return self.text.encode(*args)
class TimeStampSpec(EncodedTextSpec):
    """Encoded text spec whose values are ID3TimeStamp objects."""

    def read(self, frame, data):
        """Return (ID3TimeStamp built from the decoded text, remaining data)."""
        text, rest = super(TimeStampSpec, self).read(frame, data)
        stamp = self.validate(frame, text)
        return stamp, rest

    def write(self, frame, data):
        """Return the stamp's text encoded, with spaces written as 'T'."""
        as_written = data.text.replace(' ', 'T')
        return super(TimeStampSpec, self).write(frame, as_written)

    def validate(self, frame, value):
        """Return ``ID3TimeStamp(value)``; a TypeError becomes ValueError."""
        try:
            stamp = ID3TimeStamp(value)
        except TypeError:
            raise ValueError("Invalid ID3TimeStamp: %r" % value)
        return stamp
class ChannelSpec(ByteSpec):
    """Byte spec that also carries named channel constants 0 through 8."""

    OTHER, MASTER, FRONTRIGHT, FRONTLEFT, BACKRIGHT = range(5)
    BACKLEFT, FRONTCENTRE, BACKCENTRE, SUBWOOFER = range(5, 9)
class VolumeAdjustmentSpec(Spec):
    """Spec for a volume adjustment stored as a signed 16-bit integer
    in units of 1/512."""

    def read(self, frame, data):
        """Return (adjustment as float, remaining data)."""
        raw = unpack('>h', data[0:2])[0]
        return raw / 512.0, data[2:]

    def write(self, frame, value):
        """Return ``value`` packed as a big-endian signed short.

        Raises struct.error when the scaled value is out of range.
        """
        number = int(round(value * 512))
        # pack only fails in 2.7, do it manually in 2.6
        if number < -32768 or number > 32767:
            raise struct.error
        return pack('>h', number)

    def validate(self, frame, value):
        """Return ``value``; raise ValueError when it cannot be written."""
        if value is None:
            return value
        try:
            self.write(frame, value)
        except struct.error:
            raise ValueError("out of range")
        return value
class VolumePeakSpec(Spec):
    """Spec for a peak volume: a bit-count byte followed by the peak bytes."""

    def read(self, frame, data):
        """Return (peak as float, remaining data).

        The first byte gives the number of bits in the peak value; at most
        four following bytes are used.  Raises ID3JunkFrameError when
        ``data`` is empty or shorter than the bit count requires.
        """
        # http://bugs.xmms.org/attachment.cgi?id=113&action=view
        # bytearray gives integer items on both Python 2 and Python 3.
        raw = bytearray(data)
        if not raw:
            raise ID3JunkFrameError
        bits = raw[0]
        vol_bytes = min(4, (bits + 7) >> 3)
        # not enough frame data
        if vol_bytes + 1 > len(raw):
            raise ID3JunkFrameError
        shift = ((8 - (bits & 7)) & 7) + (4 - vol_bytes) * 8
        peak = 0
        for byte in raw[1:vol_bytes + 1]:
            peak = peak * 256 + byte
        peak *= 2 ** shift
        return (float(peak) / (2**31-1)), data[1+vol_bytes:]

    def write(self, frame, value):
        """Return ``value`` as a 16-bit peak prefixed by its bit count.

        Raises struct.error when the scaled value is out of range.
        """
        number = int(round(value * 32768))
        # pack only fails in 2.7, do it manually in 2.6
        if not 0 <= number <= 65535:
            raise struct.error
        # always write as 16 bits for sanity.
        return b"\x10" + pack('>H', number)

    def validate(self, frame, value):
        """Return ``value``; raise ValueError when it cannot be written."""
        if value is not None:
            try:
                self.write(frame, value)
            except struct.error:
                raise ValueError("out of range")
        return value
class SynchronizedTextSpec(EncodedTextSpec):
    """Spec for a list of (text, time) pairs.

    Each entry is terminated text followed by a big-endian unsigned
    32-bit time value.
    """

    def read(self, frame, data):
        """Return (list of (text, time) tuples, empty remainder).

        Raises ID3JunkFrameError when text cannot be decoded or fewer
        than four bytes follow it.  The remainder is ``b""`` so that it
        has the same (bytes) type as the input.
        """
        texts = []
        encoding, term = self._encodings[frame.encoding]
        while data:
            try:
                value, data = decode_terminated(data, encoding)
            except ValueError:
                raise ID3JunkFrameError

            if len(data) < 4:
                raise ID3JunkFrameError
            time, = struct.unpack(">I", data[:4])

            texts.append((value, time))
            data = data[4:]
        return texts, b""

    def write(self, frame, value):
        """Return every (text, time) pair encoded back to back."""
        data = []
        encoding, term = self._encodings[frame.encoding]
        for text, time in value:
            text = text.encode(encoding) + term
            data.append(text + struct.pack(">I", time))
        return b"".join(data)

    def validate(self, frame, value):
        """Return ``value`` unchanged; no checking is done."""
        return value
class KeyEventSpec(Spec):
    """Spec for a list of 5-byte events: a signed byte plus an unsigned
    32-bit integer, big-endian."""

    def read(self, frame, data):
        """Return (list of unpacked event tuples, leftover bytes).

        Only whole 5-byte records are consumed.
        """
        usable = len(data) - len(data) % 5
        events = [struct.unpack(">bI", data[pos:pos + 5])
                  for pos in range(0, usable, 5)]
        return events, data[usable:]

    def write(self, frame, value):
        """Return every event packed back to back."""
        packed = [struct.pack(">bI", *event) for event in value]
        return b"".join(packed)

    def validate(self, frame, value):
        """Return ``value`` unchanged; no checking is done."""
        return value
class VolumeAdjustmentsSpec(Spec):
    """Spec for a list of (frequency, adjustment) pairs.

    Each pair is stored as an unsigned 16-bit frequency in units of 1/2
    and a signed 16-bit adjustment in units of 1/512.
    """
    # Not to be confused with VolumeAdjustmentSpec.

    def read(self, frame, data):
        """Return (sorted list of (freq, adj) tuples, leftover bytes).

        A frequency that occurs more than once keeps its last adjustment.
        """
        adjustments = {}
        while len(data) >= 4:
            freq, adj = struct.unpack(">Hh", data[:4])
            data = data[4:]
            freq /= 2.0
            adj /= 512.0
            adjustments[freq] = adj
        # sorted() works on Python 3 too, where items() has no sort().
        return sorted(adjustments.items()), data

    def write(self, frame, value):
        """Return the pairs packed in sorted order.

        The caller's sequence is not modified.
        """
        return b"".join([struct.pack(">Hh", int(freq * 2), int(adj * 512))
                         for (freq, adj) in sorted(value)])

    def validate(self, frame, value):
        """Return ``value`` unchanged; no checking is done."""
        return value
class ASPIIndexSpec(Spec):
    """Spec for ``frame.N`` index values of ``frame.b`` (8 or 16) bits each."""

    def read(self, frame, data):
        """Return (list of index values, remaining data).

        For any other bit count an ID3Warning is issued and an empty list
        is returned with ``data`` untouched.
        """
        if frame.b == 16:
            code, width = "H", 2
        elif frame.b == 8:
            code, width = "B", 1
        else:
            warn("invalid bit count in ASPI (%d)" % frame.b, ID3Warning)
            return [], data

        total = frame.N * width
        raw_indexes, rest = data[:total], data[total:]
        values = struct.unpack(">" + code * frame.N, raw_indexes)
        return list(values), rest

    def write(self, frame, values):
        """Return ``values`` packed big-endian at the frame's bit width.

        Raises ValueError when ``frame.b`` is neither 8 nor 16.
        """
        if frame.b == 16:
            code = "H"
        elif frame.b == 8:
            code = "B"
        else:
            raise ValueError("frame.b must be 8 or 16")
        layout = ">" + code * frame.N
        return struct.pack(layout, *values)

    def validate(self, frame, values):
        """Return ``values`` unchanged; no checking is done."""
        return values
@@ -0,0 +1,178 @@
# Copyright (C) 2005 Michael Urman
# 2013 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
from ._compat import long_, integer_types
class error(Exception):
    """Common base class of the ID3 exceptions defined here."""


class ID3NoHeaderError(error, ValueError):
    """ID3 error that is also a ValueError."""


class ID3BadUnsynchData(error, ValueError):
    """ID3 error that is also a ValueError."""


class ID3BadCompressedData(error, ValueError):
    """ID3 error that is also a ValueError."""


class ID3TagError(error, ValueError):
    """ID3 error that is also a ValueError."""


class ID3UnsupportedVersionError(error, NotImplementedError):
    """ID3 error that is also a NotImplementedError."""


class ID3EncryptionUnsupportedError(error, NotImplementedError):
    """ID3 error that is also a NotImplementedError."""


class ID3JunkFrameError(error, ValueError):
    """ID3 error that is also a ValueError."""


class ID3Warning(error, UserWarning):
    """ID3 error that is also a UserWarning (usable as a warning category)."""
class unsynch(object):
    """Encode and decode byte strings so that 0xFF is never followed by
    a 0x00 byte or a byte >= 0xE0 without a 0x00 inserted in between."""

    @staticmethod
    def decode(value):
        """Return ``value`` with the 0x00 following each 0xFF removed.

        Raises ValueError when a 0xFF is followed by a byte >= 0xE0 or
        when the input ends directly after a 0xFF.
        """
        result = bytearray()
        after_ff = False
        for byte in bytearray(value):
            if after_ff:
                if byte >= 0xE0:
                    raise ValueError('invalid sync-safe string')
                if byte != 0x00:
                    result.append(byte)
                after_ff = False
            else:
                result.append(byte)
                after_ff = byte == 0xFF
        if after_ff:
            raise ValueError('string ended unsafe')
        return bytes(result)

    @staticmethod
    def encode(value):
        """Return ``value`` with 0x00 inserted after each 0xFF where needed.

        A 0x00 is inserted when the byte after a 0xFF is 0x00 or >= 0xE0,
        and appended when the input ends with 0xFF.
        """
        result = bytearray()
        after_ff = False
        for byte in bytearray(value):
            if after_ff and (byte == 0x00 or byte >= 0xE0):
                result.append(0x00)
            result.append(byte)
            after_ff = byte == 0xFF
        if after_ff:
            result.append(0x00)
        return bytes(result)
class _BitPaddedMixin(object):
def as_str(self, width=4, minwidth=4):
return self.to_str(self, self.bits, self.bigendian, width, minwidth)
@staticmethod
def to_str(value, bits=7, bigendian=True, width=4, minwidth=4):
mask = (1 << bits) - 1
if width != -1:
index = 0
bytes_ = bytearray(width)
try:
while value:
bytes_[index] = value & mask
value >>= bits
index += 1
except IndexError:
raise ValueError('Value too wide (>%d bytes)' % width)
else:
# PCNT and POPM use growing integers
# of at least 4 bytes (=minwidth) as counters.
bytes_ = bytearray()
append = bytes_.append
while value:
append(value & mask)
value >>= bits
bytes_ = bytes_.ljust(minwidth, b"\x00")
if bigendian:
bytes_.reverse()
return bytes(bytes_)
@staticmethod
def has_valid_padding(value, bits=7):
"""Whether the padding bits are all zero"""
assert bits <= 8
mask = (((1 << (8 - bits)) - 1) << bits)
if isinstance(value, integer_types):
while value:
if value & mask:
return False
value >>= 8
elif isinstance(value, bytes):
for byte in bytearray(value):
if byte & mask:
return False
else:
raise TypeError
return True
class BitPaddedInt(int, _BitPaddedMixin):
    """Integer decoded from a value that uses only ``bits`` bits per byte."""

    def __new__(cls, value, bits=7, bigendian=True):
        """Decode ``value`` (an integer or bytes) into a new instance.

        The low ``bits`` bits of every byte are combined; for bytes input
        ``bigendian`` selects the byte order.  Any other input type raises
        TypeError.  The result carries ``bits`` and ``bigendian`` as
        attributes and is a BitPaddedLong when it is not a plain ``int``.
        """
        mask = (1 << (bits)) - 1
        total = 0
        position = 0

        if isinstance(value, integer_types):
            # Consume the integer one byte at a time, lowest byte first.
            while value:
                total += (value & mask) << position
                value >>= 8
                position += bits
        elif isinstance(value, bytes):
            if bigendian:
                value = reversed(value)
            for octet in bytearray(value):
                total += (octet & mask) << position
                position += bits
        else:
            raise TypeError

        if not isinstance(total, int):
            self = long_.__new__(BitPaddedLong, total)
        else:
            self = int.__new__(BitPaddedInt, total)

        self.bits = bits
        self.bigendian = bigendian
        return self
class BitPaddedLong(long_, _BitPaddedMixin):
    """``long_`` counterpart of BitPaddedInt; adds no behaviour of its own."""
@@ -0,0 +1,422 @@
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Utility classes for Mutagen.
You should not rely on the interfaces here being stable. They are
intended for internal use in Mutagen only.
"""
import struct
import codecs
from fnmatch import fnmatchcase
from ._compat import chr_, text_type, PY2, iteritems, iterbytes
def total_ordering(cls):
    """Class decorator adding ``<=``, ``>``, ``>=`` and ``!=`` to ``cls``.

    The class must already provide ``__eq__`` and ``__lt__``; the added
    methods are built from those two.  Returns ``cls`` itself.
    """
    assert hasattr(cls, "__eq__")
    assert hasattr(cls, "__lt__")

    def _le(self, other):
        return self == other or self < other

    def _gt(self, other):
        return not (self == other or self < other)

    def _ge(self, other):
        return not self < other

    def _ne(self, other):
        return not self.__eq__(other)

    cls.__le__ = _le
    cls.__gt__ = _gt
    cls.__ge__ = _ge
    cls.__ne__ = _ne
    return cls
@total_ordering
class DictMixin(object):
    """Implement the dict API using keys() and __*item__ methods.

    Similar to UserDict.DictMixin, this takes a class that defines
    __getitem__, __setitem__, __delitem__, and keys(), and turns it
    into a full dict-like object.

    UserDict.DictMixin is not suitable for this purpose because it's
    an old-style class.

    This class is not optimized for very large dictionaries; many
    functions have linear memory requirements. I recommend you
    override some of these functions if speed is required.
    """

    def __iter__(self):
        return iter(self.keys())

    def __has_key(self, key):
        try:
            self[key]
        except KeyError:
            return False
        else:
            return True

    # has_key() is only exposed on Python 2.
    if PY2:
        has_key = __has_key

    __contains__ = __has_key

    iterkeys = lambda self: iter(self.keys())

    def values(self):
        return [self[k] for k in self.keys()]

    itervalues = lambda self: iter(self.values())

    def items(self):
        return list(zip(self.keys(), self.values()))

    iteritems = lambda s: iter(s.items())

    def clear(self):
        # Copy the keys first: deleting while iterating is unsafe.
        for key in list(self.keys()):
            self.__delitem__(key)

    def pop(self, key, *args):
        """Remove ``key`` and return its value.

        One optional extra argument is returned instead of raising
        KeyError when the key is missing.
        """
        if len(args) > 1:
            raise TypeError("pop takes at most two arguments")
        try:
            value = self[key]
        except KeyError:
            if args:
                return args[0]
            else:
                raise
        del(self[key])
        return value

    def popitem(self):
        """Remove and return some (key, value) pair; KeyError when empty."""
        for key in self.keys():
            break
        else:
            raise KeyError("dictionary is empty")
        return key, self.pop(key)

    def update(self, other=None, **kwargs):
        """Set items from ``other`` and then from the keyword arguments.

        ``other`` may be an object with an ``items()`` method or an
        iterable of (key, value) pairs.  Keyword arguments are applied
        afterwards, whether or not ``other`` was given.
        """
        if other is not None:
            # Decide up front how to iterate, so that an AttributeError
            # raised while storing an item is not mistaken for
            # "other has no items()".
            if hasattr(other, "items"):
                pairs = other.items()
            else:
                pairs = other
            for key, value in pairs:
                self[key] = value

        for key, value in kwargs.items():
            self[key] = value

    def setdefault(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            self[key] = default
            return default

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def __repr__(self):
        return repr(dict(self.items()))

    def __eq__(self, other):
        return dict(self.items()) == other

    def __lt__(self, other):
        return dict(self.items()) < other

    __hash__ = object.__hash__

    def __len__(self):
        return len(self.keys())
class DictProxy(DictMixin):
    """DictMixin backed by a private plain dict."""

    def __init__(self, *args, **kwargs):
        self.__dict = dict()
        super(DictProxy, self).__init__(*args, **kwargs)

    def __getitem__(self, key):
        """Return the stored value; KeyError when missing."""
        return self.__dict[key]

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``."""
        self.__dict[key] = value

    def __delitem__(self, key):
        """Delete ``key``; KeyError when missing."""
        del self.__dict[key]

    def keys(self):
        """Return the keys of the backing dict."""
        return self.__dict.keys()
def _cdata_unpacker(fmt):
    """Return a staticmethod unpacking a single ``fmt`` value from a buffer."""
    return staticmethod(lambda data: struct.unpack(fmt, data)[0])


def _cdata_packer(fmt):
    """Return a staticmethod packing a single value with ``fmt``."""
    return staticmethod(lambda data: struct.pack(fmt, data))


class cdata(object):
    """C character buffer to Python numeric type conversions."""

    error = struct.error

    # Buffer -> number.  Suffix: _le little-endian, _be big-endian.
    short_le = _cdata_unpacker('<h')
    ushort_le = _cdata_unpacker('<H')

    short_be = _cdata_unpacker('>h')
    ushort_be = _cdata_unpacker('>H')

    int_le = _cdata_unpacker('<i')
    uint_le = _cdata_unpacker('<I')

    int_be = _cdata_unpacker('>i')
    uint_be = _cdata_unpacker('>I')

    longlong_le = _cdata_unpacker('<q')
    ulonglong_le = _cdata_unpacker('<Q')

    longlong_be = _cdata_unpacker('>q')
    ulonglong_be = _cdata_unpacker('>Q')

    # Number -> buffer.
    to_short_le = _cdata_packer('<h')
    to_ushort_le = _cdata_packer('<H')

    to_short_be = _cdata_packer('>h')
    to_ushort_be = _cdata_packer('>H')

    to_int_le = _cdata_packer('<i')
    to_uint_le = _cdata_packer('<I')

    to_int_be = _cdata_packer('>i')
    to_uint_be = _cdata_packer('>I')

    to_longlong_le = _cdata_packer('<q')
    to_ulonglong_le = _cdata_packer('<Q')

    to_longlong_be = _cdata_packer('>q')
    to_ulonglong_be = _cdata_packer('>Q')

    # 256-byte table: entry N is byte N with its bit order reversed.
    # Generator expressions keep the loop variables out of the class
    # namespace, so nothing has to be deleted afterwards.
    bitswap = b''.join(
        chr_(sum(((val >> i) & 1) << (7 - i) for i in range(8)))
        for val in range(256))

    test_bit = staticmethod(lambda value, n: bool((value >> n) & 1))
def lock(fileobj):
    """Lock a file object 'safely'.

    "Safely" means that a platform without fcntl, or a filesystem that
    refuses the lock, is not treated as an error. The call blocks until
    the exclusive lock is obtained.

    Returns True if the lock was taken and False if it could not be;
    more extreme problems (full lock table, invalid file) surface as
    exceptions.
    """
    try:
        import fcntl
    except ImportError:
        # No fcntl on this platform: report "not locked".
        return False

    try:
        fcntl.lockf(fileobj, fcntl.LOCK_EX)
    except IOError:
        # FIXME: There's possibly a lot of complicated
        # logic that needs to go here in case the IOError
        # is EACCES or EAGAIN.
        return False
    return True
def unlock(fileobj):
    """Unlock a file object.

    Only call this for a file object on which lock() returned true.
    """
    # A failure here means lock/unlock calls are mismatched, so errors
    # (including a missing fcntl) are deliberately left to propagate.
    import fcntl
    release = fcntl.LOCK_UN
    fcntl.lockf(fileobj, release)
def insert_bytes(fobj, size, offset, BUFFER_SIZE=2**16):
    """Insert size bytes of empty space starting at offset.

    fobj must be an open file object, open rb+ or
    equivalent. Mutagen tries to use mmap to resize the file, but
    falls back to a significantly slower method if mmap fails.

    The file grows by ``size`` bytes and everything from ``offset`` to
    the old end of file is shifted towards the end. The gap itself is
    not explicitly cleared.

    Raises AssertionError if ``size`` is not positive, ``offset`` is
    negative, or ``offset`` lies beyond the end of the file.
    """
    assert 0 < size
    assert 0 <= offset
    locked = False
    fobj.seek(0, 2)
    filesize = fobj.tell()
    movesize = filesize - offset
    # Same guard delete_bytes() applies: an offset past EOF would give a
    # negative movesize, and the fallback copy loop below would then
    # read and write at bogus positions. Check before touching the file.
    assert 0 <= movesize
    fobj.write(b'\x00' * size)
    fobj.flush()
    try:
        try:
            import mmap
            mapped = mmap.mmap(fobj.fileno(), filesize + size)
            try:
                mapped.move(offset + size, offset, movesize)
            finally:
                mapped.close()
        except (ValueError, EnvironmentError, ImportError):
            # handle broken mmap scenarios
            locked = lock(fobj)
            fobj.truncate(filesize)

            fobj.seek(0, 2)
            padsize = size
            # Don't generate an enormous string if we need to pad
            # the file out several megs.
            while padsize:
                addsize = min(BUFFER_SIZE, padsize)
                fobj.write(b"\x00" * addsize)
                padsize -= addsize

            fobj.seek(filesize, 0)
            while movesize:
                # At the start of this loop, fobj is pointing at the end
                # of the data we need to move, which is of movesize length.
                thismove = min(BUFFER_SIZE, movesize)
                # Seek back however much we're going to read this frame.
                fobj.seek(-thismove, 1)
                nextpos = fobj.tell()
                # Read it, so we're back at the end.
                data = fobj.read(thismove)
                # Seek back to where we need to write it.
                fobj.seek(-thismove + size, 1)
                # Write it.
                fobj.write(data)
                # And seek back to the end of the unmoved data.
                fobj.seek(nextpos)
                movesize -= thismove

            fobj.flush()
    finally:
        if locked:
            unlock(fobj)
def delete_bytes(fobj, size, offset, BUFFER_SIZE=2**16):
    """Delete size bytes of empty space starting at offset.

    fobj must be an open file object, open rb+ or
    equivalent. Mutagen tries to use mmap to resize the file, but
    falls back to a significantly slower method if mmap fails.
    """
    locked = False
    assert 0 < size
    assert 0 <= offset
    fobj.seek(0, 2)
    filesize = fobj.tell()
    # Number of bytes after the deleted region that must slide down.
    movesize = filesize - offset - size
    assert 0 <= movesize
    try:
        if movesize > 0:
            fobj.flush()
            try:
                import mmap
                mapped = mmap.mmap(fobj.fileno(), filesize)
                try:
                    mapped.move(offset, offset + size, movesize)
                finally:
                    mapped.close()
            except (ValueError, EnvironmentError, ImportError):
                # handle broken mmap scenarios
                locked = lock(fobj)
                write_pos = offset
                read_pos = offset + size
                while True:
                    fobj.seek(read_pos)
                    chunk = fobj.read(BUFFER_SIZE)
                    if not chunk:
                        break
                    fobj.seek(write_pos)
                    fobj.write(chunk)
                    write_pos += len(chunk)
                    read_pos += len(chunk)
        fobj.truncate(filesize - size)
        fobj.flush()
    finally:
        if locked:
            unlock(fobj)
def utf8(data):
    """Convert a basestring to a valid UTF-8 str."""
    if isinstance(data, bytes):
        # Decode first so invalid sequences get replaced before re-encoding.
        text = data.decode("utf-8", "replace")
    elif isinstance(data, text_type):
        text = data
    else:
        raise TypeError("only unicode/str types can be converted to UTF-8")
    return text.encode("utf-8")
def dict_match(d, key, default=None):
    """Look up *key* in *d*, falling back to glob-pattern keys.

    An exact hit wins. Otherwise each key of *d* is tried as a
    case-sensitive fnmatch pattern against *key* and the first matching
    value is returned; *default* is returned when nothing matches.
    """
    try:
        return d[key]
    except KeyError:
        pass

    for pattern, value in iteritems(d):
        if fnmatchcase(key, pattern):
            return value
    return default
def decode_terminated(data, encoding, strict=True):
    """Returns the decoded data until the first NULL terminator
    and all data after it.

    Raises UnicodeError if the data can't be decoded and LookupError if
    the encoding is unknown.

    If no NULL terminator is present (even when the data decodes fine)
    ValueError is raised, unless strict is False, in which case the
    decoded string is returned with an empty remainder.
    """
    codec_info = codecs.lookup(encoding)

    # normalize encoding name so we can compare by name
    encoding = codec_info.name

    # fast path: the terminator is a single zero byte in these encodings
    if encoding in ("utf-8", "iso8859-1"):
        head, sep, tail = data.partition(b"\x00")
        # decode before complaining about the missing terminator so that
        # UnicodeError takes precedence, like in the slow path
        text = head.decode(encoding)
        if sep:
            return text, tail
        if strict:
            raise ValueError("not null terminated")
        return text, b""

    # slow path: feed the decoder one byte at a time until it yields NUL
    decoder = codec_info.incrementaldecoder()
    pieces = []
    for pos, byte in enumerate(iterbytes(data)):
        char = decoder.decode(byte)
        if char == u"\x00":
            return u"".join(pieces), data[pos + 1:]
        pieces.append(char)

    # make sure the decoder is finished
    pieces.append(decoder.decode(b"", True))
    if strict:
        raise ValueError("not null terminated")
    return u"".join(pieces), b""
@@ -0,0 +1,317 @@
# Vorbis comment support for Mutagen
# Copyright 2005-2006 Joe Wreschnig
# 2013 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""Read and write Vorbis comment data.
Vorbis comments are freeform key/value pairs; keys are
case-insensitive ASCII and values are Unicode strings. A key may have
multiple values.
The specification is at http://www.xiph.org/vorbis/doc/v-comment.html.
"""
import sys
import mutagen
from ._compat import reraise, BytesIO, text_type, xrange, PY3, PY2
from mutagen._util import DictMixin, cdata
def is_valid_key(key):
    """Return true if a string is a valid Vorbis comment key.

    A valid key is non-empty and consists only of printable ASCII from
    0x20 (space) to 0x7D ('}'), with '=' excluded.

    Takes str/unicode in Python 2, unicode in Python 3; bytes raise
    ValueError in Python 3.
    """
    if PY3 and isinstance(key, bytes):
        raise ValueError

    for char in key:
        if not (" " <= char <= "}") or char == "=":
            return False
    # An empty key has no offending character but is still invalid.
    return bool(key)


istag = is_valid_key
class error(IOError):
    """Base class for Vorbis comment errors."""


class VorbisUnsetFrameError(error):
    """Raised by VComment.load when the framing bit was unset."""


class VorbisEncodingError(error):
    """Raised by VComment.load for an undecodable or malformed comment."""
class VComment(mutagen.Metadata, list):
    """A Vorbis comment parser, accessor, and renderer.

    All comment ordering is preserved. A VComment is a list of
    key/value pairs, and so any Python list method can be used on it.

    Vorbis comments are always wrapped in something like an Ogg Vorbis
    bitstream or a FLAC metadata block, so this loads string data or a
    file-like object, not a filename.

    Attributes:

    * vendor -- the stream 'vendor' (i.e. writer); default 'Mutagen'
    """

    vendor = u"Mutagen " + mutagen.version_string

    def __init__(self, data=None, *args, **kwargs):
        """Optionally parse *data* (bytes or a file-like object).

        Extra arguments are forwarded to load(). Raises TypeError when
        *data* is neither bytes nor an object with a ``read`` attribute.
        """
        # Collect the args to pass to load, this lets child classes
        # override just load and get equivalent magic for the
        # constructor.
        if data is not None:
            if isinstance(data, bytes):
                data = BytesIO(data)
            elif not hasattr(data, 'read'):
                raise TypeError("VComment requires bytes or a file-like")
            self.load(data, *args, **kwargs)

    def load(self, fileobj, errors='replace', framing=True):
        """Parse a Vorbis comment from a file-like object.

        Keyword arguments:

        * errors:
            'strict', 'replace', or 'ignore'. This affects Unicode decoding
            and how other malformed content is interpreted.
        * framing -- if true, fail if a framing bit is not present

        Framing bits are required by the Vorbis comment specification,
        but are not used in FLAC Vorbis comment blocks.

        Raises error (or one of its subclasses) when the data cannot be
        parsed as a Vorbis comment.
        """
        try:
            vendor_length = cdata.uint_le(fileobj.read(4))
            self.vendor = fileobj.read(vendor_length).decode('utf-8', errors)
            count = cdata.uint_le(fileobj.read(4))
            for i in xrange(count):
                length = cdata.uint_le(fileobj.read(4))
                try:
                    string = fileobj.read(length).decode('utf-8', errors)
                except (OverflowError, MemoryError):
                    raise error("cannot read %d bytes, too large" % length)
                try:
                    tag, value = string.split('=', 1)
                except ValueError as err:
                    # No '=' separator; what happens depends on `errors`.
                    if errors == "ignore":
                        continue
                    elif errors == "replace":
                        tag, value = u"unknown%d" % i, string
                    else:
                        reraise(VorbisEncodingError, err, sys.exc_info()[2])
                try:
                    tag = tag.encode('ascii', errors)
                except UnicodeEncodeError:
                    raise VorbisEncodingError("invalid tag name %r" % tag)
                else:
                    # string keys in py3k
                    if PY3:
                        tag = tag.decode("ascii")
                    # Entries with an invalid key are silently dropped.
                    if is_valid_key(tag):
                        self.append((tag, value))

            if framing and not ord(fileobj.read(1)) & 0x01:
                raise VorbisUnsetFrameError("framing bit was unset")
        except (cdata.error, TypeError):
            # Short reads surface as struct errors from cdata, or as a
            # TypeError from ord() on an empty read.
            raise error("file is not a valid Vorbis comment")

    def validate(self):
        """Validate keys and values.

        Check to make sure every key used is a valid Vorbis key, and
        that every value used is a valid Unicode or UTF-8 string. If
        any invalid keys or values are found, a ValueError is raised.

        In Python 3 all keys and values have to be a string.

        Returns True when everything is valid.
        """
        # be stricter in Python 3
        if PY3:
            if not isinstance(self.vendor, text_type):
                raise ValueError
            for key, value in self:
                if not isinstance(key, text_type):
                    raise ValueError
                if not isinstance(value, text_type):
                    raise ValueError

        if not isinstance(self.vendor, text_type):
            try:
                self.vendor.decode('utf-8')
            except UnicodeDecodeError:
                raise ValueError

        for key, value in self:
            # `except Exception` rather than a bare except: any failure of
            # the check means "invalid", but KeyboardInterrupt/SystemExit
            # must not be converted into a ValueError.
            try:
                if not is_valid_key(key):
                    raise ValueError
            except Exception:
                raise ValueError("%r is not a valid key" % key)

            if not isinstance(value, text_type):
                try:
                    value.encode("utf-8")
                except Exception:
                    raise ValueError("%r is not a valid value" % value)

        return True

    def clear(self):
        """Clear all keys from the comment."""
        # Remove pair by pair over a snapshot of the list.
        for i in list(self):
            self.remove(i)

    def write(self, framing=True):
        """Return a string representation of the data.

        Validation is always performed, so calling this function on
        invalid data may raise a ValueError.

        Keyword arguments:

        * framing -- if true, append a framing bit (see load)
        """
        self.validate()

        def _encode(value):
            # Text is written as UTF-8; bytes pass through untouched.
            if not isinstance(value, bytes):
                return value.encode('utf-8')
            return value

        f = BytesIO()
        vendor = _encode(self.vendor)
        f.write(cdata.to_uint_le(len(vendor)))
        f.write(vendor)
        f.write(cdata.to_uint_le(len(self)))
        for tag, value in self:
            tag = _encode(tag)
            value = _encode(value)
            comment = tag + b"=" + value
            f.write(cdata.to_uint_le(len(comment)))
            f.write(comment)
        if framing:
            f.write(b"\x01")
        return f.getvalue()

    def pprint(self):
        """Return the comments as newline-separated ``key=value`` text."""
        def _decode(value):
            if not isinstance(value, text_type):
                return value.decode('utf-8', 'replace')
            return value

        tags = [u"%s=%s" % (_decode(k), _decode(v)) for k, v in self]
        return u"\n".join(tags)
class VCommentDict(VComment, DictMixin):
    """A VComment that looks like a dictionary.

    Two things differ from a real dictionary: len(comment) still counts
    values rather than keys, and iterating yields (key, value) pairs
    rather than keys, so a key with several values shows up several
    times.

    Since Vorbis comment keys are case-insensitive, all keys are
    normalized to lowercase ASCII.
    """

    def __getitem__(self, key):
        """A list of values for the key.

        This is a copy, so comment['title'].append('a title') will not
        work.
        """
        if not is_valid_key(key):
            raise ValueError

        wanted = key.lower()
        found = [value for (name, value) in self if name.lower() == wanted]
        if found:
            return found
        raise KeyError(wanted)

    def __delitem__(self, key):
        """Delete all values associated with the key."""
        if not is_valid_key(key):
            raise ValueError

        wanted = key.lower()
        doomed = [pair for pair in self if pair[0].lower() == wanted]
        if not doomed:
            raise KeyError(wanted)
        for pair in doomed:
            self.remove(pair)

    def __contains__(self, key):
        """Return true if the key has any values."""
        if not is_valid_key(key):
            raise ValueError

        wanted = key.lower()
        return any(name.lower() == wanted for name, value in self)

    def __setitem__(self, key, values):
        """Set a key's value or values.

        Setting a value overwrites all old ones. The value may be a
        list of Unicode or UTF-8 strings, or a single Unicode or UTF-8
        string.
        """
        if not is_valid_key(key):
            raise ValueError

        if not isinstance(values, list):
            values = [values]

        # Drop whatever was stored under this key before.
        try:
            del self[key]
        except KeyError:
            pass

        if PY2:
            key = key.encode('ascii')

        for value in values:
            self.append((key, value))

    def keys(self):
        """Return all keys in the comment."""
        lowered = set([name.lower() for name, value in self])
        return list(lowered)

    def as_dict(self):
        """Return a copy of the comment data in a real dict."""
        pairs = [(key, self[key]) for key in self.keys()]
        return dict(pairs)
@@ -0,0 +1,311 @@
# AIFF audio stream header information and ID3 tag support for Mutagen.
# Copyright 2014 Evan Purkhiser <evanpurkhiser@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""AIFF audio stream information and tags."""
import struct
from struct import pack
from ._compat import endswith
from mutagen import StreamInfo, FileType
from mutagen.id3 import ID3
from mutagen._id3util import error as ID3Error
from mutagen._util import insert_bytes, delete_bytes
__all__ = ["AIFF", "Open", "delete"]
class error(RuntimeError):
    """Base class for the errors raised by this module."""


class InvalidChunk(error, IOError):
    """Raised when a chunk header is truncated or has an all-zero ID."""
# based on stdlib's aifc
_HUGE_VAL = 1.79769313486231e+308


def read_float(s):  # 10 bytes
    """Decode a 10-byte big-endian extended float into a Python float.

    The buffer holds a signed 16-bit exponent word followed by two
    unsigned 32-bit mantissa halves. An exponent of 0x7FFF yields
    +/- _HUGE_VAL, and an all-zero value yields zero.
    """
    exponent, mant_hi, mant_lo = struct.unpack('>hLL', s)

    # The sign bit is the top bit of the exponent word.
    negative = exponent < 0
    if negative:
        exponent += 0x8000

    if exponent == mant_hi == mant_lo == 0:
        magnitude = 0.0
    elif exponent == 0x7FFF:
        magnitude = _HUGE_VAL
    else:
        mantissa = mant_hi * 0x100000000 + mant_lo
        magnitude = mantissa * pow(2.0, exponent - 16383 - 63)

    return -magnitude if negative else magnitude
class IFFChunk(object):
    """Representation of a single IFF chunk"""

    # Chunk headers are 8 bytes long (4 for ID and 4 for the size)
    HEADER_SIZE = 8

    def __init__(self, fileobj, parent_chunk=None):
        """Parse the chunk header found at fileobj's current position.

        Raises InvalidChunk if fewer than HEADER_SIZE bytes are left or
        the chunk ID is four zero bytes.
        """
        self.__fileobj = fileobj
        self.parent_chunk = parent_chunk
        self.offset = fileobj.tell()

        raw_header = fileobj.read(self.HEADER_SIZE)
        if len(raw_header) < self.HEADER_SIZE:
            raise InvalidChunk()

        chunk_id, payload_size = struct.unpack('>4si', raw_header)
        self.id = chunk_id
        self.data_size = payload_size
        if chunk_id == b'\x00' * 4:
            raise InvalidChunk()

        # Total size counts the header; the payload starts right after it.
        self.size = self.HEADER_SIZE + payload_size
        self.data_offset = fileobj.tell()
        self.data = None

    def read(self):
        """Read the chunks data"""
        source = self.__fileobj
        source.seek(self.data_offset)
        self.data = source.read(self.data_size)

    def delete(self):
        """Removes the chunk from the file"""
        delete_bytes(self.__fileobj, self.size, self.offset)
        parent = self.parent_chunk
        if parent is not None:
            parent.resize(parent.data_size - self.size)

    def resize(self, data_size):
        """Update the size of the chunk"""
        # The size field sits right after the 4 byte ID.
        self.__fileobj.seek(self.offset + 4)
        self.__fileobj.write(pack('>I', data_size))

        parent = self.parent_chunk
        if parent is not None:
            # Propagate the change in size to the enclosing chunk.
            shrink = self.data_size - data_size
            parent.resize(parent.data_size - shrink)

        self.data_size = data_size
        self.size = data_size + self.HEADER_SIZE
class IFFFile(object):
    """Representation of a IFF file"""

    def __init__(self, fileobj):
        """Index every chunk found in *fileobj*, keyed by stripped ID."""
        self.__fileobj = fileobj
        self.__chunks = {}

        # AIFF Files always start with the FORM chunk which contains a 4 byte
        # ID before the start of other chunks
        fileobj.seek(0)
        self.__chunks['FORM'] = IFFChunk(fileobj)

        # Skip past the 4 byte FORM id
        fileobj.seek(IFFChunk.HEADER_SIZE + 4)

        # Where the next chunk can be located. We need to keep track of this
        # since the size indicated in the FORM header may not match up with the
        # offset determined from the size of the last chunk in the file
        self.__next_offset = fileobj.tell()

        # Load all of the chunks; an unparsable header ends the scan
        while True:
            try:
                chunk = IFFChunk(fileobj, self['FORM'])
            except InvalidChunk:
                break
            self.__chunks[chunk.id.strip()] = chunk

            # Calculate the location of the next chunk,
            # considering the pad byte
            following = chunk.offset + chunk.size
            following += following % 2
            self.__next_offset = following
            fileobj.seek(following)

    def __contains__(self, id_):
        """Check if the IFF file contains a specific chunk"""
        return id_ in self.__chunks

    def __getitem__(self, id_):
        """Get a chunk from the IFF file"""
        try:
            return self.__chunks[id_]
        except KeyError:
            message = "%r has no %r chunk" % (self.__fileobj.name, id_)
            raise KeyError(message)

    def __delitem__(self, id_):
        """Remove a chunk from the IFF file"""
        chunk = self.__chunks.pop(id_)
        chunk.delete()

    def insert_chunk(self, id_):
        """Insert a new chunk at the end of the IFF file"""
        fileobj = self.__fileobj

        # Write an empty chunk header, then parse it back as a chunk.
        fileobj.seek(self.__next_offset)
        fileobj.write(pack('>4si', id_.ljust(4), 0))
        fileobj.seek(self.__next_offset)
        chunk = IFFChunk(fileobj, self['FORM'])

        # The FORM chunk grows by the size of the new chunk.
        self['FORM'].resize(self['FORM'].data_size + chunk.size)

        self.__chunks[id_] = chunk
        self.__next_offset = chunk.offset + chunk.size
class AIFFInfo(StreamInfo):
    """AIFF audio stream information.

    Information is parsed from the COMM chunk of the AIFF file

    Useful attributes:

    * length -- audio length, in seconds
    * bitrate -- audio bitrate, in bits per second
    * channels -- The number of audio channels
    * sample_rate -- audio sample rate, in Hz
    * sample_size -- The audio sample size
    """

    length = 0
    bitrate = 0
    channels = 0
    sample_rate = 0

    def __init__(self, fileobj):
        """Parse stream information from an open AIFF file object.

        Raises error if the file has no COMM chunk or the COMM chunk is
        shorter than the 18 bytes that are decoded here.
        """
        iff = IFFFile(fileobj)
        try:
            common_chunk = iff['COMM']
        except KeyError as e:
            raise error(str(e))

        common_chunk.read()
        # A truncated chunk would otherwise escape as a raw struct.error.
        if len(common_chunk.data) < 18:
            raise error("COMM chunk is too short")

        info = struct.unpack('>hLh10s', common_chunk.data[:18])
        channels, frame_count, sample_size, sample_rate = info

        self.sample_rate = int(read_float(sample_rate))
        self.sample_size = sample_size
        self.channels = channels
        self.bitrate = channels * sample_size * self.sample_rate
        # A zero rate cannot yield a duration; keep the class default of 0
        # instead of failing with ZeroDivisionError.
        if self.sample_rate > 0:
            self.length = frame_count / float(self.sample_rate)

    def pprint(self):
        """Return a one-line human-readable summary of the stream."""
        return "%d channel AIFF @ %d bps, %s Hz, %.2f seconds" % (
            self.channels, self.bitrate, self.sample_rate, self.length)
class _IFFID3(ID3):
    """An AIFF file with ID3v2 tags"""

    def _load_header(self):
        """Seek to the payload of the 'ID3' chunk, then load the header.

        Raises ID3Error if the file has no usable 'ID3' chunk.
        """
        try:
            self._fileobj.seek(IFFFile(self._fileobj)['ID3'].data_offset)
        except (InvalidChunk, KeyError):
            raise ID3Error()
        super(_IFFID3, self)._load_header()

    def save(self, filename=None, v2_version=4, v23_sep='/'):
        """Save ID3v2 data to the AIFF file"""
        framedata = self._prepare_framedata(v2_version, v23_sep)
        framesize = len(framedata)

        if filename is None:
            filename = self.filename

        # Unlike the parent ID3.save method, we won't save to a blank file
        # since we would have to construct a empty AIFF file
        #
        # The IFF structure is parsed inside the `with` block so the
        # handle is closed even when the file is not a valid IFF file.
        with open(filename, 'rb+') as fileobj:
            iff_file = IFFFile(fileobj)

            if 'ID3' not in iff_file:
                iff_file.insert_chunk('ID3')

            chunk = iff_file['ID3']
            fileobj.seek(chunk.data_offset)

            header = fileobj.read(10)
            header = self._prepare_id3_header(header, framesize, v2_version)
            header, new_size, _ = header

            data = header + framedata + (b'\x00' * (new_size - framesize))

            # Include ID3 header size in 'new_size' calculation
            new_size += 10

            # Expand the chunk if necessary, including pad byte
            if new_size > chunk.size:
                insert_at = chunk.offset + chunk.size
                insert_size = new_size - chunk.size + new_size % 2
                insert_bytes(fileobj, insert_size, insert_at)
                chunk.resize(new_size)

            fileobj.seek(chunk.data_offset)
            fileobj.write(data)

    def delete(self, filename=None):
        """Completely removes the ID3 chunk from the AIFF file"""
        if filename is None:
            filename = self.filename
        # Resolves to the module-level delete() function.
        delete(filename)
        self.clear()
def delete(filename):
    """Completely removes the ID3 chunk from the AIFF file"""
    with open(filename, "rb+") as fileobj:
        try:
            iff = IFFFile(fileobj)
            del iff['ID3']
        except KeyError:
            # No ID3 chunk in the file: nothing to remove.
            pass
class AIFF(FileType):
    """An AIFF audio file.

    :ivar info: :class:`AIFFInfo`
    :ivar tags: :class:`ID3`
    """

    _mimes = ["audio/aiff", "audio/x-aiff"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score how likely the file is AIFF.

        A header starting with b"FORM" adds 2 and a matching file
        extension (.aif, .aiff, .aifc) adds 1.
        """
        filename = filename.lower()

        return (header.startswith(b"FORM") * 2 + endswith(filename, b".aif") +
                endswith(filename, b".aiff") + endswith(filename, b".aifc"))

    def add_tags(self):
        """Add an empty ID3 tag to the file."""
        if self.tags is None:
            self.tags = _IFFID3()
        else:
            raise error("an ID3 tag already exists")

    def load(self, filename, **kwargs):
        """Load stream and tag information from a file."""
        self.filename = filename

        try:
            self.tags = _IFFID3(filename, **kwargs)
        except ID3Error:
            self.tags = None

        # open() is the context expression, so a failed open raises its
        # own IOError instead of hitting an unbound `fileobj` in cleanup.
        with open(filename, "rb") as fileobj:
            self.info = AIFFInfo(fileobj)


Open = AIFF
@@ -0,0 +1,627 @@
# An APEv2 tag reader
#
# Copyright 2005 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""APEv2 reading and writing.
The APEv2 format is most commonly used with Musepack files, but is
also the format of choice for WavPack and other formats. Some MP3s
also have APEv2 tags, but this can cause problems with many MP3
decoders and taggers.
APEv2 tags, like Vorbis comments, are freeform key=value pairs. APEv2
keys can be any ASCII string with characters from 0x20 to 0x7E,
between 2 and 255 characters long. Keys are case-sensitive, but
readers are recommended to be case insensitive, and it is forbidden to
have multiple keys which differ only in case. Keys are usually stored
title-cased (e.g. 'Artist' rather than 'artist').
APEv2 values are slightly more structured than Vorbis comments; values
are flagged as one of text, binary, or an external reference (usually
a URI).
Based off the format specification found at
http://wiki.hydrogenaudio.org/index.php?title=APEv2_specification.
"""
__all__ = ["APEv2", "APEv2File", "Open", "delete"]
import sys
import struct
from ._compat import cBytesIO, PY3, text_type, PY2, reraise, swap_to_string
from mutagen import Metadata, FileType, StreamInfo
from mutagen._util import DictMixin, cdata, delete_bytes, total_ordering
def is_valid_apev2_key(key):
    """Return whether *key* is usable as an APEv2 item key.

    A valid key is 2 to 255 characters long, uses only characters from
    ' ' to '~', and is not one of "OggS", "TAG", "ID3" or "MP+".
    In Python 3 a non-str key raises TypeError.
    """
    if PY3 and not isinstance(key, text_type):
        raise TypeError("Keys have to be str")

    if not 2 <= len(key) <= 255:
        return False
    if min(key) < ' ' or max(key) > '~':
        return False
    return key not in ["OggS", "TAG", "ID3", "MP+"]
# There are three different kinds of APE tag values.
# "0: Item contains text information coded in UTF-8
# 1: Item contains binary information
# 2: Item is a locator of external stored information [e.g. URL]
# 3: reserved"
# The names below are bound to 0, 1 and 2 respectively; __parse_tag
# compares them with the kind taken from bits 1-2 of an item's flags.
TEXT, BINARY, EXTERNAL = range(3)
# Bit masks for the flags word of a tag header/footer. save() writes
# HAS_HEADER into both the header and the footer and adds IS_HEADER only
# to the header; __fill_missing tests HAS_HEADER to locate the header.
# HAS_NO_FOOTER is not referenced in the code shown here.
HAS_HEADER = 1 << 31
HAS_NO_FOOTER = 1 << 30
IS_HEADER = 1 << 29
class error(IOError):
    """Base class for APEv2 errors."""


class APENoHeaderError(error, ValueError):
    """Raised when no APE tag could be found."""


class APEUnsupportedVersionError(error, ValueError):
    """Error type for an unsupported APE tag version."""


class APEBadItemError(error, ValueError):
    """Raised for an item with a reserved value type or an invalid key."""
class _APEv2Data(object):
# Store offsets of the important parts of the file.
start = header = data = footer = end = None
# Footer or header; seek here and read 32 to get version/size/items/flags
metadata = None
# Actual tag data
tag = None
version = None
size = None
items = None
flags = 0
# The tag is at the start rather than the end. A tag at both
# the start and end of the file (i.e. the tag is the whole file)
# is not considered to be at the start.
is_at_start = False
def __init__(self, fileobj):
self.__find_metadata(fileobj)
if self.header is None:
self.metadata = self.footer
elif self.footer is None:
self.metadata = self.header
else:
self.metadata = max(self.header, self.footer)
if self.metadata is None:
return
self.__fill_missing(fileobj)
self.__fix_brokenness(fileobj)
if self.data is not None:
fileobj.seek(self.data)
self.tag = fileobj.read(self.size)
def __find_metadata(self, fileobj):
# Try to find a header or footer.
# Check for a simple footer.
try:
fileobj.seek(-32, 2)
except IOError:
fileobj.seek(0, 2)
return
if fileobj.read(8) == b"APETAGEX":
fileobj.seek(-8, 1)
self.footer = self.metadata = fileobj.tell()
return
# Check for an APEv2 tag followed by an ID3v1 tag at the end.
try:
fileobj.seek(-128, 2)
if fileobj.read(3) == b"TAG":
fileobj.seek(-35, 1) # "TAG" + header length
if fileobj.read(8) == b"APETAGEX":
fileobj.seek(-8, 1)
self.footer = fileobj.tell()
return
# ID3v1 tag at the end, maybe preceded by Lyrics3v2.
# (http://www.id3.org/lyrics3200.html)
# (header length - "APETAGEX") - "LYRICS200"
fileobj.seek(15, 1)
if fileobj.read(9) == b'LYRICS200':
fileobj.seek(-15, 1) # "LYRICS200" + size tag
try:
offset = int(fileobj.read(6))
except ValueError:
raise IOError
fileobj.seek(-32 - offset - 6, 1)
if fileobj.read(8) == b"APETAGEX":
fileobj.seek(-8, 1)
self.footer = fileobj.tell()
return
except IOError:
pass
# Check for a tag at the start.
fileobj.seek(0, 0)
if fileobj.read(8) == b"APETAGEX":
self.is_at_start = True
self.header = 0
def __fill_missing(self, fileobj):
fileobj.seek(self.metadata + 8)
self.version = fileobj.read(4)
self.size = cdata.uint_le(fileobj.read(4))
self.items = cdata.uint_le(fileobj.read(4))
self.flags = cdata.uint_le(fileobj.read(4))
if self.header is not None:
self.data = self.header + 32
# If we're reading the header, the size is the header
# offset + the size, which includes the footer.
self.end = self.data + self.size
fileobj.seek(self.end - 32, 0)
if fileobj.read(8) == b"APETAGEX":
self.footer = self.end - 32
elif self.footer is not None:
self.end = self.footer + 32
self.data = self.end - self.size
if self.flags & HAS_HEADER:
self.header = self.data - 32
else:
self.header = self.data
else:
raise APENoHeaderError("No APE tag found")
# exclude the footer from size
if self.footer is not None:
self.size -= 32
def __fix_brokenness(self, fileobj):
# Fix broken tags written with PyMusepack.
if self.header is not None:
start = self.header
else:
start = self.data
fileobj.seek(start)
while start > 0:
# Clean up broken writing from pre-Mutagen PyMusepack.
# It didn't remove the first 24 bytes of header.
try:
fileobj.seek(-24, 1)
except IOError:
break
else:
if fileobj.read(8) == b"APETAGEX":
fileobj.seek(-8, 1)
start = fileobj.tell()
else:
break
self.start = start
class _CIDictProxy(DictMixin):
    """Dictionary proxy with case-insensitive, case-preserving keys."""

    def __init__(self, *args, **kwargs):
        self.__casemap = {}
        self.__dict = {}
        super(_CIDictProxy, self).__init__(*args, **kwargs)
        # Internally all names are stored as lowercase, but the case
        # they were set with is remembered and used when saving. This
        # is roughly in line with the standard, which says that keys
        # are case-sensitive but two keys differing only in case are
        # not allowed, and recommends case-insensitive
        # implementations.

    def __getitem__(self, key):
        folded = key.lower()
        return self.__dict[folded]

    def __setitem__(self, key, value):
        folded = key.lower()
        # Remember the spelling used by the most recent assignment.
        self.__casemap[folded] = key
        self.__dict[folded] = value

    def __delitem__(self, key):
        folded = key.lower()
        del self.__casemap[folded]
        del self.__dict[folded]

    def keys(self):
        # Report each key in the case it was last set with.
        original_case = self.__casemap
        return [original_case.get(key, key) for key in self.__dict.keys()]
class APEv2(_CIDictProxy, Metadata):
    """A file with an APEv2 tag.

    ID3v1 tags are silently ignored and overwritten.
    """

    filename = None

    def pprint(self):
        """Return tag key=value pairs in a human-readable format."""
        items = sorted(self.items())
        return u"\n".join([u"%s=%s" % (k, v.pprint()) for k, v in items])

    def load(self, filename):
        """Load tags from a filename.

        Raises APENoHeaderError if the file contains no tag data.
        """
        self.filename = filename

        fileobj = open(filename, "rb")
        try:
            data = _APEv2Data(fileobj)
        finally:
            fileobj.close()

        if data.tag:
            self.clear()
            self.__parse_tag(data.tag, data.items)
        else:
            raise APENoHeaderError("No APE tag found")

    def __parse_tag(self, tag, count):
        # Decode up to `count` items from the raw tag bytes into self.
        fileobj = cBytesIO(tag)

        for i in range(count):
            size_data = fileobj.read(4)
            # someone writes wrong item counts
            if not size_data:
                break
            size = cdata.uint_le(size_data)
            flags = cdata.uint_le(fileobj.read(4))

            # Bits 1 and 2 bits are flags, 0-3
            # Bit 0 is read/write flag, ignored
            kind = (flags & 6) >> 1
            if kind == 3:
                raise APEBadItemError("value type must be 0, 1, or 2")

            # The key is read byte by byte up to its NUL terminator
            # (or the end of the data).
            key = value = fileobj.read(1)
            while key[-1:] != b'\x00' and value:
                value = fileobj.read(1)
                key += value
            if key[-1:] == b"\x00":
                key = key[:-1]
            if PY3:
                try:
                    key = key.decode("ascii")
                except UnicodeError as err:
                    reraise(APEBadItemError, err, sys.exc_info()[2])
            value = fileobj.read(size)

            if kind == TEXT:
                value = APETextValue(value, kind)
            elif kind == BINARY:
                value = APEBinaryValue(value, kind)
            elif kind == EXTERNAL:
                value = APEExtValue(value, kind)

            self[key] = value

    def __getitem__(self, key):
        if not is_valid_apev2_key(key):
            raise KeyError("%r is not a valid APEv2 key" % key)
        if PY2:
            key = key.encode('ascii')

        return super(APEv2, self).__getitem__(key)

    def __delitem__(self, key):
        if not is_valid_apev2_key(key):
            raise KeyError("%r is not a valid APEv2 key" % key)
        if PY2:
            key = key.encode('ascii')

        super(APEv2, self).__delitem__(key)

    def __setitem__(self, key, value):
        """'Magic' value setter.

        This function tries to guess at what kind of value you want to
        store. If you pass in a valid UTF-8 or Unicode string, it
        treats it as a text value. If you pass in a list, it treats it
        as a list of string/Unicode values.  If you pass in a string
        that is not valid UTF-8, it assumes it is a binary value.

        Python 3: all bytes will be assumed to be a byte value, even
        if they are valid utf-8.

        If you need to force a specific type of value (e.g. binary
        data that also happens to be valid UTF-8, or an external
        reference), use the APEValue factory and set the value to the
        result of that::

            from mutagen.apev2 import APEValue, EXTERNAL
            tag['Website'] = APEValue('http://example.org', EXTERNAL)
        """
        if not is_valid_apev2_key(key):
            raise KeyError("%r is not a valid APEv2 key" % key)

        if PY2:
            key = key.encode('ascii')

        if not isinstance(value, _APEValue):
            # let's guess at the content if we're not already a value...
            if isinstance(value, text_type):
                # unicode? we've got to be text.
                value = APEValue(value, TEXT)
            elif isinstance(value, list):
                items = []
                for v in value:
                    if not isinstance(v, text_type):
                        if PY3:
                            raise TypeError("item in list not str")
                        v = v.decode("utf-8")
                    items.append(v)

                # list? text.
                value = APEValue(u"\0".join(items), TEXT)
            else:
                if PY3:
                    value = APEValue(value, BINARY)
                else:
                    try:
                        value.decode("utf-8")
                    except UnicodeError:
                        # invalid UTF8 text, probably binary
                        value = APEValue(value, BINARY)
                    else:
                        # valid UTF8, probably text
                        value = APEValue(value, TEXT)

        super(APEv2, self).__setitem__(key, value)

    def save(self, filename=None):
        """Save changes to a file.

        If no filename is given, the one most recently loaded is used.

        Tags are always written at the end of the file, and include
        a header and a footer.
        """
        filename = filename or self.filename
        try:
            fileobj = open(filename, "r+b")
        except IOError:
            fileobj = open(filename, "w+b")

        # Everything after the open runs under try/finally so the handle
        # is closed even if parsing, deleting or writing raises.
        try:
            data = _APEv2Data(fileobj)

            if data.is_at_start:
                delete_bytes(fileobj, data.end - data.start, data.start)
            elif data.start is not None:
                fileobj.seek(data.start)
                # Delete an ID3v1 tag if present, too.
                fileobj.truncate()
            fileobj.seek(0, 2)

            # "APE tags items should be sorted ascending by size... This is
            # not a MUST, but STRONGLY recommended. Actually the items should
            # be sorted by importance/byte, but this is not feasible."
            tags = [v._internal(k) for k, v in self.items()]
            tags.sort(key=len)
            num_tags = len(tags)
            tags = b"".join(tags)

            header = bytearray(b"APETAGEX")
            # version, tag size, item count, flags
            header += struct.pack("<4I", 2000, len(tags) + 32, num_tags,
                                  HAS_HEADER | IS_HEADER)
            header += b"\0" * 8
            fileobj.write(header)

            fileobj.write(tags)

            footer = bytearray(b"APETAGEX")
            footer += struct.pack("<4I", 2000, len(tags) + 32, num_tags,
                                  HAS_HEADER)
            footer += b"\0" * 8

            fileobj.write(footer)
        finally:
            fileobj.close()

    def delete(self, filename=None):
        """Remove tags from a file."""
        filename = filename or self.filename
        fileobj = open(filename, "r+b")
        try:
            data = _APEv2Data(fileobj)
            if data.start is not None and data.size is not None:
                delete_bytes(fileobj, data.end - data.start, data.start)
        finally:
            fileobj.close()
        self.clear()


Open = APEv2
def delete(filename):
    """Remove tags from a file."""
    try:
        tag = APEv2(filename)
        tag.delete()
    except APENoHeaderError:
        # The file has no APE tag, so there is nothing to remove.
        pass
def APEValue(value, kind):
    """APEv2 tag value factory.

    Use this if you need to specify the value's type manually.  Binary
    and text data are automatically detected by APEv2.__setitem__.
    """
    if kind in (TEXT, EXTERNAL):
        if isinstance(value, text_type):
            # The value classes store UTF-8 encoded bytes.
            value = value.encode("utf-8")
        elif PY3:
            # stricter with py3
            raise TypeError("str only for text/external values")

    if kind == TEXT:
        return APETextValue(value, kind)
    if kind == BINARY:
        return APEBinaryValue(value, kind)
    if kind == EXTERNAL:
        return APEExtValue(value, kind)
    raise ValueError("kind must be TEXT, BINARY, or EXTERNAL")
@swap_to_string
@total_ordering
class _APEValue(object):
    """Base class of APEv2 values: raw bytes plus a value kind."""

    def __init__(self, value, kind):
        if not isinstance(value, bytes):
            raise TypeError("value not bytes")
        self.kind = kind
        self.value = value

    def __len__(self):
        return len(self.value)

    def __bytes__(self):
        return self.value

    def __eq__(self, other):
        raw = bytes(self)
        return raw == other

    def __lt__(self, other):
        raw = bytes(self)
        return raw < other

    # Packed format for an item:
    # 4B: Value length
    # 4B: Value type
    # Key name
    # 1B: Null
    # Key value
    def _internal(self, key):
        if not isinstance(key, bytes):
            key = key.encode("utf-8")
        packed = bytearray(
            struct.pack("<2I", len(self.value), self.kind << 1))
        for part in (key, b"\0", self.value):
            packed += part
        return bytes(packed)

    def __repr__(self):
        cls_name = type(self).__name__
        return "%s(%r, %d)" % (cls_name, self.value, self.kind)
@swap_to_string
@total_ordering
class _APEUtf8Value(_APEValue):
    """APEv2 value whose bytes payload is UTF-8 encoded text."""

    def __str__(self):
        decoded = self.value.decode("utf-8")
        return decoded

    def __eq__(self, other):
        # Compare on the decoded text rather than on the raw bytes.
        return text_type(self) == other

    def __lt__(self, other):
        return text_type(self) < other
class APETextValue(_APEUtf8Value):
    """An APEv2 text value.

    The payload is UTF-8 text in which individual strings are separated
    by a null character. It can be used as one string or as a sequence
    of strings.
    """

    def __iter__(self):
        """Iterate over the strings of the value (not the characters)"""
        parts = text_type(self).split(u"\0")
        return iter(parts)

    def __getitem__(self, index):
        parts = text_type(self).split(u"\0")
        return parts[index]

    def __len__(self):
        # One more string than there are null separators.
        return self.value.count(b"\0") + 1

    __hash__ = _APEValue.__hash__

    def __setitem__(self, index, value):
        """Replace the string at ``index``; bytes are rejected on Python 3."""
        if not isinstance(value, text_type):
            if PY3:
                raise TypeError("value not str")
            value = value.decode("utf-8")

        parts = list(self)
        parts[index] = value
        joined = u"\0".join(parts)
        self.value = joined.encode("utf-8")

    def pprint(self):
        return u" / ".join(self)
class APEBinaryValue(_APEValue):
    """An APEv2 binary value."""

    def pprint(self):
        """Return a short placeholder giving the payload length."""
        size = len(self)
        return u"[%d bytes]" % size
class APEExtValue(_APEUtf8Value):
    """An APEv2 external value.

    External values are usually URI or IRI strings.
    """

    def pprint(self):
        """Return the decoded text prefixed with an ``[External]`` marker."""
        text = text_type(self)
        return u"[External] %s" % text
class APEv2File(FileType):
    """A file of unknown audio format that carries an APEv2 tag."""

    class _Info(StreamInfo):
        """Placeholder stream info; nothing is read from the file."""

        length = 0
        bitrate = 0

        def __init__(self, fileobj):
            pass

        @staticmethod
        def pprint():
            return u"Unknown format with APEv2 tag."

    def load(self, filename):
        """Load stream info and the APEv2 tag from ``filename``.

        ``self.tags`` is set to None when reading the tag raises ``error``.
        """
        self.filename = filename
        # Close the handle once _Info has consumed it; the previous code
        # opened the file inline and never closed it.
        with open(filename, "rb") as fileobj:
            self.info = self._Info(fileobj)
        try:
            self.tags = APEv2(filename)
        except error:
            self.tags = None

    def add_tags(self):
        """Attach an empty APEv2 tag; raise ValueError if one already exists."""
        if self.tags is None:
            self.tags = APEv2()
        else:
            raise ValueError("%r already has tags: %r" % (self, self.tags))

    @staticmethod
    def score(filename, fileobj, header):
        """Score the file for format detection.

        Looks for the APETAGEX marker in the last 160 bytes (or in the
        whole file when it is shorter) and subtracts one when ``header``
        starts with an ID3 marker.
        """
        try:
            fileobj.seek(-160, 2)
        except IOError:
            # File shorter than 160 bytes: scan all of it instead.
            fileobj.seek(0)
        footer = fileobj.read()
        return ((b"APETAGEX" in footer) - header.startswith(b"ID3"))
@@ -0,0 +1,751 @@
# Copyright 2006-2007 Lukas Lalinsky
# Copyright 2005-2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write ASF (Windows Media Audio) files."""
__all__ = ["ASF", "Open"]
import struct
from mutagen import FileType, Metadata, StreamInfo
from mutagen._util import insert_bytes, delete_bytes, DictMixin, total_ordering
from ._compat import swap_to_string, text_type, PY2, string_types
class error(IOError):
    """Base class for the ASF exceptions declared in this module."""
class ASFError(error):
    """ASF-specific error; a direct subclass of ``error``."""
class ASFHeaderError(error):
    """Raised when the ASF header is missing or not recognised."""
class ASFInfo(StreamInfo):
    """ASF stream information."""

    def __init__(self):
        # Every field starts at zero; the header object parsers fill them in.
        self.length = 0.0
        self.sample_rate = 0
        self.bitrate = 0
        self.channels = 0

    def pprint(self):
        """Return a one-line summary of bitrate, rate, channels and length."""
        template = ("Windows Media Audio %d bps, %s Hz, %d channels, "
                    "%.2f seconds")
        fields = (self.bitrate, self.sample_rate, self.channels, self.length)
        return template % fields
class ASFTags(list, DictMixin, Metadata):
    """Dictionary containing ASF attributes.

    The data is held as a list of ``(key, value)`` tuples, so one key can
    carry several values.
    """

    def pprint(self):
        """Return one ``key=value`` line per stored pair."""
        return "\n".join(["%s=%s" % (k, v) for k, v in self])

    def __getitem__(self, key):
        """A list of values for the key.

        This is a copy, so comment['title'].append('a title') will not
        work.
        """
        values = [value for (k, value) in self if k == key]
        if not values:
            raise KeyError(key)
        return values

    def __delitem__(self, key):
        """Delete all values associated with the key."""
        to_delete = [pair for pair in self if pair[0] == key]
        if not to_delete:
            raise KeyError(key)
        for pair in to_delete:
            self.remove(pair)

    def __contains__(self, key):
        """Return true if the key has any values."""
        return any(k == key for k, value in self)

    def __setitem__(self, key, values):
        """Set a key's value or values.

        Setting a value overwrites all old ones. The value may be a
        list of Unicode or UTF-8 strings, or a single Unicode or UTF-8
        string.
        """
        if not isinstance(values, list):
            values = [values]

        try:
            del(self[key])
        except KeyError:
            pass

        for value in values:
            if key in _standard_attribute_names:
                value = text_type(value)
            elif not isinstance(value, ASFBaseAttribute):
                if isinstance(value, string_types):
                    if PY2 or isinstance(value, text_type):
                        value = ASFUnicodeAttribute(value)
                elif isinstance(value, bool):
                    value = ASFBoolAttribute(value)
                elif isinstance(value, int):
                    value = ASFDWordAttribute(value)
                # ``long`` only exists on Python 2. Guarding on PY2 keeps
                # Python 3 from raising NameError for values of any other
                # type; those are stored unwrapped, as on Python 2.
                elif PY2 and isinstance(value, long):  # noqa: F821
                    value = ASFQWordAttribute(value)
            self.append((key, value))

    def keys(self):
        """Return all keys in the comment."""
        if not self:
            # Matches the historical behaviour: an empty tag returns itself.
            return self
        return set(next(iter(zip(*self))))

    def as_dict(self):
        """Return a copy of the comment data in a real dict."""
        d = {}
        for key, value in self:
            d.setdefault(key, []).append(value)
        return d
class ASFBaseAttribute(object):
    """Generic attribute.

    Subclasses provide ``TYPE``, ``parse``, ``_render`` and ``data_size``.
    """

    TYPE = None

    def __init__(self, value=None, data=None, language=None,
                 stream=None, **kwargs):
        """Build from a ready ``value`` or by parsing raw ``data``.

        Truthy ``data`` wins over ``value``; extra keyword arguments are
        forwarded to ``parse``.
        """
        self.language = language
        self.stream = stream
        self.value = self.parse(data, **kwargs) if data else value

    def data_size(self):
        raise NotImplementedError

    def __repr__(self):
        pieces = ["%s(%r" % (type(self).__name__, self.value)]
        if self.language:
            pieces.append(", language=%d" % self.language)
        if self.stream:
            pieces.append(", stream=%d" % self.stream)
        pieces.append(")")
        return "".join(pieces)

    def render(self, name):
        """Serialise as: name length, name, type, data length, data."""
        encoded_name = name.encode("utf-16-le") + b"\x00\x00"
        payload = self._render()
        head = struct.pack("<H", len(encoded_name))
        meta = struct.pack("<HH", self.TYPE, len(payload))
        return head + encoded_name + meta + payload

    def render_m(self, name):
        """Serialise with a fixed-size prefix whose first field is zero."""
        encoded_name = name.encode("utf-16-le") + b"\x00\x00"
        # Type 2 is rendered with dword=False in this layout.
        payload = (self._render(dword=False) if self.TYPE == 2
                   else self._render())
        prefix = struct.pack("<HHHHI", 0, self.stream or 0,
                             len(encoded_name), self.TYPE, len(payload))
        return prefix + encoded_name + payload

    def render_ml(self, name):
        """Serialise like ``render_m`` but with the language in field one."""
        encoded_name = name.encode("utf-16-le") + b"\x00\x00"
        payload = (self._render(dword=False) if self.TYPE == 2
                   else self._render())
        prefix = struct.pack("<HHHHI", self.language or 0, self.stream or 0,
                             len(encoded_name), self.TYPE, len(payload))
        return prefix + encoded_name + payload
@swap_to_string
@total_ordering
class ASFUnicodeAttribute(ASFBaseAttribute):
    """Unicode string attribute."""

    TYPE = 0x0000

    def parse(self, data):
        """Decode UTF-16-LE ``data`` and strip NUL characters from both ends."""
        text = data.decode("utf-16-le")
        return text.strip("\x00")

    def _render(self):
        # The stored form is UTF-16-LE followed by a two-byte terminator.
        return self.value.encode("utf-16-le") + b"\x00\x00"

    def data_size(self):
        rendered = self._render()
        return len(rendered)

    def __bytes__(self):
        return self.value.encode("utf-16-le")

    def __str__(self):
        return self.value

    def __eq__(self, other):
        return text_type(self) == other

    def __lt__(self, other):
        return text_type(self) < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFByteArrayAttribute(ASFBaseAttribute):
    """Byte array attribute."""

    TYPE = 0x0001

    def parse(self, data):
        """Return ``data`` unchanged; it must already be bytes."""
        assert isinstance(data, bytes)
        return data

    def _render(self):
        assert isinstance(self.value, bytes)
        return self.value

    def data_size(self):
        return len(self.value)

    def __bytes__(self):
        # __bytes__ must return bytes; returning the text placeholder made
        # bytes(attr) raise TypeError on Python 3. The placeholder is pure
        # ASCII, so encoding leaves the Python 2 str() output unchanged.
        return ("[binary data (%s bytes)]" % len(self.value)).encode("ascii")

    def __eq__(self, other):
        return self.value == other

    def __lt__(self, other):
        return self.value < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFBoolAttribute(ASFBaseAttribute):
    """Bool attribute."""

    TYPE = 0x0002

    def parse(self, data, dword=True):
        """Decode a flag stored in 4 bytes (``dword``) or 2 bytes."""
        if dword:
            return struct.unpack("<I", data)[0] == 1
        else:
            return struct.unpack("<H", data)[0] == 1

    def _render(self, dword=True):
        """Encode the flag in 4 bytes (``dword``) or 2 bytes."""
        if dword:
            return struct.pack("<I", int(self.value))
        else:
            return struct.pack("<H", int(self.value))

    def data_size(self):
        return 4

    def __bool__(self):
        return bool(self.value)

    def __bytes__(self):
        # __bytes__ must return bytes; returning the raw bool made
        # bytes(attr) (and str(attr) on Python 2) raise TypeError.
        return text_type(self.value).encode("utf-8")

    def __eq__(self, other):
        return bool(self.value) == other

    def __lt__(self, other):
        return bool(self.value) < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFDWordAttribute(ASFBaseAttribute):
    """DWORD attribute."""

    TYPE = 0x0003

    def parse(self, data):
        """Decode a little-endian unsigned 32-bit integer."""
        return struct.unpack("<L", data)[0]

    def _render(self):
        return struct.pack("<L", self.value)

    def data_size(self):
        return 4

    def __int__(self):
        return self.value

    def __bytes__(self):
        # __bytes__ must return bytes; returning the raw int made
        # bytes(attr) (and str(attr) on Python 2) raise TypeError.
        return text_type(self.value).encode("utf-8")

    def __eq__(self, other):
        return int(self.value) == other

    def __lt__(self, other):
        return int(self.value) < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFQWordAttribute(ASFBaseAttribute):
    """QWORD attribute."""

    TYPE = 0x0004

    def parse(self, data):
        """Decode a little-endian unsigned 64-bit integer."""
        return struct.unpack("<Q", data)[0]

    def _render(self):
        return struct.pack("<Q", self.value)

    def data_size(self):
        return 8

    def __int__(self):
        return self.value

    def __bytes__(self):
        # __bytes__ must return bytes; returning the raw int made
        # bytes(attr) (and str(attr) on Python 2) raise TypeError.
        return text_type(self.value).encode("utf-8")

    def __eq__(self, other):
        return int(self.value) == other

    def __lt__(self, other):
        return int(self.value) < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFWordAttribute(ASFBaseAttribute):
    """WORD attribute."""

    TYPE = 0x0005

    def parse(self, data):
        """Decode a little-endian unsigned 16-bit integer."""
        return struct.unpack("<H", data)[0]

    def _render(self):
        return struct.pack("<H", self.value)

    def data_size(self):
        return 2

    def __int__(self):
        return self.value

    def __bytes__(self):
        # __bytes__ must return bytes; returning the raw int made
        # bytes(attr) (and str(attr) on Python 2) raise TypeError.
        return text_type(self.value).encode("utf-8")

    def __eq__(self, other):
        return int(self.value) == other

    def __lt__(self, other):
        return int(self.value) < other

    __hash__ = ASFBaseAttribute.__hash__
@swap_to_string
@total_ordering
class ASFGUIDAttribute(ASFBaseAttribute):
    """GUID attribute."""

    TYPE = 0x0006

    def parse(self, data):
        """Return ``data`` untouched after asserting that it is bytes."""
        assert isinstance(data, bytes)
        return data

    def _render(self):
        raw = self.value
        assert isinstance(raw, bytes)
        return raw

    def data_size(self):
        raw = self.value
        return len(raw)

    def __bytes__(self):
        return self.value

    def __eq__(self, other):
        return self.value == other

    def __lt__(self, other):
        return self.value < other

    __hash__ = ASFBaseAttribute.__hash__
# Module-level aliases for the numeric TYPE code of each attribute class.
UNICODE = ASFUnicodeAttribute.TYPE
BYTEARRAY = ASFByteArrayAttribute.TYPE
BOOL = ASFBoolAttribute.TYPE
DWORD = ASFDWordAttribute.TYPE
QWORD = ASFQWordAttribute.TYPE
WORD = ASFWordAttribute.TYPE
GUID = ASFGUIDAttribute.TYPE
def ASFValue(value, kind, **kwargs):
    """Create the attribute object whose TYPE code equals ``kind``.

    value -- value handed to the attribute class
    kind -- one of the numeric type codes (UNICODE, BYTEARRAY, ...)
    kwargs -- forwarded to the attribute constructor

    Raises ValueError when no attribute class has that type code.
    """
    matches = [attr_cls for type_code, attr_cls in _attribute_types.items()
               if kind == type_code]
    if not matches:
        raise ValueError("Unknown value type")
    return matches[0](value=value, **kwargs)
# Maps each numeric TYPE code to the attribute class that handles it.
# ASFValue and the object parsers below use it to pick a class.
_attribute_types = {
    ASFUnicodeAttribute.TYPE: ASFUnicodeAttribute,
    ASFByteArrayAttribute.TYPE: ASFByteArrayAttribute,
    ASFBoolAttribute.TYPE: ASFBoolAttribute,
    ASFDWordAttribute.TYPE: ASFDWordAttribute,
    ASFQWordAttribute.TYPE: ASFQWordAttribute,
    ASFWordAttribute.TYPE: ASFWordAttribute,
    ASFGUIDAttribute.TYPE: ASFGUIDAttribute,
}
# Keys stored in the content description object rather than as typed
# attributes. The order matters: ContentDescriptionObject.render writes
# the five text fields in exactly this sequence.
_standard_attribute_names = [
    "Title",
    "Author",
    "Copyright",
    "Description",
    "Rating"
]
class BaseObject(object):
    """Base ASF object."""

    GUID = None

    def parse(self, asf, data, fileobj, size):
        """Keep the raw payload so that ``render`` can emit it unchanged."""
        self.data = data

    def render(self, asf):
        """Return GUID + total size (payload plus 24 header bytes) + payload."""
        total_size = len(self.data) + 24
        return self.GUID + struct.pack("<Q", total_size) + self.data
class UnknownObject(BaseObject):
    """Unknown ASF object."""

    def __init__(self, guid):
        """Remember the unrecognised ``guid`` (bytes) on the instance."""
        assert isinstance(guid, bytes)
        self.GUID = guid
class HeaderObject(object):
    """ASF header.

    Only holds the GUID that a file must start with to be read as ASF.
    """

    GUID = b"\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"
class ContentDescriptionObject(BaseObject):
    """Content description."""

    GUID = b"\x33\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"

    def parse(self, asf, data, fileobj, size):
        """Read the five text fields and copy the non-empty ones to asf.tags."""
        super(ContentDescriptionObject, self).parse(asf, data, fileobj, size)
        asf.content_description_obj = self

        # Five 16-bit lengths come first, then the five strings back to back.
        field_lengths = struct.unpack("<HHHHH", data[:10])
        decoded = []
        offset = 10
        for field_length in field_lengths:
            stop = offset + field_length
            if field_length > 0:
                raw = data[offset:stop]
                decoded.append(raw.decode("utf-16-le").strip("\x00"))
            else:
                decoded.append(None)
            offset = stop

        title, author, copyright, desc, rating = decoded
        fields = dict(
            Title=title,
            Author=author,
            Copyright=copyright,
            Description=desc,
            Rating=rating
        )
        for key, value in fields.items():
            if value is not None:
                asf.tags[key] = value

    def render(self, asf):
        """Serialise the standard text fields taken from asf.tags."""
        chunks = []
        for name in _standard_attribute_names:
            stored = asf.tags.get(name, [])
            if stored:
                # Only the first value of each standard key is written.
                chunks.append(stored[0].encode("utf-16-le") + b"\x00\x00")
            else:
                chunks.append(b"")

        sizes = [len(chunk) for chunk in chunks]
        data = struct.pack("<HHHHH", *sizes) + b"".join(chunks)
        return self.GUID + struct.pack("<Q", 24 + len(data)) + data
class ExtendedContentDescriptionObject(BaseObject):
    """Extended content description."""

    GUID = b"\x40\xA4\xD0\xD2\x07\xE3\xD2\x11\x97\xF0\x00\xA0\xC9\x5E\xA8\x50"

    def parse(self, asf, data, fileobj, size):
        """Decode every name/value record and append it to asf.tags."""
        super(ExtendedContentDescriptionObject, self).parse(
            asf, data, fileobj, size)
        asf.extended_content_description_obj = self

        count, = struct.unpack_from("<H", data, 0)
        cursor = 2
        for _ in range(count):
            # Record layout: name length, name, value type, value length,
            # value.
            name_length, = struct.unpack_from("<H", data, cursor)
            cursor += 2
            raw_name = data[cursor:cursor + name_length]
            name = raw_name.decode("utf-16-le").strip("\x00")
            cursor += name_length
            value_type, value_length = struct.unpack_from("<HH", data, cursor)
            cursor += 4
            raw_value = data[cursor:cursor + value_length]
            cursor += value_length
            attr_cls = _attribute_types[value_type]
            asf.tags.append((name, attr_cls(data=raw_value)))

    def render(self, asf):
        """Serialise the attributes that save() routed to this object."""
        attrs = asf.to_extended_content_description.items()
        body = b"".join([attr.render(name) for (name, attr) in attrs])
        header = struct.pack("<QH", 26 + len(body), len(attrs))
        return self.GUID + header + body
class FilePropertiesObject(BaseObject):
    """File properties."""

    GUID = b"\xA1\xDC\xAB\x8C\x47\xA9\xCF\x11\x8E\xE4\x00\xC0\x0C\x20\x53\x65"

    def parse(self, asf, data, fileobj, size):
        """Compute asf.info.length from the duration and preroll fields."""
        super(FilePropertiesObject, self).parse(asf, data, fileobj, size)
        # Three 64-bit fields at offset 40; the middle one is not used.
        length, _, preroll = struct.unpack_from("<QQQ", data, 40)
        # NOTE(review): the divisors suggest 100 ns ticks and milliseconds
        # respectively - confirm against the ASF specification.
        duration = length / 10000000.0
        lead_in = preroll / 1000.0
        asf.info.length = duration - lead_in
class StreamPropertiesObject(BaseObject):
    """Stream properties."""

    GUID = b"\x91\x07\xDC\xB7\xB7\xA9\xCF\x11\x8E\xE6\x00\xC0\x0C\x20\x53\x65"

    def parse(self, asf, data, fileobj, size):
        """Fill channels, sample rate and bitrate on asf.info."""
        super(StreamPropertiesObject, self).parse(asf, data, fileobj, size)
        fields = struct.unpack_from("<HII", data, 56)
        info = asf.info
        info.channels = fields[0]
        info.sample_rate = fields[1]
        # The stored figure is multiplied by 8 to give the reported bitrate.
        info.bitrate = fields[2] * 8
class HeaderExtensionObject(BaseObject):
    """Header extension."""

    GUID = b"\xb5\x03\xbf_.\xa9\xcf\x11\x8e\xe3\x00\xc0\x0c Se"

    def parse(self, asf, data, fileobj, size):
        """Parse the objects nested inside the header extension.

        Raises ASFHeaderError when a nested object declares a size of
        zero, which would otherwise keep the loop from ever advancing.
        """
        super(HeaderExtensionObject, self).parse(asf, data, fileobj, size)
        asf.header_extension_obj = self
        datasize, = struct.unpack("<I", data[18:22])
        datapos = 0
        self.objects = []
        while datapos < datasize:
            # Nested objects start 22 bytes in; each has a 24-byte header.
            guid, size = struct.unpack(
                "<16sQ", data[22 + datapos:22 + datapos + 24])
            if size < 1:
                raise ASFHeaderError("invalid size in header extension")
            if guid in _object_types:
                obj = _object_types[guid]()
            else:
                obj = UnknownObject(guid)
            obj.parse(asf, data[22 + datapos + 24:22 + datapos + size],
                      fileobj, size)
            self.objects.append(obj)
            datapos += size

    def render(self, asf):
        """Serialise the nested objects behind the fixed extension header."""
        data = b"".join([obj.render(asf) for obj in self.objects])
        return (self.GUID + struct.pack("<Q", 24 + 16 + 6 + len(data)) +
                b"\x11\xD2\xD3\xAB\xBA\xA9\xcf\x11" +
                b"\x8E\xE6\x00\xC0\x0C\x20\x53\x65" +
                b"\x06\x00" + struct.pack("<I", len(data)) + data)
class MetadataObject(BaseObject):
    """Metadata description."""

    GUID = b"\xea\xcb\xf8\xc5\xaf[wH\x84g\xaa\x8cD\xfaL\xca"

    def parse(self, asf, data, fileobj, size):
        """Decode every stream-scoped record and append it to asf.tags."""
        super(MetadataObject, self).parse(asf, data, fileobj, size)
        asf.metadata_obj = self

        count, = struct.unpack_from("<H", data, 0)
        cursor = 2
        for _ in range(count):
            # 12-byte record header, then the name, then the value.
            (reserved, stream, name_length, value_type,
             value_length) = struct.unpack_from("<HHHHI", data, cursor)
            cursor += 12
            raw_name = data[cursor:cursor + name_length]
            name = raw_name.decode("utf-16-le").strip("\x00")
            cursor += name_length
            raw_value = data[cursor:cursor + value_length]
            cursor += value_length

            kwargs = {'data': raw_value, 'stream': stream}
            if value_type == 2:
                # Type 2 values are parsed with dword=False in this object.
                kwargs['dword'] = False
            attr_cls = _attribute_types[value_type]
            asf.tags.append((name, attr_cls(**kwargs)))

    def render(self, asf):
        """Serialise the attributes that save() routed to this object."""
        attrs = asf.to_metadata.items()
        body = b"".join([attr.render_m(name) for (name, attr) in attrs])
        header = struct.pack("<QH", 26 + len(body), len(attrs))
        return self.GUID + header + body
class MetadataLibraryObject(BaseObject):
    """Metadata library description."""

    GUID = b"\x94\x1c#D\x98\x94\xd1I\xa1A\x1d\x13NEpT"

    def parse(self, asf, data, fileobj, size):
        """Decode every language/stream record and append it to asf.tags."""
        super(MetadataLibraryObject, self).parse(asf, data, fileobj, size)
        asf.metadata_library_obj = self

        count, = struct.unpack_from("<H", data, 0)
        cursor = 2
        for _ in range(count):
            # 12-byte record header, then the name, then the value.
            (language, stream, name_length, value_type,
             value_length) = struct.unpack_from("<HHHHI", data, cursor)
            cursor += 12
            raw_name = data[cursor:cursor + name_length]
            name = raw_name.decode("utf-16-le").strip("\x00")
            cursor += name_length
            raw_value = data[cursor:cursor + value_length]
            cursor += value_length

            kwargs = {'data': raw_value, 'language': language,
                      'stream': stream}
            if value_type == 2:
                # Type 2 values are parsed with dword=False in this object.
                kwargs['dword'] = False
            attr_cls = _attribute_types[value_type]
            asf.tags.append((name, attr_cls(**kwargs)))

    def render(self, asf):
        """Serialise the (name, attribute) pairs save() routed here."""
        attrs = asf.to_metadata_library
        body = b"".join([attr.render_ml(name) for (name, attr) in attrs])
        header = struct.pack("<QH", 26 + len(body), len(attrs))
        return self.GUID + header + body
# Maps an object GUID to the class that parses it. GUIDs missing from
# this table are wrapped in UnknownObject by the readers.
_object_types = {
    ExtendedContentDescriptionObject.GUID: ExtendedContentDescriptionObject,
    ContentDescriptionObject.GUID: ContentDescriptionObject,
    FilePropertiesObject.GUID: FilePropertiesObject,
    StreamPropertiesObject.GUID: StreamPropertiesObject,
    HeaderExtensionObject.GUID: HeaderExtensionObject,
    MetadataLibraryObject.GUID: MetadataLibraryObject,
    MetadataObject.GUID: MetadataObject,
}
class ASF(FileType):
    """An ASF file, probably containing WMA or WMV."""

    _mimes = ["audio/x-ms-wma", "audio/x-ms-wmv", "video/x-ms-asf",
              "audio/x-wma", "video/x-wmv"]

    def load(self, filename):
        """Read the header objects, stream info and tags from ``filename``."""
        self.filename = filename
        with open(filename, "rb") as fileobj:
            self.size = 0
            self.size1 = 0
            self.size2 = 0
            self.offset1 = 0
            self.offset2 = 0
            self.num_objects = 0
            self.info = ASFInfo()
            self.tags = ASFTags()
            self.__read_file(fileobj)

    def save(self):
        """Rebuild the header from the current tags and write it in place."""
        # Move attributes to the right objects
        self.to_extended_content_description = {}
        self.to_metadata = {}
        self.to_metadata_library = []
        for name, value in self.tags:
            if name in _standard_attribute_names:
                # Standard keys are written by the content description.
                continue
            library_only = (value.data_size() > 0xFFFF or value.TYPE == GUID)
            unscoped = value.language is None and not library_only
            if (unscoped and value.stream is None and
                    name not in self.to_extended_content_description):
                self.to_extended_content_description[name] = value
            elif (unscoped and value.stream is not None and
                    name not in self.to_metadata):
                self.to_metadata[name] = value
            else:
                # Everything else, including repeated names, goes here.
                self.to_metadata_library.append((name, value))

        # Add missing objects
        if not self.content_description_obj:
            self.content_description_obj = ContentDescriptionObject()
            self.objects.append(self.content_description_obj)
        if not self.extended_content_description_obj:
            self.extended_content_description_obj = \
                ExtendedContentDescriptionObject()
            self.objects.append(self.extended_content_description_obj)
        if not self.header_extension_obj:
            self.header_extension_obj = HeaderExtensionObject()
            self.objects.append(self.header_extension_obj)
        if not self.metadata_obj:
            self.metadata_obj = MetadataObject()
            self.header_extension_obj.objects.append(self.metadata_obj)
        if not self.metadata_library_obj:
            self.metadata_library_obj = MetadataLibraryObject()
            self.header_extension_obj.objects.append(self.metadata_library_obj)

        # Render the header
        rendered = b"".join([obj.render(self) for obj in self.objects])
        data = (HeaderObject.GUID +
                struct.pack("<QL", len(rendered) + 30, len(self.objects)) +
                b"\x01\x02" + rendered)

        with open(self.filename, "rb+") as fileobj:
            size = len(data)
            # Grow or shrink the file so the new header fits exactly.
            if size > self.size:
                insert_bytes(fileobj, size - self.size, self.size)
            if size < self.size:
                delete_bytes(fileobj, self.size - size, 0)
            fileobj.seek(0)
            fileobj.write(data)

        self.size = size
        self.num_objects = len(self.objects)

    def __read_file(self, fileobj):
        """Validate the 30-byte file header, then read each header object."""
        header = fileobj.read(30)
        if len(header) != 30 or header[:16] != HeaderObject.GUID:
            raise ASFHeaderError("Not an ASF file.")

        self.extended_content_description_obj = None
        self.content_description_obj = None
        self.header_extension_obj = None
        self.metadata_obj = None
        self.metadata_library_obj = None

        self.size, self.num_objects = struct.unpack("<QL", header[16:28])
        self.objects = []
        for _ in range(self.num_objects):
            self.__read_object(fileobj)

    def __read_object(self, fileobj):
        """Read one object (24-byte header plus payload) and parse it."""
        guid, size = struct.unpack("<16sQ", fileobj.read(24))
        obj_cls = _object_types.get(guid)
        obj = obj_cls() if guid in _object_types else UnknownObject(guid)
        payload = fileobj.read(size - 24)
        obj.parse(self, payload, fileobj, size)
        self.objects.append(obj)

    @staticmethod
    def score(filename, fileobj, header):
        """Return 2 when ``header`` starts with the ASF GUID, otherwise 0."""
        return header.startswith(HeaderObject.GUID) * 2
# Module-level alias for the ASF class.
Open = ASF
@@ -0,0 +1,509 @@
# Simpler (but far more limited) API for ID3 editing
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""Easier access to ID3 tags.
EasyID3 is a wrapper around mutagen.id3.ID3 to make ID3 tags appear
more like Vorbis or APEv2 tags.
"""
import mutagen.id3
from ._compat import iteritems, text_type, PY2
from mutagen import Metadata
from mutagen._util import DictMixin, dict_match
from mutagen.id3 import ID3, error, delete, ID3FileType
__all__ = ['EasyID3', 'Open', 'delete']
class EasyID3KeyError(KeyError, ValueError, error):
    """Raised when an invalid key is read, written or deleted.

    It derives from both KeyError and ValueError for API compatibility;
    catching KeyError is the preferred form.
    """
class EasyID3(DictMixin, Metadata):
    """A file with an ID3 tag.

    Like Vorbis comments, EasyID3 keys are case-insensitive ASCII
    strings. Only a subset of ID3 frames are supported by default. Use
    EasyID3.RegisterKey and its wrappers to support more.

    You can also set the GetFallback, SetFallback, and DeleteFallback
    to generic key getter/setter/deleter functions, which are called
    if no specific handler is registered for a key. Additionally,
    ListFallback can be used to supply an arbitrary list of extra
    keys. These can be set on EasyID3 or on individual instances after
    creation.

    To use an EasyID3 class with mutagen.mp3.MP3::

        from mutagen.mp3 import EasyMP3 as MP3
        MP3(filename)

    Because many of the attributes are constructed on the fly, things
    like the following will not work::

        ezid3["performer"].append("Joe")

    Instead, you must do::

        values = ezid3["performer"]
        values.append("Joe")
        ezid3["performer"] = values

    """

    # Handler registries, keyed by lower-cased key name or glob pattern.
    Set = {}
    Get = {}
    Delete = {}
    List = {}

    # For compatibility.
    valid_keys = Get

    GetFallback = None
    SetFallback = None
    DeleteFallback = None
    ListFallback = None

    @classmethod
    def RegisterKey(cls, key,
                    getter=None, setter=None, deleter=None, lister=None):
        """Register a new key mapping.

        A key mapping is four functions, a getter, setter, deleter,
        and lister. The key may be either a string or a glob pattern.

        The getter, deleter, and lister receive an ID3 instance and
        the requested key name. The setter also receives the desired
        value, which will be a list of strings.

        The getter, setter, and deleter are used to implement __getitem__,
        __setitem__, and __delitem__.

        The lister is used to implement keys(). It should return a
        list of keys that are actually in the ID3 instance, provided
        by its associated getter.
        """
        key = key.lower()
        if getter is not None:
            cls.Get[key] = getter
        if setter is not None:
            cls.Set[key] = setter
        if deleter is not None:
            cls.Delete[key] = deleter
        if lister is not None:
            cls.List[key] = lister

    @classmethod
    def RegisterTextKey(cls, key, frameid):
        """Register a text key.

        If the key you need to register is a simple one-to-one mapping
        of ID3 frame name to EasyID3 key, then you can use this
        function::

            EasyID3.RegisterTextKey("title", "TIT2")
        """
        def getter(id3, key):
            return list(id3[frameid])

        def setter(id3, key, value):
            try:
                frame = id3[frameid]
            except KeyError:
                id3.add(mutagen.id3.Frames[frameid](encoding=3, text=value))
            else:
                frame.encoding = 3
                frame.text = value

        def deleter(id3, key):
            del(id3[frameid])

        cls.RegisterKey(key, getter, setter, deleter)

    @classmethod
    def RegisterTXXXKey(cls, key, desc):
        """Register a user-defined text frame key.

        Some ID3 tags are stored in TXXX frames, which allow a
        freeform 'description' which acts as a subkey,
        e.g. TXXX:BARCODE.::

            EasyID3.RegisterTXXXKey('barcode', 'BARCODE').
        """
        frameid = "TXXX:" + desc

        def getter(id3, key):
            return list(id3[frameid])

        def setter(id3, key, value):
            try:
                frame = id3[frameid]
            except KeyError:
                enc = 0
                # Store 8859-1 if we can, per MusicBrainz spec.
                for v in value:
                    if v and max(v) > u'\x7f':
                        enc = 3
                id3.add(mutagen.id3.TXXX(encoding=enc, text=value, desc=desc))
            else:
                frame.text = value

        def deleter(id3, key):
            del(id3[frameid])

        cls.RegisterKey(key, getter, setter, deleter)

    def __init__(self, filename=None):
        """Wrap a fresh ID3 object and load ``filename`` when one is given."""
        self.__id3 = ID3()
        if filename is not None:
            self.load(filename)

    # These properties forward to the wrapped ID3 object.
    load = property(lambda s: s.__id3.load,
                    lambda s, v: setattr(s.__id3, 'load', v))

    save = property(lambda s: s.__id3.save,
                    lambda s, v: setattr(s.__id3, 'save', v))

    delete = property(lambda s: s.__id3.delete,
                      lambda s, v: setattr(s.__id3, 'delete', v))

    filename = property(lambda s: s.__id3.filename,
                        lambda s, fn: setattr(s.__id3, 'filename', fn))

    # The setter used to store the EasyID3 instance itself instead of the
    # value being assigned.
    size = property(lambda s: s.__id3.size,
                    lambda s, fn: setattr(s.__id3, 'size', fn))

    def __getitem__(self, key):
        """Return the values for ``key`` through its registered getter."""
        key = key.lower()
        func = dict_match(self.Get, key, self.GetFallback)
        if func is not None:
            return func(self.__id3, key)
        else:
            raise EasyID3KeyError("%r is not a valid key" % key)

    def __setitem__(self, key, value):
        """Store ``value`` (a string or a list of strings) under ``key``."""
        key = key.lower()
        # A single string is wrapped in a list before it reaches the setter.
        if PY2:
            if isinstance(value, basestring):  # noqa: F821
                value = [value]
        else:
            if isinstance(value, text_type):
                value = [value]
        func = dict_match(self.Set, key, self.SetFallback)
        if func is not None:
            return func(self.__id3, key, value)
        else:
            raise EasyID3KeyError("%r is not a valid key" % key)

    def __delitem__(self, key):
        """Remove ``key`` through its registered deleter."""
        key = key.lower()
        func = dict_match(self.Delete, key, self.DeleteFallback)
        if func is not None:
            return func(self.__id3, key)
        else:
            raise EasyID3KeyError("%r is not a valid key" % key)

    def keys(self):
        """Return the keys that currently have values in the tag."""
        keys = []
        for key in self.Get.keys():
            if key in self.List:
                # Pattern keys expand to concrete keys through their lister.
                keys.extend(self.List[key](self.__id3, key))
            elif key in self:
                keys.append(key)
        if self.ListFallback is not None:
            keys.extend(self.ListFallback(self.__id3, ""))
        return keys

    def pprint(self):
        """Print tag key=value pairs."""
        strings = []
        for key in sorted(self.keys()):
            values = self[key]
            for value in values:
                strings.append("%s=%s" % (key, value))
        return "\n".join(strings)
# Module-level alias for the EasyID3 class.
Open = EasyID3
def genre_get(id3, key):
    """Return the ``genres`` of the TCON frame (KeyError when absent)."""
    frame = id3["TCON"]
    return frame.genres
def genre_set(id3, key, value):
    """Set the genres, creating a UTF-8 TCON frame when none exists."""
    if "TCON" not in id3:
        id3.add(mutagen.id3.TCON(encoding=3, text=value))
        return
    existing = id3["TCON"]
    existing.encoding = 3
    existing.genres = value
def genre_delete(id3, key):
    """Drop the TCON frame from the tag (KeyError when absent)."""
    del id3["TCON"]
def date_get(id3, key):
    """Return the ``text`` of every timestamp held in the TDRC frame."""
    stamps = id3["TDRC"].text
    return [stamp.text for stamp in stamps]
def date_set(id3, key, value):
    """Add a UTF-8 encoded TDRC frame carrying ``value``."""
    frame = mutagen.id3.TDRC(encoding=3, text=value)
    id3.add(frame)
def date_delete(id3, key):
    """Drop the TDRC frame from the tag (KeyError when absent)."""
    del id3["TDRC"]
def performer_get(id3, key):
    """Return the people credited with the role named in ``key``.

    ``key`` has the form ``performer:<role>``. KeyError is raised when
    there is no TMCL frame or nobody holds that role.
    """
    wanted_role = key.split(":", 1)[1]
    try:
        mcl = id3["TMCL"]
    except KeyError:
        raise KeyError(key)
    matches = [person for role, person in mcl.people if role == wanted_role]
    if not matches:
        raise KeyError(key)
    return matches
def performer_set(id3, key, value):
    """Replace the people credited with the role named in ``key``.

    Credits for other roles are kept; the new entries go at the end.
    """
    wanted_role = key.split(":", 1)[1]
    try:
        mcl = id3["TMCL"]
    except KeyError:
        mcl = mutagen.id3.TMCL(encoding=3, people=[])
        id3.add(mcl)
    mcl.encoding = 3
    kept = [entry for entry in mcl.people if entry[0] != wanted_role]
    kept.extend((wanted_role, person) for person in value)
    mcl.people = kept
def performer_delete(id3, key):
    """Remove every credit for the role named in ``key``.

    The TMCL frame itself is deleted when no credits remain. KeyError is
    raised when the frame is missing or nobody holds the role.
    """
    wanted_role = key.split(":", 1)[1]
    try:
        mcl = id3["TMCL"]
    except KeyError:
        raise KeyError(key)
    remaining = [entry for entry in mcl.people if entry[0] != wanted_role]
    if remaining == mcl.people:
        # Nothing was filtered out, so the role was not present.
        raise KeyError(key)
    if remaining:
        mcl.people = remaining
    else:
        del id3["TMCL"]
def performer_list(id3, key):
    """Return one ``performer:<role>`` key per distinct role in TMCL."""
    if "TMCL" not in id3:
        return []
    roles = set("performer:" + entry[0] for entry in id3["TMCL"].people)
    return list(roles)
def musicbrainz_trackid_get(id3, key):
    """Return the MusicBrainz UFID data decoded as ASCII, in a list."""
    frame = id3["UFID:http://musicbrainz.org"]
    return [frame.data.decode('ascii')]
def musicbrainz_trackid_set(id3, key, value):
    """Store the single track ID in the MusicBrainz UFID frame.

    Raises ValueError unless ``value`` holds exactly one entry.
    """
    if len(value) != 1:
        raise ValueError("only one track ID may be set per song")
    encoded = value[0].encode('ascii')
    if "UFID:http://musicbrainz.org" in id3:
        id3["UFID:http://musicbrainz.org"].data = encoded
    else:
        id3.add(mutagen.id3.UFID(owner="http://musicbrainz.org",
                                 data=encoded))
def musicbrainz_trackid_delete(id3, key):
    """Drop the MusicBrainz UFID frame (KeyError when absent)."""
    del id3["UFID:http://musicbrainz.org"]
def website_get(id3, key):
    """Return the URL of every WOAR frame.

    Raises EasyID3KeyError when the tag has no WOAR frames.
    """
    urls = [frame.url for frame in id3.getall("WOAR")]
    if not urls:
        raise EasyID3KeyError(key)
    return urls
def website_set(id3, key, value):
    """Replace all WOAR frames with one frame per URL in ``value``."""
    id3.delall("WOAR")
    for url in value:
        frame = mutagen.id3.WOAR(url=url)
        id3.add(frame)
def website_delete(id3, key):
    """Remove every WOAR frame from the tag."""
    frame_id = "WOAR"
    id3.delall(frame_id)
def gain_get(id3, key):
    """Return the gain of the matching RVA2 frame formatted as ``"%+f dB"``.

    The frame description is ``key`` without its first 11 and last 5
    characters (the ``replaygain_`` prefix and ``_gain`` suffix).
    Raises EasyID3KeyError when no such frame exists.
    """
    frame_key = "RVA2:" + key[11:-5]
    try:
        frame = id3[frame_key]
    except KeyError:
        raise EasyID3KeyError(key)
    return [u"%+f dB" % frame.gain]
def gain_set(id3, key, value):
    """Set the gain of the RVA2 frame selected by ``key``.

    value -- list holding exactly one string such as ``"-3.5 dB"``; only
    the first whitespace-separated token is parsed as a float.

    A frame with zero gain and peak is created when none exists.
    Raises ValueError when ``value`` does not hold exactly one entry.
    """
    if len(value) != 1:
        # The message used to be passed as two arguments, so the %r
        # placeholder was never filled in.
        raise ValueError(
            "there must be exactly one gain value, not %r." % (value,))
    gain = float(value[0].split()[0])
    desc = key[11:-5]
    try:
        frame = id3["RVA2:" + desc]
    except KeyError:
        frame = mutagen.id3.RVA2(desc=desc, gain=0, peak=0, channel=1)
        id3.add(frame)
    frame.gain = gain
def gain_delete(id3, key):
    """Clear the gain of the RVA2 frame selected by ``key``.

    When the frame still carries a peak only the gain is zeroed;
    otherwise the frame is removed. A missing frame is ignored.
    """
    frame_key = "RVA2:" + key[11:-5]
    if frame_key not in id3:
        return
    frame = id3[frame_key]
    if frame.peak:
        frame.gain = 0.0
    else:
        del id3[frame_key]
def peak_get(id3, key):
    """Return the peak of the matching RVA2 frame formatted with ``"%f"``.

    Raises EasyID3KeyError when no such frame exists.
    """
    frame_key = "RVA2:" + key[11:-5]
    try:
        frame = id3[frame_key]
    except KeyError:
        raise EasyID3KeyError(key)
    return [u"%f" % frame.peak]
def peak_set(id3, key, value):
    """Set the peak of the RVA2 frame selected by ``key``.

    value -- list holding exactly one string parseable as a float in
    the range ``0 <= peak < 2``.

    A frame with zero gain and peak is created when none exists.
    Raises ValueError for a wrong number of values or an out-of-range
    peak.
    """
    if len(value) != 1:
        # The message used to be passed as two arguments, so the %r
        # placeholder was never filled in.
        raise ValueError(
            "there must be exactly one peak value, not %r." % (value,))
    peak = float(value[0])
    if peak >= 2 or peak < 0:
        raise ValueError("peak must be => 0 and < 2.")
    desc = key[11:-5]
    try:
        frame = id3["RVA2:" + desc]
    except KeyError:
        frame = mutagen.id3.RVA2(desc=desc, gain=0, peak=0, channel=1)
        id3.add(frame)
    frame.peak = peak
def peak_delete(id3, key):
    """Clear the peak of the RVA2 frame selected by ``key``.

    When the frame still carries a gain only the peak is zeroed;
    otherwise the frame is removed. A missing frame is ignored.
    """
    frame_key = "RVA2:" + key[11:-5]
    if frame_key not in id3:
        return
    frame = id3[frame_key]
    if frame.gain:
        frame.peak = 0.0
    else:
        del id3[frame_key]
def peakgain_list(id3, key):
    """Return the gain key and the peak key for every RVA2 frame."""
    found = []
    for frame in id3.getall("RVA2"):
        found.extend(("replaygain_%s_gain" % frame.desc,
                      "replaygain_%s_peak" % frame.desc))
    return found
# Register the keys that map one-to-one onto a single ID3 text frame.
# The table is keyed by frame ID; each value is the EasyID3 key name.
for frameid, key in iteritems({
    "TALB": "album",
    "TBPM": "bpm",
    "TCMP": "compilation",  # iTunes extension
    "TCOM": "composer",
    "TCOP": "copyright",
    "TENC": "encodedby",
    "TEXT": "lyricist",
    "TLEN": "length",
    "TMED": "media",
    "TMOO": "mood",
    "TIT2": "title",
    "TIT3": "version",
    "TPE1": "artist",
    "TPE2": "performer",
    "TPE3": "conductor",
    "TPE4": "arranger",
    "TPOS": "discnumber",
    "TPUB": "organization",
    "TRCK": "tracknumber",
    "TOLY": "author",
    "TSO2": "albumartistsort",  # iTunes extension
    "TSOA": "albumsort",
    "TSOC": "composersort",  # iTunes extension
    "TSOP": "artistsort",
    "TSOT": "titlesort",
    "TSRC": "isrc",
    "TSST": "discsubtitle",
}):
    EasyID3.RegisterTextKey(key, frameid)
# Keys that need dedicated handlers rather than a plain text-frame mapping.
EasyID3.RegisterKey("genre", genre_get, genre_set, genre_delete)
EasyID3.RegisterKey("date", date_get, date_set, date_delete)
EasyID3.RegisterKey(
    "performer:*", performer_get, performer_set, performer_delete,
    performer_list)
EasyID3.RegisterKey("musicbrainz_trackid", musicbrainz_trackid_get,
                    musicbrainz_trackid_set, musicbrainz_trackid_delete)
# "website" was registered twice with identical handlers; once is enough.
EasyID3.RegisterKey("website", website_get, website_set, website_delete)
# The gain pattern carries the lister, which reports both the gain and
# the peak key of every RVA2 frame.
EasyID3.RegisterKey(
    "replaygain_*_gain", gain_get, gain_set, gain_delete, peakgain_list)
EasyID3.RegisterKey("replaygain_*_peak", peak_get, peak_set, peak_delete)
# At various times, information for this came from
# http://musicbrainz.org/docs/specs/metadata_tags.html
# http://bugs.musicbrainz.org/ticket/1383
# http://musicbrainz.org/doc/MusicBrainzTag
#
# Register the keys stored in user-defined TXXX frames. The table is
# keyed by the TXXX description; each value is the EasyID3 key name.
# Note that "albumartistsort" was already registered for the TSO2 frame
# in the text-key table; this later registration replaces those handlers.
for desc, key in iteritems({
    u"MusicBrainz Artist Id": "musicbrainz_artistid",
    u"MusicBrainz Album Id": "musicbrainz_albumid",
    u"MusicBrainz Album Artist Id": "musicbrainz_albumartistid",
    u"MusicBrainz TRM Id": "musicbrainz_trmid",
    u"MusicIP PUID": "musicip_puid",
    u"MusicMagic Fingerprint": "musicip_fingerprint",
    u"MusicBrainz Album Status": "musicbrainz_albumstatus",
    u"MusicBrainz Album Type": "musicbrainz_albumtype",
    u"MusicBrainz Album Release Country": "releasecountry",
    u"MusicBrainz Disc Id": "musicbrainz_discid",
    u"ASIN": "asin",
    u"ALBUMARTISTSORT": "albumartistsort",
    u"BARCODE": "barcode",
}):
    EasyID3.RegisterTXXXKey(key, desc)
class EasyID3FileType(ID3FileType):
    """An ID3FileType whose tag class is EasyID3."""

    # Tag class used in place of the one ID3FileType provides.
    ID3 = EasyID3
@@ -0,0 +1,274 @@
# Copyright 2009 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
from mutagen import Metadata
from mutagen._util import DictMixin, dict_match, utf8
from mutagen.mp4 import MP4, MP4Tags, error, delete
from ._compat import PY2, text_type
__all__ = ["EasyMP4Tags", "EasyMP4", "delete", "error"]
class EasyMP4KeyError(error, KeyError, ValueError):
    """Raised by EasyMP4Tags when a key has no registered handler."""
class EasyMP4Tags(DictMixin, Metadata):
    """Dictionary-style access to MPEG-4 iTunes metadata.

    Like Vorbis comments, EasyMP4Tags keys are case-insensitive ASCII
    strings, and values are a list of Unicode strings (and these lists
    are always of length 0 or 1).

    If you need access to the full MP4 metadata feature set, you should use
    MP4, not EasyMP4.
    """

    # Class-level registries mapping a lower-cased key (or glob pattern)
    # to the function implementing the corresponding operation.
    Set = {}
    Get = {}
    Delete = {}
    List = {}

    def __init__(self, *args, **kwargs):
        """Wrap a new MP4Tags built from the given arguments."""
        wrapped = MP4Tags(*args, **kwargs)
        self.__mp4 = wrapped
        # File operations are delegated straight to the wrapped tags.
        self.load = wrapped.load
        self.save = wrapped.save
        self.delete = wrapped.delete

    @property
    def filename(self):
        """Filename of the wrapped MP4Tags object."""
        return self.__mp4.filename

    @filename.setter
    def filename(self, fn):
        self.__mp4.filename = fn

    @classmethod
    def RegisterKey(cls, key,
                    getter=None, setter=None, deleter=None, lister=None):
        """Register a new key mapping.

        A key mapping is four functions, a getter, setter, deleter,
        and lister. The key may be either a string or a glob pattern.

        The getter, deleter, and lister receive an MP4Tags instance
        and the requested key name. The setter also receives the
        desired value, which will be a list of strings.

        The getter, setter, and deleter are used to implement __getitem__,
        __setitem__, and __delitem__.

        The lister is used to implement keys(). It should return a
        list of keys that are actually in the MP4 instance, provided
        by its associated getter.

        Functions passed as None leave the matching registry untouched.
        """
        name = key.lower()
        registrations = (
            (cls.Get, getter),
            (cls.Set, setter),
            (cls.Delete, deleter),
            (cls.List, lister),
        )
        for registry, func in registrations:
            if func is not None:
                registry[name] = func

    @classmethod
    def RegisterTextKey(cls, key, atomid):
        """Register a text key.

        If the key you need to register is a simple one-to-one mapping
        of MP4 atom name to EasyMP4Tags key, then you can use this
        function::

            EasyMP4Tags.RegisterTextKey("artist", "\xa9ART")
        """

        def read_atom(tags, key):
            return tags[atomid]

        def write_atom(tags, key, value):
            tags[atomid] = value

        def remove_atom(tags, key):
            del tags[atomid]

        cls.RegisterKey(key, read_atom, write_atom, remove_atom)

    @classmethod
    def RegisterIntKey(cls, key, atomid, min_value=0, max_value=2**16-1):
        """Register a scalar integer key.

        Values are exposed as text; on assignment each value is converted
        with int() and clamped to [min_value, max_value].
        """

        def clamp(number):
            return int(min(max(min_value, number), max_value))

        def read_atom(tags, key):
            return [text_type(item) for item in tags[atomid]]

        def write_atom(tags, key, value):
            tags[atomid] = [clamp(int(item)) for item in value]

        def remove_atom(tags, key):
            del tags[atomid]

        cls.RegisterKey(key, read_atom, write_atom, remove_atom)

    @classmethod
    def RegisterIntPairKey(cls, key, atomid, min_value=0, max_value=2**16-1):
        """Register a key stored as (number, total) integer pairs.

        Pairs are exposed as "number/total" text, or just "number" when
        the total is falsy. On assignment a value without a usable
        "number/total" form is parsed as a single integer and min_value
        is stored as its total. Both parts are clamped to
        [min_value, max_value].
        """

        def clamp(number):
            return int(min(max(min_value, number), max_value))

        def read_atom(tags, key):
            return [
                u"%d/%d" % (track, total) if total else text_type(track)
                for (track, total) in tags[atomid]
            ]

        def write_atom(tags, key, value):
            pairs = []
            for item in value:
                try:
                    number, total = item.split("/")
                    number = clamp(int(number))
                    total = clamp(int(total))
                except (ValueError, TypeError):
                    # Not a "number/total" pair; treat it as a bare number.
                    number = clamp(int(item))
                    total = min_value
                pairs.append((number, total))
            tags[atomid] = pairs

        def remove_atom(tags, key):
            del tags[atomid]

        cls.RegisterKey(key, read_atom, write_atom, remove_atom)

    @classmethod
    def RegisterFreeformKey(cls, key, name, mean=b"com.apple.iTunes"):
        """Register a text key.

        If the key you need to register is a simple one-to-one mapping
        of MP4 freeform atom (----) and name to EasyMP4Tags key, then
        you can use this function::

            EasyMP4Tags.RegisterFreeformKey(
                "musicbrainz_artistid", b"MusicBrainz Artist Id")

        The name and mean are concatenated with byte strings, so both
        must be bytes.
        """
        atomid = b"----:" + mean + b":" + name

        def read_atom(tags, key):
            return [raw.decode("utf-8", "replace") for raw in tags[atomid]]

        def write_atom(tags, key, value):
            tags[atomid] = [utf8(item) for item in value]

        def remove_atom(tags, key):
            del tags[atomid]

        cls.RegisterKey(key, read_atom, write_atom, remove_atom)

    def __getitem__(self, key):
        """Return the value for key via its registered getter."""
        key = key.lower()
        handler = dict_match(self.Get, key)
        if handler is None:
            raise EasyMP4KeyError("%r is not a valid key" % key)
        return handler(self.__mp4, key)

    def __setitem__(self, key, value):
        """Set key via its registered setter; a lone string is wrapped
        into a one-element list first."""
        key = key.lower()
        # basestring only exists (and is only evaluated) on Python 2.
        string_type = basestring if PY2 else text_type
        if isinstance(value, string_type):
            value = [value]
        handler = dict_match(self.Set, key)
        if handler is None:
            raise EasyMP4KeyError("%r is not a valid key" % key)
        return handler(self.__mp4, key, value)

    def __delitem__(self, key):
        """Delete key via its registered deleter."""
        key = key.lower()
        handler = dict_match(self.Delete, key)
        if handler is None:
            raise EasyMP4KeyError("%r is not a valid key" % key)
        return handler(self.__mp4, key)

    def keys(self):
        """Return the registered keys that are present in the tags.

        Keys with a lister contribute whatever the lister returns; the
        others are included when a lookup through this object succeeds.
        """
        present = []
        for name in self.Get:
            if name in self.List:
                present.extend(self.List[name](self.__mp4, name))
            elif name in self:
                present.append(name)
        return present

    def pprint(self):
        """Print tag key=value pairs."""
        return "\n".join(
            "%s=%s" % (name, value)
            for name in sorted(self.keys())
            for value in self[name])
# Plain text atoms exposed under an easy key name.
for atomid, key in {
        b'\xa9nam': 'title',
        b'\xa9alb': 'album',
        b'\xa9ART': 'artist',
        b'aART': 'albumartist',
        b'\xa9day': 'date',
        b'\xa9cmt': 'comment',
        b'desc': 'description',
        b'\xa9grp': 'grouping',
        b'\xa9gen': 'genre',
        b'cprt': 'copyright',
        b'soal': 'albumsort',
        b'soaa': 'albumartistsort',
        b'soar': 'artistsort',
        b'sonm': 'titlesort',
        b'soco': 'composersort',
        }.items():
    EasyMP4Tags.RegisterTextKey(key, atomid)

# Freeform (----) atoms, registered with the default "mean".
for name, key in {
        b'MusicBrainz Artist Id': 'musicbrainz_artistid',
        b'MusicBrainz Track Id': 'musicbrainz_trackid',
        b'MusicBrainz Album Id': 'musicbrainz_albumid',
        b'MusicBrainz Album Artist Id': 'musicbrainz_albumartistid',
        b'MusicIP PUID': 'musicip_puid',
        b'MusicBrainz Album Status': 'musicbrainz_albumstatus',
        b'MusicBrainz Album Type': 'musicbrainz_albumtype',
        b'MusicBrainz Release Country': 'releasecountry',
        }.items():
    EasyMP4Tags.RegisterFreeformKey(key, name)

# Scalar integer atoms.
for name, key in {
        b"tmpo": "bpm",
        }.items():
    EasyMP4Tags.RegisterIntKey(key, name)

# (number, total) integer pair atoms.
for name, key in {
        b"trkn": "tracknumber",
        b"disk": "discnumber",
        }.items():
    EasyMP4Tags.RegisterIntPairKey(key, name)
class EasyMP4(MP4):
    """Like :class:`MP4 <mutagen.mp4.MP4>`,
    but uses :class:`EasyMP4Tags` for tags.

    :ivar info: :class:`MP4Info <mutagen.mp4.MP4Info>`
    :ivar tags: :class:`EasyMP4Tags`
    """

    # Tag class used in place of the one MP4 provides.
    MP4Tags = EasyMP4Tags

    # Registration helpers re-exported from EasyMP4Tags.
    RegisterKey = EasyMP4Tags.RegisterKey
    RegisterTextKey = EasyMP4Tags.RegisterTextKey

    # The very same registry dictionaries EasyMP4Tags uses.
    Get = EasyMP4Tags.Get
    Set = EasyMP4Tags.Set
    Delete = EasyMP4Tags.Delete
    List = EasyMP4Tags.List
@@ -0,0 +1,839 @@
# FLAC comment support for Mutagen
# Copyright 2005 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""Read and write FLAC Vorbis comments and stream information.
Read more about FLAC at http://flac.sourceforge.net.
FLAC supports arbitrary metadata blocks. The two most interesting ones
are the FLAC stream information block, and the Vorbis comment block;
these are also the only ones Mutagen can currently read.
This module does not handle Ogg FLAC files.
Based off documentation available at
http://flac.sourceforge.net/format.html
"""
__all__ = ["FLAC", "Open", "delete"]
import struct
from ._vorbis import VCommentDict
import mutagen
from ._compat import cBytesIO, endswith, chr_
from mutagen._util import insert_bytes
from mutagen.id3 import BitPaddedInt
import sys
if sys.version_info >= (2, 6):
from functools import reduce
class error(IOError):
    """Base error type of this module; a subclass of IOError."""
class FLACNoHeaderError(error):
    """Raised when no valid FLAC header or stream info block is found."""
class FLACVorbisError(ValueError, error):
    """Raised for problems with a file's Vorbis comment block."""
def to_int_be(string):
    """Convert an arbitrarily-long byte string to an integer using
    big-endian byte order. An empty string yields 0."""
    value = 0
    for octet in bytearray(string):
        # Make room for the next byte, then append it.
        value = (value << 8) + octet
    return value
class StrictFileObject(object):
    """File-like wrapper whose read() raises an error when fewer (or
    more) bytes than requested come back."""

    def __init__(self, fileobj):
        self._fileobj = fileobj
        # Expose whichever of these members the wrapped object provides.
        for member in ("close", "tell", "seek", "write", "name"):
            if hasattr(fileobj, member):
                setattr(self, member, getattr(fileobj, member))

    def read(self, size=-1):
        """Read size bytes and return them.

        A negative size reads everything that is left and is never
        checked. Otherwise `error` is raised when the amount of data
        returned differs from size.
        """
        data = self._fileobj.read(size)
        got = len(data)
        if size >= 0 and got != size:
            raise error("file said %d bytes, read %d bytes" % (
                size, got))
        return data

    def tryread(self, *args):
        """Read from the wrapped object without any length check."""
        return self._fileobj.read(*args)
class MetadataBlock(object):
    """A generic block of FLAC metadata.

    This class is used as an ancestor for more specific blocks, and
    also as a container for data blobs of unknown blocks.

    Attributes:

    * data -- raw binary data for this block
    """

    # Subclasses set this to True when the size given in the block
    # header cannot be relied upon while parsing.
    _distrust_size = False

    def __init__(self, data):
        """Parse the given data string or file-like as a metadata block.
        The metadata header should not be included.

        Passing None skips parsing altogether. Raises TypeError when
        data is neither bytes nor an object with a read attribute.
        """
        if data is not None:
            if not isinstance(data, StrictFileObject):
                if isinstance(data, bytes):
                    data = cBytesIO(data)
                elif not hasattr(data, 'read'):
                    raise TypeError(
                        "StreamInfo requires string data or a file-like")
                data = StrictFileObject(data)
            self.load(data)

    def load(self, data):
        """Store everything that can be read from data as the raw
        block contents."""
        self.data = data.read()

    def write(self):
        """Return the raw block contents."""
        return self.data

    @staticmethod
    def writeblocks(blocks):
        """Render metadata blocks as a byte string.

        Each block is written as a one-byte code, a three-byte
        big-endian length and the block data; the last block has the
        high bit of its code set.

        Raises `error` when a block's data does not fit into the
        three-byte length field.
        """
        data = []
        codes = [[block.code, block.write()] for block in blocks]
        # Flag the final block.
        codes[-1][0] |= 128
        for code, datum in codes:
            byte = chr_(code)
            # Only three bytes of the packed length are kept below, so
            # the largest representable size is 2**24 - 1. A block of
            # exactly 2**24 bytes would otherwise be written with a
            # length field of zero.
            if len(datum) >= 2**24:
                raise error("block is too long to write")
            length = struct.pack(">I", len(datum))[-3:]
            data.append(byte + length + datum)
        return b"".join(data)

    @staticmethod
    def group_padding(blocks):
        """Consolidate FLAC padding metadata blocks.

        The overall size of the rendered blocks does not change, so
        this adds several bytes of padding for each merged block.

        The list is modified in place: all Padding blocks are removed
        and a single merged one is appended at the end.
        """
        paddings = [b for b in blocks if isinstance(b, Padding)]
        for p in paddings:
            blocks.remove(p)
        # total padding size is the sum of padding sizes plus 4 bytes
        # per removed header.
        size = sum(padding.length for padding in paddings)
        padding = Padding()
        padding.length = size + 4 * (len(paddings) - 1)
        blocks.append(padding)
class StreamInfo(MetadataBlock, mutagen.StreamInfo):
    """FLAC stream information.

    This contains information about the audio data in the FLAC file.
    Unlike most stream information objects in Mutagen, changes to this
    one will be rewritten to the file when it is saved. Unless you are
    actually changing the audio stream itself, don't change any
    attributes of this block.

    Attributes:

    * min_blocksize -- minimum audio block size
    * max_blocksize -- maximum audio block size
    * sample_rate -- audio sample rate in Hz
    * channels -- audio channels (1 for mono, 2 for stereo)
    * bits_per_sample -- bits per sample
    * total_samples -- total samples in file
    * length -- audio length in seconds
    """

    code = 0

    def __eq__(self, other):
        """Compare the block size, sample rate, channel, bit depth and
        sample count fields; objects lacking them compare unequal."""
        try:
            return (self.min_blocksize == other.min_blocksize and
                    self.max_blocksize == other.max_blocksize and
                    self.sample_rate == other.sample_rate and
                    self.channels == other.channels and
                    self.bits_per_sample == other.bits_per_sample and
                    self.total_samples == other.total_samples)
        except (AttributeError, TypeError):
            # Same handling as the other block types in this module; a
            # bare except would also swallow e.g. KeyboardInterrupt.
            return False

    __hash__ = MetadataBlock.__hash__

    def load(self, data):
        """Parse the stream info fields from data.

        Raises `error` when the sample rate is 0.
        """
        self.min_blocksize = int(to_int_be(data.read(2)))
        self.max_blocksize = int(to_int_be(data.read(2)))
        self.min_framesize = int(to_int_be(data.read(3)))
        self.max_framesize = int(to_int_be(data.read(3)))
        # first 16 bits of sample rate
        sample_first = to_int_be(data.read(2))
        # last 4 bits of sample rate, 3 of channels, first 1 of bits/sample
        sample_channels_bps = to_int_be(data.read(1))
        # last 4 of bits/sample, 36 of total samples
        bps_total = to_int_be(data.read(5))

        sample_tail = sample_channels_bps >> 4
        self.sample_rate = int((sample_first << 4) + sample_tail)
        if not self.sample_rate:
            raise error("A sample rate value of 0 is invalid")
        # Channels and bits per sample are stored minus one.
        self.channels = int(((sample_channels_bps >> 1) & 7) + 1)
        bps_tail = bps_total >> 36
        bps_head = (sample_channels_bps & 1) << 4
        self.bits_per_sample = int(bps_head + bps_tail + 1)
        self.total_samples = bps_total & 0xFFFFFFFFF
        self.length = self.total_samples / float(self.sample_rate)

        self.md5_signature = to_int_be(data.read(16))

    def write(self):
        """Render the stream info fields back into a byte string."""
        f = cBytesIO()
        f.write(struct.pack(">I", self.min_blocksize)[-2:])
        f.write(struct.pack(">I", self.max_blocksize)[-2:])
        f.write(struct.pack(">I", self.min_framesize)[-3:])
        f.write(struct.pack(">I", self.max_framesize)[-3:])

        # first 16 bits of sample rate
        f.write(struct.pack(">I", self.sample_rate >> 4)[-2:])

        # 4 bits sample, 3 channel, 1 bps
        byte = (self.sample_rate & 0xF) << 4
        byte += ((self.channels - 1) & 7) << 1
        byte += ((self.bits_per_sample - 1) >> 4) & 1
        f.write(chr_(byte))
        # 4 bits of bps, 4 of sample count
        byte = ((self.bits_per_sample - 1) & 0xF) << 4
        byte += (self.total_samples >> 32) & 0xF
        f.write(chr_(byte))
        # last 32 of sample count
        f.write(struct.pack(">I", self.total_samples & 0xFFFFFFFF))
        # MD5 signature, written as four 32-bit words
        sig = self.md5_signature
        f.write(struct.pack(
            ">4I", (sig >> 96) & 0xFFFFFFFF, (sig >> 64) & 0xFFFFFFFF,
            (sig >> 32) & 0xFFFFFFFF, sig & 0xFFFFFFFF))
        return f.getvalue()

    def pprint(self):
        """Return a one-line summary of length and sample rate."""
        return "FLAC, %.2f seconds, %d Hz" % (self.length, self.sample_rate)
class SeekPoint(tuple):
    """A single seek point in a FLAC file.

    Placeholder seek points have first_sample of 0xFFFFFFFFFFFFFFFF,
    and byte_offset and num_samples undefined. Seek points must be
    sorted in ascending order by first_sample number. Seek points must
    be unique by first_sample number, except for placeholder
    points. Placeholder points must occur last in the table and there
    may be any number of them.

    Attributes:

    * first_sample -- sample number of first sample in the target frame
    * byte_offset -- offset from first frame to target frame
    * num_samples -- number of samples in target frame
    """

    def __new__(cls, first_sample, byte_offset, num_samples):
        # super() takes the class whose parent is wanted first and the
        # class being instantiated second. With the arguments swapped,
        # creating an instance of any subclass raised TypeError.
        return super(SeekPoint, cls).__new__(
            cls, (first_sample, byte_offset, num_samples))

    first_sample = property(lambda self: self[0])
    byte_offset = property(lambda self: self[1])
    num_samples = property(lambda self: self[2])
class SeekTable(MetadataBlock):
    """Read and write FLAC seek tables.

    Attributes:

    * seekpoints -- list of SeekPoint objects
    """

    # struct layout of one seek point and its packed size in bytes.
    __SEEKPOINT_FORMAT = '>QQH'
    __SEEKPOINT_SIZE = struct.calcsize(__SEEKPOINT_FORMAT)

    code = 3

    def __init__(self, data):
        self.seekpoints = []
        super(SeekTable, self).__init__(data)

    def __eq__(self, other):
        """Compare seek point lists; objects without one are unequal."""
        try:
            equal = self.seekpoints == other.seekpoints
        except (AttributeError, TypeError):
            return False
        return equal

    __hash__ = MetadataBlock.__hash__

    def load(self, data):
        """Read seek points until a read comes back shorter than one
        full seek point."""
        self.seekpoints = []
        while True:
            chunk = data.tryread(self.__SEEKPOINT_SIZE)
            if len(chunk) != self.__SEEKPOINT_SIZE:
                break
            fields = struct.unpack(self.__SEEKPOINT_FORMAT, chunk)
            self.seekpoints.append(SeekPoint(*fields))

    def write(self):
        """Return all seek points packed one after another."""
        return b"".join(
            struct.pack(
                self.__SEEKPOINT_FORMAT,
                point.first_sample, point.byte_offset, point.num_samples)
            for point in self.seekpoints)

    def __repr__(self):
        return "<%s seekpoints=%r>" % (type(self).__name__, self.seekpoints)
class VCFLACDict(VCommentDict):
    """Read and write FLAC Vorbis comments.

    FLACs don't use the framing bit at the end of the comment block.
    So this extends VCommentDict to not use the framing bit.
    """

    code = 4
    # The size given in the block header is not relied upon for this
    # block type.
    _distrust_size = True

    def load(self, data, errors='replace', framing=False):
        """Same as the parent's load, with framing off by default."""
        super(VCFLACDict, self).load(data, errors=errors, framing=framing)

    def write(self, framing=False):
        """Same as the parent's write, with framing off by default."""
        rendered = super(VCFLACDict, self).write(framing=framing)
        return rendered
class CueSheetTrackIndex(tuple):
    """Index for a track in a cuesheet.

    For CD-DA, an index_number of 0 corresponds to the track
    pre-gap. The first index in a track must have a number of 0 or 1,
    and subsequently, index_numbers must increase by 1. Index_numbers
    must be unique within a track. And index_offset must be evenly
    divisible by 588 samples.

    Attributes:

    * index_number -- index point number
    * index_offset -- offset in samples from track start
    """

    def __new__(cls, index_number, index_offset):
        # super() takes the class whose parent is wanted first and the
        # class being instantiated second. With the arguments swapped,
        # creating an instance of any subclass raised TypeError.
        return super(CueSheetTrackIndex, cls).__new__(
            cls, (index_number, index_offset))

    index_number = property(lambda self: self[0])
    index_offset = property(lambda self: self[1])
class CueSheetTrack(object):
    """A track in a cuesheet.

    For CD-DA, track_numbers must be 1-99, or 170 for the
    lead-out. Track_numbers must be unique within a cue sheet. There
    must be at least one index in every track except the lead-out track
    which must have none.

    Attributes:

    * track_number -- track number
    * start_offset -- track offset in samples from start of FLAC stream
    * isrc -- ISRC code
    * type -- 0 for audio, 1 for digital data
    * pre_emphasis -- true if the track is recorded with pre-emphasis
    * indexes -- list of CueSheetTrackIndex objects
    """

    def __init__(self, track_number, start_offset, isrc='', type_=0,
                 pre_emphasis=False):
        self.track_number = track_number
        self.start_offset = start_offset
        self.isrc = isrc
        self.type = type_
        self.pre_emphasis = pre_emphasis
        # Always starts out empty; callers append to it.
        self.indexes = []

    def __eq__(self, other):
        """Field-by-field comparison; objects lacking the fields
        compare unequal."""
        try:
            if self.track_number != other.track_number:
                return False
            if self.start_offset != other.start_offset:
                return False
            if self.isrc != other.isrc:
                return False
            if self.type != other.type:
                return False
            if self.pre_emphasis != other.pre_emphasis:
                return False
            return self.indexes == other.indexes
        except (AttributeError, TypeError):
            return False

    __hash__ = object.__hash__

    def __repr__(self):
        template = ("<%s number=%r, offset=%d, isrc=%r, type=%r, "
                    "pre_emphasis=%r, indexes=%r)>")
        fields = (type(self).__name__, self.track_number,
                  self.start_offset, self.isrc, self.type,
                  self.pre_emphasis, self.indexes)
        return template % fields
class CueSheet(MetadataBlock):
    """Read and write FLAC embedded cue sheets.

    Number of tracks should be from 1 to 100. There should always be
    exactly one lead-out track and that track must be the last track
    in the cue sheet.

    Attributes:

    * media_catalog_number -- media catalog number in ASCII
    * lead_in_samples -- number of lead-in samples
    * compact_disc -- true if the cuesheet corresponds to a compact disc
    * tracks -- list of CueSheetTrack objects
    * lead_out -- lead-out as CueSheetTrack or None if lead-out was not found
    """

    # struct layouts (and packed sizes) of the cue sheet header, of a
    # track entry and of a track index entry.
    __CUESHEET_FORMAT = '>128sQB258xB'
    __CUESHEET_SIZE = struct.calcsize(__CUESHEET_FORMAT)
    __CUESHEET_TRACK_FORMAT = '>QB12sB13xB'
    __CUESHEET_TRACK_SIZE = struct.calcsize(__CUESHEET_TRACK_FORMAT)
    __CUESHEET_TRACKINDEX_FORMAT = '>QB3x'
    __CUESHEET_TRACKINDEX_SIZE = struct.calcsize(__CUESHEET_TRACKINDEX_FORMAT)

    code = 5

    # Defaults used until a cue sheet is loaded.
    media_catalog_number = b''
    lead_in_samples = 88200
    compact_disc = True

    def __init__(self, data):
        self.tracks = []
        super(CueSheet, self).__init__(data)

    def __eq__(self, other):
        """Compare header fields and track lists; objects lacking them
        compare unequal."""
        try:
            if self.media_catalog_number != other.media_catalog_number:
                return False
            if self.lead_in_samples != other.lead_in_samples:
                return False
            if self.compact_disc != other.compact_disc:
                return False
            return self.tracks == other.tracks
        except (AttributeError, TypeError):
            return False

    __hash__ = MetadataBlock.__hash__

    def load(self, data):
        """Parse the cue sheet header, then each track and its indexes."""
        raw_header = data.read(self.__CUESHEET_SIZE)
        catalog, lead_in, sheet_flags, track_count = struct.unpack(
            self.__CUESHEET_FORMAT, raw_header)
        # Fixed-width fields are NUL padded on disk.
        self.media_catalog_number = catalog.rstrip(b'\0')
        self.lead_in_samples = lead_in
        self.compact_disc = bool(sheet_flags & 0x80)
        self.tracks = []
        for _ in range(track_count):
            raw_track = data.read(self.__CUESHEET_TRACK_SIZE)
            (start_offset, track_number, isrc_padded, track_flags,
             index_count) = struct.unpack(
                self.__CUESHEET_TRACK_FORMAT, raw_track)
            track = CueSheetTrack(
                track_number, start_offset, isrc_padded.rstrip(b'\0'),
                (track_flags & 0x80) >> 7, bool(track_flags & 0x40))
            for _ in range(index_count):
                raw_index = data.read(self.__CUESHEET_TRACKINDEX_SIZE)
                index_offset, index_number = struct.unpack(
                    self.__CUESHEET_TRACKINDEX_FORMAT, raw_index)
                track.indexes.append(
                    CueSheetTrackIndex(index_number, index_offset))
            self.tracks.append(track)

    def write(self):
        """Render the header, tracks and indexes into a byte string."""
        sheet_flags = 0x80 if self.compact_disc else 0
        parts = [struct.pack(
            self.__CUESHEET_FORMAT, self.media_catalog_number,
            self.lead_in_samples, sheet_flags, len(self.tracks))]
        for track in self.tracks:
            # Bit 7 carries the track type, bit 6 the pre-emphasis flag.
            track_flags = (track.type & 1) << 7
            if track.pre_emphasis:
                track_flags |= 0x40
            parts.append(struct.pack(
                self.__CUESHEET_TRACK_FORMAT, track.start_offset,
                track.track_number, track.isrc, track_flags,
                len(track.indexes)))
            for index in track.indexes:
                parts.append(struct.pack(
                    self.__CUESHEET_TRACKINDEX_FORMAT,
                    index.index_offset, index.index_number))
        return b"".join(parts)

    def __repr__(self):
        template = ("<%s media_catalog_number=%r, lead_in=%r, "
                    "compact_disc=%r, tracks=%r>")
        return template % (
            type(self).__name__, self.media_catalog_number,
            self.lead_in_samples, self.compact_disc, self.tracks)
class Picture(MetadataBlock):
    """Read and write FLAC embedded pictures.

    Attributes:

    * type -- picture type (same as types for ID3 APIC frames)
    * mime -- MIME type of the picture
    * desc -- picture's description
    * width -- width in pixels
    * height -- height in pixels
    * depth -- color depth in bits-per-pixel
    * colors -- number of colors for indexed palettes (like GIF),
      0 for non-indexed
    * data -- picture data
    """

    code = 6
    # The size given in the block header is not relied upon for this
    # block type.
    _distrust_size = True

    def __init__(self, data=None):
        # Defaults first; the parent constructor overwrites them when
        # there is data to load.
        self.type = 0
        self.mime = u''
        self.desc = u''
        self.width = 0
        self.height = 0
        self.depth = 0
        self.colors = 0
        self.data = b''
        super(Picture, self).__init__(data)

    def __eq__(self, other):
        """Field-by-field comparison; objects lacking the fields
        compare unequal."""
        try:
            mine = (self.type, self.mime, self.desc, self.width)
            if mine != (other.type, other.mime, other.desc, other.width):
                return False
            mine = (self.height, self.depth, self.colors)
            if mine != (other.height, other.depth, other.colors):
                return False
            return self.data == other.data
        except (AttributeError, TypeError):
            return False

    __hash__ = MetadataBlock.__hash__

    def load(self, data):
        """Parse the length-prefixed picture fields from data."""
        self.type, mime_length = struct.unpack('>2I', data.read(8))
        self.mime = data.read(mime_length).decode('UTF-8', 'replace')
        desc_length, = struct.unpack('>I', data.read(4))
        self.desc = data.read(desc_length).decode('UTF-8', 'replace')
        (self.width, self.height, self.depth,
         self.colors, data_length) = struct.unpack('>5I', data.read(20))
        self.data = data.read(data_length)

    def write(self):
        """Render the picture fields into a byte string."""
        mime = self.mime.encode('UTF-8')
        desc = self.desc.encode('UTF-8')
        parts = [
            struct.pack('>2I', self.type, len(mime)),
            mime,
            struct.pack('>I', len(desc)),
            desc,
            struct.pack('>5I', self.width, self.height, self.depth,
                        self.colors, len(self.data)),
            self.data,
        ]
        return b"".join(parts)

    def __repr__(self):
        return "<%s '%s' (%d bytes)>" % (type(self).__name__, self.mime,
                                         len(self.data))
class Padding(MetadataBlock):
    """Empty padding space for metadata blocks.

    To avoid rewriting the entire FLAC file when editing comments,
    metadata is often padded. Padding should occur at the end, and no
    more than one padding block should be in any FLAC file. Mutagen
    handles this with MetadataBlock.group_padding.
    """

    code = 1

    def __init__(self, data=b""):
        super(Padding, self).__init__(data)

    def load(self, data):
        """Only the amount of data is kept, not the data itself."""
        self.length = len(data.read())

    def write(self):
        """Return `length` NUL bytes; raises `error` when that many
        bytes cannot be produced."""
        try:
            filler = b"\x00" * self.length
        # On some 64 bit platforms this won't generate a MemoryError
        # or OverflowError since you might have enough RAM, but it
        # still generates a ValueError. On other 64 bit platforms,
        # this will still succeed for extremely large values.
        # Those should never happen in the real world, and if they
        # do, writeblocks will catch it.
        except (OverflowError, ValueError, MemoryError):
            raise error("cannot write %d bytes" % self.length)
        return filler

    def __eq__(self, other):
        """Equal to another Padding of the same length only."""
        if not isinstance(other, Padding):
            return False
        return self.length == other.length

    __hash__ = MetadataBlock.__hash__

    def __repr__(self):
        return "<%s (%d bytes)>" % (type(self).__name__, self.length)
class FLAC(mutagen.FileType):
    """A FLAC audio file.

    Attributes:

    * info -- stream information (length, bitrate, sample rate)
    * tags -- metadata tags, if any
    * cuesheet -- CueSheet object, if any
    * seektable -- SeekTable object, if any
    * pictures -- list of embedded pictures
    """

    _mimes = ["audio/x-flac", "application/x-flac"]

    METADATA_BLOCKS = [StreamInfo, Padding, None, SeekTable, VCFLACDict,
                       CueSheet, Picture]
    """Known metadata block types, indexed by ID."""

    @staticmethod
    def score(filename, fileobj, header):
        """Score 1 for a "fLaC" header plus 3 for a ".flac" filename."""
        return (header.startswith(b"fLaC") +
                endswith(filename.lower(), ".flac") * 3)

    def __read_metadata_block(self, fileobj):
        """Read one metadata block and record it on this object.

        Returns True while more blocks follow, False after the block
        flagged as last. Raises an error when a second Vorbis comment,
        cue sheet or seek table block is encountered.
        """
        byte = ord(fileobj.read(1))
        size = to_int_be(fileobj.read(3))
        # Low seven bits: block type; high bit: last-block flag.
        code = byte & 0x7F
        last_block = bool(byte & 0x80)

        try:
            block_type = self.METADATA_BLOCKS[code] or MetadataBlock
        except IndexError:
            block_type = MetadataBlock

        if block_type._distrust_size:
            # Some software writes a broken metadata block length
            # for Vorbis comment blocks, and the FLAC reference
            # implementation can parse them (mostly by accident),
            # so we have to too.  Instead of parsing the size
            # given, parse an actual Vorbis comment, leaving
            # fileobj in the right position.
            # http://code.google.com/p/mutagen/issues/detail?id=52
            # ..same for the Picture block:
            # http://code.google.com/p/mutagen/issues/detail?id=106
            block = block_type(fileobj)
        else:
            data = fileobj.read(size)
            block = block_type(data)
        block.code = code

        if block.code == VCFLACDict.code:
            if self.tags is None:
                self.tags = block
            else:
                raise FLACVorbisError("> 1 Vorbis comment block found")
        elif block.code == CueSheet.code:
            if self.cuesheet is None:
                self.cuesheet = block
            else:
                raise error("> 1 CueSheet block found")
        elif block.code == SeekTable.code:
            if self.seektable is None:
                self.seektable = block
            else:
                raise error("> 1 SeekTable block found")
        self.metadata_blocks.append(block)
        return not last_block

    def add_tags(self):
        """Add a Vorbis comment block to the file.

        Raises FLACVorbisError when tags already exist.
        """
        if self.tags is None:
            self.tags = VCFLACDict()
            self.metadata_blocks.append(self.tags)
        else:
            raise FLACVorbisError("a Vorbis comment already exists")

    add_vorbiscomment = add_tags

    def delete(self, filename=None):
        """Remove Vorbis comments from a file.

        If no filename is given, the one most recently loaded is used.
        """
        if filename is None:
            filename = self.filename
        for s in list(self.metadata_blocks):
            if isinstance(s, VCFLACDict):
                self.metadata_blocks.remove(s)
                self.tags = None
                # Pass the resolved filename on; calling save() without
                # it silently ignored an explicitly given filename.
                self.save(filename)
                break

    vc = property(lambda s: s.tags, doc="Alias for tags; don't use this.")

    def load(self, filename):
        """Load file information from a filename.

        Raises FLACNoHeaderError when the file has no FLAC header or
        its first metadata block has no `length` attribute.
        """
        self.metadata_blocks = []
        self.tags = None
        self.cuesheet = None
        self.seektable = None
        self.filename = filename
        fileobj = StrictFileObject(open(filename, "rb"))
        try:
            self.__check_header(fileobj)
            while self.__read_metadata_block(fileobj):
                pass
        finally:
            fileobj.close()

        try:
            # Probe the first block for the stream info's length.
            self.metadata_blocks[0].length
        except (AttributeError, IndexError):
            raise FLACNoHeaderError("Stream info block not found")

    @property
    def info(self):
        """The first metadata block."""
        return self.metadata_blocks[0]

    def add_picture(self, picture):
        """Add a new picture to the file."""
        self.metadata_blocks.append(picture)

    def clear_pictures(self):
        """Delete all pictures from the file."""
        blocks = [b for b in self.metadata_blocks if b.code != Picture.code]
        self.metadata_blocks = blocks

    @property
    def pictures(self):
        """List of embedded pictures"""
        return [b for b in self.metadata_blocks if b.code == Picture.code]

    def save(self, filename=None, deleteid3=False):
        """Save metadata blocks to a file.

        If no filename is given, the one most recently loaded is used.

        With deleteid3 set, the space of a leading ID3v2 tag is reused
        and a trailing 128-byte "TAG" block is truncated away.
        """
        if filename is None:
            filename = self.filename
        f = open(filename, 'rb+')

        try:
            # Ensure we've got padding at the end, and only at the end.
            # If adding makes it too large, we'll scale it down later.
            self.metadata_blocks.append(Padding(b'\x00' * 1020))
            MetadataBlock.group_padding(self.metadata_blocks)

            header = self.__check_header(f)
            # "fLaC" and maybe ID3
            available = self.__find_audio_offset(f) - header
            data = MetadataBlock.writeblocks(self.metadata_blocks)

            # Delete ID3v2
            if deleteid3 and header > 4:
                available += header - 4
                header = 4

            if len(data) > available:
                # If we have too much data, see if we can reduce padding.
                padding = self.metadata_blocks[-1]
                newlength = padding.length - (len(data) - available)
                if newlength > 0:
                    padding.length = newlength
                    data = MetadataBlock.writeblocks(self.metadata_blocks)
                    assert len(data) == available

            elif len(data) < available:
                # If we have too little data, increase padding.
                self.metadata_blocks[-1].length += (available - len(data))
                data = MetadataBlock.writeblocks(self.metadata_blocks)
                assert len(data) == available

            if len(data) != available:
                # We couldn't reduce the padding enough.
                diff = (len(data) - available)
                insert_bytes(f, diff, header)

            f.seek(header - 4)
            f.write(b"fLaC" + data)

            # Delete ID3v1
            if deleteid3:
                try:
                    f.seek(-128, 2)
                except IOError:
                    # Could not seek 128 bytes back from the end;
                    # nothing to strip.
                    pass
                else:
                    if f.read(3) == b"TAG":
                        f.seek(-128, 2)
                        f.truncate()
        finally:
            f.close()

    def __find_audio_offset(self, fileobj):
        """Skip over all metadata blocks and return the file position
        right after the one flagged as last."""
        byte = 0x00
        while not (byte & 0x80):
            byte = ord(fileobj.read(1))
            size = to_int_be(fileobj.read(3))
            try:
                block_type = self.METADATA_BLOCKS[byte & 0x7F]
            except IndexError:
                block_type = None

            if block_type and block_type._distrust_size:
                # See comments in read_metadata_block; the size can't
                # be trusted for Vorbis comment blocks and Picture block
                block_type(fileobj)
            else:
                fileobj.read(size)
        return fileobj.tell()

    def __check_header(self, fileobj):
        """Locate the "fLaC" marker, possibly after a leading ID3 tag.

        Returns the offset just past the marker and leaves fileobj
        positioned there. Raises FLACNoHeaderError when no marker is
        found.
        """
        size = 4
        header = fileobj.read(4)
        if header != b"fLaC":
            size = None
            if header[:3] == b"ID3":
                # Jump over the ID3 tag and look for the marker again.
                size = 14 + BitPaddedInt(fileobj.read(6)[2:])
                fileobj.seek(size - 4)
                if fileobj.read(4) != b"fLaC":
                    size = None
        if size is None:
            raise FLACNoHeaderError(
                "%r is not a valid FLAC file" % fileobj.name)
        return size
# Module-level alias for the FLAC class.
Open = FLAC


def delete(filename):
    """Remove tags from a file."""
    flac_file = FLAC(filename)
    flac_file.delete()
@@ -0,0 +1,937 @@
# id3 support for mutagen
# Copyright (C) 2005 Michael Urman
# 2006 Lukas Lalinsky
# 2013 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""ID3v2 reading and writing.
This is based off of the following references:
* http://id3.org/id3v2.4.0-structure
* http://id3.org/id3v2.4.0-frames
* http://id3.org/id3v2.3.0
* http://id3.org/id3v2-00
* http://id3.org/ID3v1
Its largest deviation from the above (versions 2.3 and 2.2) is that it
will not interpret the / characters as a separator, and will almost
always accept null separators to generate multi-valued text frames.
Because ID3 frame structure differs between frame types, each frame is
implemented as a different class (e.g. TIT2 as mutagen.id3.TIT2). Each
frame's documentation contains a list of its attributes.
Since this file's documentation is a little unwieldy, you are probably
interested in the :class:`ID3` class to start with.
"""
__all__ = ['ID3', 'ID3FileType', 'Frames', 'Open', 'delete']
import struct
from struct import unpack, pack, error as StructError
import mutagen
from mutagen._util import insert_bytes, delete_bytes, DictProxy
from ._compat import reraise, chr_
from mutagen._id3util import *
from mutagen._id3frames import *
from mutagen._id3specs import *
class ID3(DictProxy, mutagen.Metadata):
"""A file with an ID3v2 tag.
Attributes:
* version -- ID3 tag version as a tuple
* unknown_frames -- raw frame data of any unknown frames found
* size -- the total size of the ID3 tag, including the header
"""
PEDANTIC = True
version = (2, 4, 0)
filename = None
size = 0
__flags = 0
__readbytes = 0
__crc = None
__unknown_version = None
_V24 = (2, 4, 0)
_V23 = (2, 3, 0)
_V22 = (2, 2, 0)
_V11 = (1, 1)
def __init__(self, *args, **kwargs):
    """Create the tag object; all arguments go to the parent class."""
    # Set before the parent constructor runs, so the list already
    # exists by the time that constructor executes.
    self.unknown_frames = []
    super(ID3, self).__init__(*args, **kwargs)
def __fullread(self, size):
    """Read exactly size bytes from the open file object.

    Raises ValueError for a negative size and EOFError when size is
    larger than the known file size or fewer bytes come back. The
    number of bytes read is added to the running read counter.
    """
    if size < 0:
        raise ValueError('Requested bytes (%s) less than zero' % size)
    try:
        filesize = self.__filesize
    except AttributeError:
        # File size not known right now; skip the size check.
        filesize = None
    if filesize is not None and size > filesize:
        raise EOFError('Requested %#x of %#x (%s)' % (
            int(size), int(filesize), self.filename))
    chunk = self._fileobj.read(size)
    if len(chunk) != size:
        raise EOFError
    self.__readbytes += size
    return chunk
def load(self, filename, known_frames=None, translate=True, v2_version=4):
    """Load tags from a filename.

    Keyword arguments:

    * filename -- filename to load tag data from
    * known_frames -- dict mapping frame IDs to Frame objects
    * translate -- Update all tags to ID3v2.3/4 internally. If you
                   intend to save, this must be true or you have to
                   call update_to_v23() / update_to_v24() manually.
    * v2_version -- if update_to_v23 or update_to_v24 get called (3 or 4)

    Raises ValueError for an unsupported ``v2_version``. When the file has
    no usable ID3v2 header, the last 128 bytes are tried as an ID3v1 tag;
    if that fails too, the original header error is re-raised.

    Example of loading a custom frame::

        my_frames = dict(mutagen.id3.Frames)
        class XMYF(Frame): ...
        my_frames["XMYF"] = XMYF
        mutagen.id3.ID3(filename, known_frames=my_frames)
    """
    if v2_version not in (3, 4):
        raise ValueError("Only 3 and 4 possible for v2_version")

    from os.path import getsize

    self.filename = filename
    self.__known_frames = known_frames
    self._fileobj = open(filename, 'rb')
    self.__filesize = getsize(filename)
    try:
        try:
            self._load_header()
        except EOFError:
            self.size = 0
            raise ID3NoHeaderError("%s: too small (%d bytes)" % (
                filename, self.__filesize))
        except (ID3NoHeaderError, ID3UnsupportedVersionError) as err:
            self.size = 0
            import sys
            stack = sys.exc_info()[2]
            try:
                self._fileobj.seek(-128, 2)
            except EnvironmentError:
                # Pass the class as the type and the instance as the
                # value so the original message and traceback survive.
                reraise(type(err), err, stack)
            else:
                frames = ParseID3v1(self._fileobj.read(128))
                if frames is not None:
                    self.version = self._V11
                    for v in frames.values():
                        self.add(v)
                else:
                    # No ID3v1 tag either: surface the header error.
                    reraise(type(err), err, stack)
        else:
            frames = self.__known_frames
            if frames is None:
                if self._V23 <= self.version:
                    frames = Frames
                elif self._V22 <= self.version:
                    frames = Frames_2_2
            # self.size includes the 10 header bytes already consumed.
            data = self.__fullread(self.size - 10)
            for frame in self.__read_frames(data, frames=frames):
                if isinstance(frame, Frame):
                    self.add(frame)
                else:
                    # Raw bytes of a frame that could not be parsed.
                    self.unknown_frames.append(frame)
            self.__unknown_version = self.version
    finally:
        self._fileobj.close()
        del self._fileobj
        del self.__filesize
        # NOTE(review): translation is kept inside the cleanup block so it
        # runs on every exit path -- confirm against the indented source.
        if translate:
            if v2_version == 3:
                self.update_to_v23()
            else:
                self.update_to_v24()
def getall(self, key):
    """Return every frame stored under ``key`` (possibly an empty list).

    An exact key match yields a one-element list. Otherwise every
    frame whose key starts with ``key + ":"`` is returned::

        id3.getall('TIT2') == [id3['TIT2']]
        id3.getall('TTTT') == []
        id3.getall('TXXX') == [TXXX(desc='woo', text='bar'),
                               TXXX(desc='baz', text='quuuux'), ...]

    Because keys are colon-separated HashKeys, partial paths such as
    ``getall('COMM:MusicMatch')`` or ``getall('TXXX:QuodLibet:')`` work.
    """
    if key in self:
        return [self[key]]
    prefix = key + ":"
    matches = []
    for name, frame in self.items():
        if name.startswith(prefix):
            matches.append(frame)
    return matches
def delall(self, key):
    """Delete all tags of a given kind; see getall.

    An exact key match removes just that entry; otherwise every entry
    whose key starts with ``key + ":"`` is removed.
    """
    if key in self:
        del self[key]
    else:
        prefix = key + ":"
        # Snapshot the matching keys first: deleting while iterating a
        # lazy filter over a live key view fails on Python 3.
        doomed = [name for name in self.keys() if name.startswith(prefix)]
        for name in doomed:
            del self[name]
def setall(self, key, values):
    """Replace every frame of kind ``key`` with the frames in ``values``.

    Existing frames are removed via delall; each new frame is stored
    under its own HashKey.
    """
    self.delall(key)
    for frame in values:
        self[frame.HashKey] = frame
def pprint(self):
    """Return the tags as sorted, newline-separated text.

    The layout loosely mirrors Vorbis/APEv2 output, e.g.
    ``TIT2=My Title``, though a single ID3 frame may carry several keys:
    ``POPM=user@example.org=3 128/255``
    """
    lines = sorted(Frame.pprint(frame) for frame in self.values())
    return "\n".join(lines)
def loaded_frame(self, tag):
    """Deprecated; use the add method."""
    frame_cls = type(tag)
    # Classes with three-letter names are the 2.2 frames; rebuild them as
    # their 2.3/2.4 base class before storing.
    if len(frame_cls.__name__) == 3:
        tag = frame_cls.__base__(tag)
    self[tag.HashKey] = tag
# Keep add and loaded_frame as two real methods with add delegating:
# aliasing them (in either direction), or having loaded_frame call add,
# breaks applications that override loaded_frame (e.g. Quod Libet).
def add(self, frame):
    """Add a frame to the tag."""
    result = self.loaded_frame(frame)
    return result
def _load_header(self):
# Read and validate the 10-byte ID3v2 header, then the optional extended
# header. Sets self.__flags, self.size (declared tag size plus the 10
# header bytes), self.version, self.__extsize and self.__extdata.
# Raises ID3NoHeaderError when the magic is not b'ID3',
# ID3UnsupportedVersionError for major versions other than 2, 3 or 4, and
# (only when PEDANTIC) ValueError for non-synchsafe sizes or invalid flag
# bits. __fullread may raise EOFError on a short file.
fn = self.filename
data = self.__fullread(10)
id3, vmaj, vrev, flags, size = unpack('>3sBBB4s', data)
self.__flags = flags
self.size = BitPaddedInt(size) + 10
self.version = (2, vmaj, vrev)
if id3 != b'ID3':
raise ID3NoHeaderError("%r doesn't start with an ID3 tag" % fn)
if vmaj not in [2, 3, 4]:
raise ID3UnsupportedVersionError("%r ID3v2.%d not supported"
% (fn, vmaj))
if self.PEDANTIC:
if not BitPaddedInt.has_valid_padding(size):
raise ValueError("Header size not synchsafe")
# The low four flag bits must be clear for 2.4+, the low five for 2.3.
if self._V24 <= self.version and (flags & 0x0f):
raise ValueError("%r has invalid flags %#02x" % (fn, flags))
elif self._V23 <= self.version < self._V24 and (flags & 0x1f):
raise ValueError("%r has invalid flags %#02x" % (fn, flags))
if self.f_extended:
extsize = self.__fullread(4)
if extsize in Frames:
# Some tagger sets the extended header flag but
# doesn't write an extended header; in this case, the
# ID3 data follows immediately. Since no extended
# header is going to be long enough to actually match
# a frame, and if it's *not* a frame we're going to be
# completely lost anyway, this seems to be the most
# correct check.
# http://code.google.com/p/quodlibet/issues/detail?id=126
# Bit 0x40 is known to be set on this path, so XOR clears it.
self.__flags ^= 0x40
self.__extsize = 0
# Un-read the four bytes: they belong to the first frame header.
self._fileobj.seek(-4, 1)
self.__readbytes -= 4
elif self.version >= self._V24:
# "Where the 'Extended header size' is the size of the whole
# extended header, stored as a 32 bit synchsafe integer."
self.__extsize = BitPaddedInt(extsize) - 4
if self.PEDANTIC:
if not BitPaddedInt.has_valid_padding(extsize):
raise ValueError("Extended header size not synchsafe")
else:
# "Where the 'Extended header size', currently 6 or 10 bytes,
# excludes itself."
self.__extsize = unpack('>L', extsize)[0]
# Consume the remainder of the extended header, if any.
if self.__extsize:
self.__extdata = self.__fullread(self.__extsize)
else:
self.__extdata = b""
def __determine_bpi(self, data, frames, EMPTY=b"\x00" * 10):
# Decide how frame sizes in `data` should be decoded: returns the plain
# `int` type or `BitPaddedInt`. `frames` is the mapping of known frame
# IDs; EMPTY is an all-zero 10-byte header used to detect padding.
# Tags older than 2.4 always use plain integers.
if self.version < self._V24:
return int
# have to special case whether to use bitpaddedints here
# spec says to use them, but iTunes has it wrong
# count number of tags found as BitPaddedInt and how far past
o = 0
asbpi = 0
while o < len(data) - 10:
part = data[o:o + 10]
if part == EMPTY:
# Padding reached: offset is minus the remaining length modulo 10.
bpioff = -((len(data) - o) % 10)
break
name, size, flags = unpack('>4sLH', part)
size = BitPaddedInt(size)
o += 10 + size
if name in frames:
asbpi += 1
else:
# Loop ended without hitting padding: record where the walk stopped
# relative to the end of the data (positive means it overshot).
bpioff = o - len(data)
# count number of tags found as int and how far past
o = 0
asint = 0
while o < len(data) - 10:
part = data[o:o + 10]
if part == EMPTY:
intoff = -((len(data) - o) % 10)
break
name, size, flags = unpack('>4sLH', part)
o += 10 + size
if name in frames:
asint += 1
else:
# Same bookkeeping as above for the plain-integer interpretation.
intoff = o - len(data)
# if more tags as int, or equal and bpi is past and int is not
if asint > asbpi or (asint == asbpi and (bpioff >= 1 and intoff <= 1)):
return int
return BitPaddedInt
def __read_frames(self, data, frames):
# Generator over the frames contained in the tag body `data`.
# `frames` maps frame IDs to frame classes. For each non-empty frame it
# yields whatever __load_framedata returns for a known ID, or the raw
# bytes (header + payload) for an unknown but valid ID and for frames
# whose loader raises NotImplementedError. Frames raising
# ID3JunkFrameError are dropped. For versions below 2.2 neither branch
# runs and nothing is yielded.
if self.version < self._V24 and self.f_unsynch:
# Pre-2.4 tags apply unsynchronisation to the whole body; if decoding
# fails the data is used as-is.
try:
data = unsynch.decode(data)
except ValueError:
pass
if self._V23 <= self.version:
# v2.3 / v2.4: 10-byte frame header (4-byte ID, 4-byte size, 2 flags).
bpi = self.__determine_bpi(data, frames)
while data:
header = data[:10]
try:
name, size, flags = unpack('>4sLH', header)
except struct.error:
return # not enough header
# An all-null frame ID marks the start of padding.
if name.strip(b'\x00') == b'':
return
size = bpi(size)
framedata = data[10:10+size]
data = data[10+size:]
if size == 0:
continue # drop empty frames
try:
tag = frames[name]
except KeyError:
if is_valid_frame_id(name):
yield header + framedata
else:
try:
yield self.__load_framedata(tag, flags, framedata)
except NotImplementedError:
yield header + framedata
except ID3JunkFrameError:
pass
elif self._V22 <= self.version:
# v2.2: 6-byte frame header (3-byte ID, 3-byte big-endian size).
while data:
header = data[0:6]
try:
name, size = unpack('>3s3s', header)
except struct.error:
return # not enough header
# Left-pad the 3-byte size to four bytes so it unpacks as '>L'.
size, = struct.unpack('>L', b'\x00'+size)
if name.strip(b'\x00') == b'':
return
framedata = data[6:6+size]
data = data[6+size:]
if size == 0:
continue # drop empty frames
try:
tag = frames[name]
except KeyError:
if is_valid_frame_id(name):
yield header + framedata
else:
try:
# v2.2 frame headers carry no flags, so 0 is passed.
yield self.__load_framedata(tag, 0, framedata)
except NotImplementedError:
yield header + framedata
except ID3JunkFrameError:
pass
def __load_framedata(self, tag, flags, framedata):
    """Build a frame by handing the raw payload to ``tag.fromData``.

    ``tag`` is the frame class, ``flags`` the frame header flags and
    ``framedata`` the undecoded payload; the tag object itself is
    passed along as the first argument.
    """
    frame = tag.fromData(self, flags, framedata)
    return frame
# Read-only views of individual bits in the tag header flag byte.
# 0x80: unsynchronisation (consulted by __read_frames for pre-2.4 tags).
f_unsynch = property(lambda s: bool(s.__flags & 0x80))
# 0x40: extended header present (consulted by _load_header).
f_extended = property(lambda s: bool(s.__flags & 0x40))
# 0x20 and 0x10: exposed here but not consulted elsewhere in this class.
f_experimental = property(lambda s: bool(s.__flags & 0x20))
f_footer = property(lambda s: bool(s.__flags & 0x10))
#f_crc = property(lambda s: bool(s.__extflags & 0x8000))
def _prepare_framedata(self, v2_version, v23_sep):
    """Serialise every frame into one byte string ready to be written.

    * v2_version -- 3 or 4; selects the ID3v2 frame layout
    * v23_sep -- separator handed to __save_frame for v2.3 text frames

    Returns the concatenated frame bytes (empty when there is nothing to
    write). Raises ValueError if ``v2_version`` is neither 3 nor 4.
    """
    if v2_version == 3:
        version = self._V23
    elif v2_version == 4:
        version = self._V24
    else:
        raise ValueError("Only 3 or 4 allowed for v2_version")

    # Sort frames by 'importance'
    order = ["TIT2", "TPE1", "TRCK", "TALB", "TPOS", "TDRC", "TCON"]
    order = dict(zip(order, range(len(order))))
    last = len(order)
    # sorted() accepts both a list (Python 2) and a dict view (Python 3),
    # whereas calling .sort() on the result of items() only works on 2.
    frames = sorted(
        self.items(),
        key=lambda item: (order.get(item[0][:4], last), item[0]))

    framedata = [self.__save_frame(frame, version=version, v23_sep=v23_sep)
                 for (key, frame) in frames]

    # only write unknown frames if they were loaded from the version
    # we are saving with or upgraded to it
    if self.__unknown_version == version:
        framedata.extend([data for data in self.unknown_frames
                          if len(data) > 10])

    return b''.join(framedata)
def _prepare_id3_header(self, original_header, framesize, v2_version):
    """Build the 10-byte tag header for a tag holding ``framesize`` bytes.

    ``original_header`` is whatever the first 10 bytes of the target file
    were. Returns ``(header, outsize, insize)``: the new header bytes, the
    tag body size that will be written, and the body size of the existing
    tag (-10 when the file does not start with an ID3 header).
    """
    try:
        magic, _vmaj, _vrev, _flags, raw_size = unpack(
            '>3sBBB4s', original_header)
    except struct.error:
        # Fewer than 10 bytes available: treat as "no existing tag".
        magic, raw_size = b'', 0
    old_size = BitPaddedInt(raw_size)
    if magic != b'ID3':
        old_size = -10

    # Reuse the existing space when it is big enough; otherwise round the
    # new size up to the next multiple of 1024.
    if old_size >= framesize:
        new_size = old_size
    else:
        new_size = (framesize + 1023) & ~0x3FF

    encoded_size = BitPaddedInt.to_str(new_size, width=4)
    new_header = pack('>3sBBB4s', b'ID3', v2_version, 0, 0, encoded_size)
    return (new_header, new_size, old_size)
def save(self, filename=None, v1=1, v2_version=4, v23_sep='/'):
    """Save changes to a file.

    If no filename is given, the one most recently loaded is used.

    Keyword arguments:
    v1 -- if 0, ID3v1 tags will be removed
          if 1, ID3v1 tags will be updated but not added
          if 2, ID3v1 tags will be created and/or updated
    v2_version -- version of ID3v2 tags (3 or 4).

    By default Mutagen saves ID3v2.4 tags. If you want to save ID3v2.3
    tags, you must call method update_to_v23 before saving the file.

    v23_sep -- the separator used to join multiple text values
               if v2_version == 3. Defaults to '/' but if it's None
               will be the ID3v2.4 null separator.

    When the tag holds no frames the tags are deleted from the file
    instead (a missing file is ignored in that case).

    The lack of a way to update only an ID3v1 tag is intentional.
    """
    framedata = self._prepare_framedata(v2_version, v23_sep)
    framesize = len(framedata)

    if not framedata:
        try:
            self.delete(filename)
        except EnvironmentError as err:
            from errno import ENOENT
            if err.errno != ENOENT:
                raise
        return

    if filename is None:
        filename = self.filename
    try:
        f = open(filename, 'rb+')
    except IOError as err:
        from errno import ENOENT
        if err.errno != ENOENT:
            raise
        # Create the file, releasing that handle straight away, then
        # reopen it for in-place update.
        open(filename, 'ab').close()
        f = open(filename, 'rb+')
    try:
        idata = f.read(10)

        header = self._prepare_id3_header(idata, framesize, v2_version)
        header, outsize, insize = header

        # Pad the frame data with nulls up to the chosen tag size.
        data = header + framedata + (b'\x00' * (outsize - framesize))

        if (insize < outsize):
            # Grow the file so the larger tag does not overwrite audio.
            insert_bytes(f, outsize-insize, insize+10)
        f.seek(0)
        f.write(data)

        try:
            f.seek(-128, 2)
        except IOError as err:
            # If the file is too small, that's OK - it just means
            # we're certain it doesn't have a v1 tag.
            from errno import EINVAL
            if err.errno != EINVAL:
                # If we failed to seek for some other reason, bail out.
                raise
            # Since we're sure this isn't a v1 tag, don't read it.
            f.seek(0, 2)

        data = f.read(128)
        try:
            idx = data.index(b"TAG")
        except ValueError:
            offset = 0
            has_v1 = False
        else:
            # Negative offset of the v1 tag relative to end of file.
            offset = idx - len(data)
            has_v1 = True

        f.seek(offset, 2)
        if v1 == 1 and has_v1 or v1 == 2:
            f.write(MakeID3v1(self))
        else:
            f.truncate()

    finally:
        f.close()
def delete(self, filename=None, delete_v1=True, delete_v2=True):
    """Strip tags from a file and empty this tag object.

    Falls back to the most recently loaded filename when none is given.

    Keyword arguments:

    * delete_v1 -- delete any ID3v1 tag
    * delete_v2 -- delete any ID3v2 tag
    """
    target = self.filename if filename is None else filename
    # Delegates to the module-level delete() function.
    delete(target, delete_v1, delete_v2)
    self.clear()
def __save_frame(self, frame, name=None, version=_V24, v23_sep=None):
    """Serialise one frame (10-byte header plus payload) to bytes.

    * frame -- the frame object to write
    * name -- frame ID to put in the header; defaults to the frame's
              class name
    * version -- _V23 or _V24; selects the size encoding
    * v23_sep -- separator passed on when converting to a v2.3 frame

    Returns b'' for an empty text frame when PEDANTIC is set. Raises
    ValueError for any other ``version``.
    """
    header_flags = 0
    if self.PEDANTIC and isinstance(frame, TextFrame):
        if not len(str(frame)):
            return b''

    if version == self._V23:
        payload = frame._get_v23_frame(sep=v23_sep)._writeData()
    else:
        payload = frame._writeData()

    # Compressing large payloads is deliberately not done: it causes
    # iTunes and other programs to fail to find these frames, which
    # usually includes e.g. APIC.
    #framedata = BitPaddedInt.to_str(usize) + framedata.encode('zlib')
    #flags |= Frame.FLAG24_COMPRESS | Frame.FLAG24_DATALEN

    # v2.4 sizes use 7 bits per byte, v2.3 sizes use all 8.
    if version == self._V24:
        size_bits = 7
    elif version == self._V23:
        size_bits = 8
    else:
        raise ValueError

    encoded_size = BitPaddedInt.to_str(len(payload), width=4, bits=size_bits)
    default_name = type(frame).__name__.encode("ascii")
    return pack('>4s4sH', name or default_name, encoded_size,
                header_flags) + payload
def __update_common(self):
    """Updates done by both v23 and v24 update"""
    if "TCON" in self:
        # Reassigning genres to itself gets rid of the "(xx)Foobr" format.
        genre_frame = self["TCON"]
        genre_frame.genres = genre_frame.genres

    if self.version >= self._V23:
        return

    # ID3v2.2 PIC frames are slightly different: rebuild each picture
    # with a full MIME type in place of the short format code.
    mime_by_code = {"PNG": "image/png", "JPG": "image/jpeg"}
    old_pictures = self.getall("APIC")
    self.delall("APIC")
    for old in old_pictures:
        self.add(APIC(
            encoding=old.encoding,
            mime=mime_by_code.get(old.mime, old.mime),
            type=old.type, desc=old.desc, data=old.data))

    # ID3v2.2 LNK frames are just way too different to upgrade.
    self.delall("LINK")
def update_to_v24(self):
"""Convert older tags into an ID3v2.4 tag.
This updates old ID3v2 frames to ID3v2.4 ones (e.g. TYER to
TDRC). If you intend to save tags, you must call this function
at some point; it is called by default when loading the tag.
"""
self.__update_common()
if self.__unknown_version == (2, 3, 0):
# convert unknown 2.3 frames (flags/size) to 2.4
converted = []
for frame in self.unknown_frames:
try:
name, size, flags = unpack('>4sLH', frame[:10])
frame = BinaryFrame.fromData(self, flags, frame[10:])
except (struct.error, error):
# Frames that cannot be re-read are dropped from the list.
continue
# __save_frame defaults to the 2.4 layout.
converted.append(self.__save_frame(frame, name=name))
# Replace the list contents in place so the same list object is kept.
self.unknown_frames[:] = converted
self.__unknown_version = (2, 4, 0)
# TDAT, TYER, and TIME have been turned into TDRC.
try:
if str(self.get("TYER", "")).strip("\x00"):
date = str(self.pop("TYER"))
if str(self.get("TDAT", "")).strip("\x00"):
dat = str(self.pop("TDAT"))
# dat[2:] and dat[:2] are swapped into the date string; the 2.3
# writer in update_to_v23 formats TDAT as day then month.
date = "%s-%s-%s" % (date, dat[2:], dat[:2])
if str(self.get("TIME", "")).strip("\x00"):
time = str(self.pop("TIME"))
date += "T%s:%s:00" % (time[:2], time[2:])
# An existing TDRC is never overwritten.
if "TDRC" not in self:
self.add(TDRC(encoding=0, text=date))
except UnicodeDecodeError:
# Old ID3 tags have *lots* of Unicode problems, so if TYER
# is bad, just chuck the frames.
pass
# TORY can be the first part of a TDOR.
if "TORY" in self:
f = self.pop("TORY")
if "TDOR" not in self:
try:
self.add(TDOR(encoding=0, text=str(f)))
except UnicodeDecodeError:
pass
# IPLS is now TIPL.
if "IPLS" in self:
f = self.pop("IPLS")
if "TIPL" not in self:
self.add(TIPL(encoding=f.encoding, people=f.people))
# These can't be trivially translated to any ID3v2.4 tags, or
# should have been removed already.
for key in ["RVAD", "EQUA", "TRDA", "TSIZ", "TDAT", "TIME", "CRM"]:
if key in self:
del(self[key])
def update_to_v23(self):
    """Convert older (and newer) tags into an ID3v2.3 tag.

    Frames that are incompatible with ID3v2.3 are converted where an
    equivalent exists and removed otherwise. Call this at some point
    before saving tags as ID3v2.3.

    To go off spec and keep some v2.4 frames in a v2.3 tag, remove
    them before calling this and add them back afterwards.
    """
    self.__update_common()

    # we could downgrade unknown v2.4 frames here, but given that
    # the main reason to save v2.3 is compatibility and this
    # might increase the chance of some parser breaking.. better not

    # TMCL, TIPL -> IPLS: merge both people lists; the encoding comes
    # from whichever of the two frames was popped last.
    credits = []
    last_source = None
    for frame_id in ("TIPL", "TMCL"):
        if frame_id in self:
            last_source = self.pop(frame_id)
            credits.extend(last_source.people)
    if last_source is not None and "IPLS" not in self:
        self.add(IPLS(encoding=last_source.encoding, people=credits))

    # TDOR -> TORY
    if "TDOR" in self:
        original = self.pop("TDOR")
        if original.text:
            stamp = original.text[0]
            if stamp.year and "TORY" not in self:
                self.add(TORY(encoding=original.encoding,
                              text="%04d" % stamp.year))

    # TDRC -> TYER, TDAT, TIME
    if "TDRC" in self:
        recorded = self.pop("TDRC")
        if recorded.text:
            stamp = recorded.text[0]
            if stamp.year and "TYER" not in self:
                self.add(TYER(encoding=recorded.encoding,
                              text="%04d" % stamp.year))
            if stamp.month and stamp.day and "TDAT" not in self:
                self.add(TDAT(encoding=recorded.encoding,
                              text="%02d%02d" % (stamp.day, stamp.month)))
            if stamp.hour and stamp.minute and "TIME" not in self:
                self.add(TIME(encoding=recorded.encoding,
                              text="%02d%02d" % (stamp.hour, stamp.minute)))

    # New frames added in v2.4
    v24_only = (
        'ASPI', 'EQU2', 'RVA2', 'SEEK', 'SIGN', 'TDEN', 'TDOR',
        'TDRC', 'TDRL', 'TDTG', 'TIPL', 'TMCL', 'TMOO', 'TPRO',
        'TSOA', 'TSOP', 'TSOT', 'TSST',
    )
    for frame_id in v24_only:
        if frame_id in self:
            del self[frame_id]
def delete(filename, delete_v1=True, delete_v2=True):
    """Remove tags from a file.

    Keyword arguments:

    * delete_v1 -- delete any ID3v1 tag
    * delete_v2 -- delete any ID3v2 tag

    The file is opened for in-place update and is always closed again,
    including when an error occurs part-way through.
    """
    f = open(filename, 'rb+')
    try:
        if delete_v1:
            try:
                f.seek(-128, 2)
            except IOError:
                # File shorter than 128 bytes: it cannot hold a v1 tag.
                pass
            else:
                if f.read(3) == b"TAG":
                    f.seek(-128, 2)
                    f.truncate()

        # technically an insize=0 tag is invalid, but we delete it anyway
        # (primarily because we used to write it)
        if delete_v2:
            f.seek(0, 0)
            idata = f.read(10)
            try:
                id3, vmaj, vrev, flags, insize = unpack('>3sBBB4s', idata)
            except struct.error:
                # Fewer than 10 bytes: there is no header to inspect, so
                # no placeholder size is pushed through BitPaddedInt.
                pass
            else:
                insize = BitPaddedInt(insize)
                if id3 == b'ID3' and insize >= 0:
                    delete_bytes(f, insize + 10, 0)
    finally:
        f.close()
# support open(filename) as interface
Open = ID3
# ID3v1.1 support.
def ParseID3v1(string):
    """Parse an ID3v1 tag, returning a dict of ID3v2.4 frames.

    ``string`` is the raw bytes expected to contain the tag (normally
    the last 128 bytes of a file). Returns a dict keyed by frame ID, or
    None when no well-formed "TAG" block of 124 to 128 bytes is found.
    """
    try:
        string = string[string.index(b"TAG"):]
    except ValueError:
        return None
    if 128 < len(string) or len(string) < 124:
        return None

    # Issue #69 - Previous versions of Mutagen, when encountering
    # out-of-spec TDRC and TYER frames of less than four characters,
    # wrote only the characters available - e.g. "1" or "" - into the
    # year field. To parse those, reduce the size of the year field.
    # Amazingly, "0s" works as a struct format string.
    unpack_fmt = "3s30s30s30s%ds29sBB" % (len(string) - 124)

    try:
        tag, title, artist, album, year, comment, track, genre = unpack(
            unpack_fmt, string)
    except StructError:
        return None

    if tag != b"TAG":
        return None

    def fix(string):
        # Keep the text before the first null, trimmed, decoded as latin-1.
        return string.split(b"\x00")[0].strip().decode('latin1')

    title, artist, album, year, comment = [
        fix(field) for field in (title, artist, album, year, comment)]

    frames = {}
    if title:
        frames["TIT2"] = TIT2(encoding=0, text=title)
    if artist:
        frames["TPE1"] = TPE1(encoding=0, text=[artist])
    if album:
        frames["TALB"] = TALB(encoding=0, text=album)
    if year:
        frames["TDRC"] = TDRC(encoding=0, text=year)
    if comment:
        frames["COMM"] = COMM(
            encoding=0, lang="eng", desc="ID3v1 Comment", text=comment)
    # Don't read a track number if it looks like the comment was
    # padded with spaces instead of nulls (thanks, WinAmp).
    # A one-byte slice is compared rather than an index, because indexing
    # bytes yields an int on Python 3 and would never equal b'\x00'.
    if track and (track != 32 or string[-3:-2] == b'\x00'):
        frames["TRCK"] = TRCK(encoding=0, text=str(track))
    if genre != 255:
        frames["TCON"] = TCON(encoding=0, text=str(genre))
    return frames
def MakeID3v1(id3):
    """Return an ID3v1.1 tag string from a dict of ID3v2.4 frames."""

    def first_text(frame_id, limit):
        # First text value of the frame as latin-1 bytes, truncated to
        # `limit`; empty when the frame is absent.
        if frame_id not in id3:
            return b""
        return id3[frame_id].text[0].encode('latin1', 'replace')[:limit]

    # Title, artist and album are 30-byte null-padded fields.
    title = first_text("TIT2", 30).ljust(30, b"\x00")
    artist = first_text("TPE1", 30).ljust(30, b"\x00")
    album = first_text("TALB", 30).ljust(30, b"\x00")

    # The comment keeps at most 28 bytes and is padded out to 29.
    comment = first_text("COMM", 28).ljust(29, b"\x00")

    track = b"\x00"
    if "TRCK" in id3:
        try:
            track = chr_(+id3["TRCK"])
        except ValueError:
            track = b"\x00"

    # 0xff marks "no genre"; it is replaced only when the first genre is
    # one of the predefined names.
    genre = b"\xff"
    if "TCON" in id3:
        try:
            genre_name = id3["TCON"].genres[0]
        except IndexError:
            pass
        else:
            if genre_name in TCON.GENRES:
                genre = chr_(TCON.GENRES.index(genre_name))

    if "TDRC" in id3:
        year = bytes(id3["TDRC"])
    elif "TYER" in id3:
        year = bytes(id3["TYER"])
    else:
        year = b""
    # Exactly four bytes, null-padded on the right.
    year = (year + b"\x00\x00\x00\x00")[:4]

    return b"".join(
        [b"TAG", title, artist, album, year, comment, track, genre])
class ID3FileType(mutagen.FileType):
    """An unknown type of file with ID3 tags."""

    # Tag class used by load() and add_tags() unless another is supplied.
    ID3 = ID3

    class _Info(mutagen.StreamInfo):
        """Placeholder stream info for files of unknown audio format."""

        length = 0

        def __init__(self, fileobj, offset):
            pass

        @staticmethod
        def pprint():
            return "Unknown format with ID3 tag"

    @staticmethod
    def score(filename, fileobj, header):
        """Return whether ``header`` starts with the ID3 magic bytes."""
        return header.startswith(b"ID3")

    def add_tags(self, ID3=None):
        """Add an empty ID3 tag to the file.

        A custom tag reader may be used instead of the default
        mutagen.id3.ID3 object, e.g. an EasyID3 reader.

        Raises ``error`` if the file already has a tag.
        """
        if ID3 is None:
            ID3 = self.ID3
        if self.tags is None:
            self.ID3 = ID3
            self.tags = ID3()
        else:
            raise error("an ID3 tag already exists")

    def load(self, filename, ID3=None, **kwargs):
        """Load stream and tag information from a file.

        A custom tag reader may be used instead of the default
        mutagen.id3.ID3 object, e.g. an EasyID3 reader. Extra keyword
        arguments are passed to the tag reader. If the reader raises
        ``error`` the file is treated as untagged (``tags`` is None).
        """
        if ID3 is None:
            ID3 = self.ID3
        else:
            # If this was initialized with EasyID3, remember that for
            # when tags are auto-instantiated in add_tags.
            self.ID3 = ID3
        self.filename = filename
        try:
            self.tags = ID3(filename, **kwargs)
        except error:
            self.tags = None
        if self.tags is not None:
            try:
                offset = self.tags.size
            except AttributeError:
                offset = None
        else:
            offset = None
        # Open outside the try block: if open() itself fails there is no
        # handle to close, and the original error must reach the caller.
        fileobj = open(filename, "rb")
        try:
            self.info = self._Info(fileobj, offset)
        finally:
            fileobj.close()
@@ -0,0 +1,544 @@
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
if sys.version_info[0] != 2:
raise ImportError("No longer available with Python 3, use mutagen.mp4")
"""Read and write MPEG-4 audio files with iTunes metadata.
This module will read MPEG-4 audio information and metadata,
as found in Apple's M4A (aka MP4, M4B, M4P) files.
There is no official specification for this format. The source code
for TagLib, FAAD, and various MPEG specifications at
http://developer.apple.com/documentation/QuickTime/QTFF/,
http://www.geocities.com/xhelmboyx/quicktime/formats/mp4-layout.txt,
and http://wiki.multimedia.cx/index.php?title=Apple_QuickTime were all
consulted.
This module does not support 64 bit atom sizes, and so will not
work on metadata over 4GB.
"""
import struct
import sys
from cStringIO import StringIO
from ._compat import reraise
from mutagen import FileType, Metadata, StreamInfo
from mutagen._constants import GENRES
from mutagen._util import cdata, insert_bytes, delete_bytes, DictProxy
class error(IOError):
    """Base class for errors raised by this module."""


class M4AMetadataError(error):
    """Raised for problems with the metadata atoms."""


class M4AStreamInfoError(error):
    """Raised for problems with the stream information."""


class M4AMetadataValueError(ValueError, M4AMetadataError):
    """Raised when a tag value cannot be rendered."""
import warnings
warnings.warn(
"mutagen.m4a is deprecated; use mutagen.mp4 instead.", DeprecationWarning)
# This is not an exhaustive list of container atoms, but just the
# ones this module needs to peek inside.
_CONTAINERS = ["moov", "udta", "trak", "mdia", "meta", "ilst",
"stbl", "minf", "stsd"]
_SKIP_SIZE = {"meta": 4}
__all__ = ['M4A', 'Open', 'delete', 'M4ACover']
class M4ACover(str):
    """A cover artwork.

    Attributes:
    imageformat -- format of the image (either FORMAT_JPEG or FORMAT_PNG)
    """

    FORMAT_JPEG = 0x0D
    FORMAT_PNG = 0x0E

    def __new__(cls, data, imageformat=None):
        cover = str.__new__(cls, data)
        # JPEG is assumed when the caller does not say otherwise.
        chosen = M4ACover.FORMAT_JPEG if imageformat is None else imageformat
        cover.imageformat = chosen
        # Only provide a ``format`` attribute when the string type does
        # not already define one.
        if not hasattr(cover, "format"):
            cover.format = chosen
        return cover
class Atom(object):
    """An individual atom.

    Attributes:
    children -- list child atoms (or None for non-container atoms)
    length -- length of this atom, including length and name
    name -- four byte name of the atom, as a str
    offset -- location in the constructor-given fileobj of this atom

    This structure should only be used internally by Mutagen.
    """

    children = None

    def __init__(self, fileobj):
        """Parse one atom (and, for containers, its children) at the
        current position of ``fileobj``."""
        self.offset = fileobj.tell()
        self.length, self.name = struct.unpack(">I4s", fileobj.read(8))
        if self.length == 1:
            raise error("64 bit atom sizes are not supported")
        if self.length < 8:
            # Too short to hold even its own header; leave the file
            # position just past the eight bytes read.
            return

        end = self.offset + self.length
        if self.name not in _CONTAINERS:
            # Leaf atom: skip straight over its payload.
            fileobj.seek(end, 0)
            return

        self.children = []
        # Some containers carry extra bytes before their first child.
        fileobj.seek(_SKIP_SIZE.get(self.name, 0), 1)
        while fileobj.tell() < end:
            self.children.append(Atom(fileobj))

    @staticmethod
    def render(name, data):
        """Render raw atom data."""
        # this raises OverflowError if Py_ssize_t can't handle the atom data
        size = len(data) + 8
        if size > 0xFFFFFFFF:
            # Size marker 1 followed by a 64-bit length.
            return struct.pack(">I4sQ", 1, name, size + 8) + data
        return struct.pack(">I4s", size, name) + data

    def __getitem__(self, remaining):
        """Look up a child atom, potentially recursively.

        e.g. atom['udta', 'meta'] => <Atom name='meta' ...>
        """
        if not remaining:
            return self
        if self.children is None:
            raise KeyError("%r is not a container" % self.name)
        wanted = remaining[0]
        for child in self.children:
            if child.name == wanted:
                return child[remaining[1:]]
        raise KeyError("%r not found" % remaining[0])

    def __repr__(self):
        klass = self.__class__.__name__
        if self.children is None:
            return "<%s name=%r length=%r offset=%r>" % (
                klass, self.name, self.length, self.offset)
        # Indent each line of every child's repr by one space.
        indented = []
        for child in self.children:
            for line in repr(child).splitlines():
                indented.append(" " + line)
        return "<%s name=%r length=%r offset=%r\n%s>" % (
            klass, self.name, self.length, self.offset, "\n".join(indented))
class Atoms(object):
    """Root atoms in a given file.

    Attributes:
    atoms -- a list of top-level atoms as Atom objects

    This structure should only be used internally by Mutagen.
    """

    def __init__(self, fileobj):
        """Parse every top-level atom from the start of ``fileobj``."""
        self.atoms = []
        # Find the end of the file, then rewind and parse up to it.
        fileobj.seek(0, 2)
        end = fileobj.tell()
        fileobj.seek(0)
        while fileobj.tell() < end:
            self.atoms.append(Atom(fileobj))

    def path(self, *names):
        """Look up and return the complete path of an atom.

        For example, atoms.path('moov', 'udta', 'meta') will return a
        list of three atoms, corresponding to the moov, udta, and meta
        atoms.
        """
        node = self
        trail = []
        for name in names:
            node = node[(name,)]
            trail.append(node)
        return trail

    def __getitem__(self, names):
        """Look up a child atom.

        'names' may be a list of atoms (['moov', 'udta']) or a string
        specifying the complete path ('moov.udta').
        """
        if isinstance(names, basestring):
            names = names.split(".")
        first = names[0]
        for child in self.atoms:
            if child.name == first:
                return child[names[1:]]
        raise KeyError("%s not found" % names[0])

    def __repr__(self):
        return "\n".join(repr(child) for child in self.atoms)
class M4ATags(DictProxy, Metadata):
"""Dictionary containing Apple iTunes metadata list key/values.
Keys are four byte identifiers, except for freeform ('----')
keys. Values are usually unicode strings, but some atoms have a
special structure:
cpil -- boolean
trkn, disk -- tuple of 16 bit ints (current, total)
tmpo -- 16 bit int
covr -- list of M4ACover objects (which are tagged strs)
gnre -- not supported. Use '\\xa9gen' instead.
The freeform '----' frames use a key in the format '----:mean:name'
where 'mean' is usually 'com.apple.iTunes' and 'name' is a unique
identifier for this frame. The value is a str, but is probably
text that can be decoded as UTF-8.
M4A tag data cannot exist outside of the structure of an M4A file,
so this class should not be manually instantiated.
Unknown non-text tags are removed.
"""
def load(self, atoms, fileobj):
try:
ilst = atoms["moov.udta.meta.ilst"]
except KeyError as key:
raise M4AMetadataError(key)
for atom in ilst.children:
fileobj.seek(atom.offset + 8)
data = fileobj.read(atom.length - 8)
parse = self.__atoms.get(atom.name, (M4ATags.__parse_text,))[0]
parse(self, atom, data)
@staticmethod
def __key_sort(item1, item2):
(key1, v1) = item1
(key2, v2) = item2
# iTunes always writes the tags in order of "relevance", try
# to copy it as closely as possible.
order = ["\xa9nam", "\xa9ART", "\xa9wrt", "\xa9alb",
"\xa9gen", "gnre", "trkn", "disk",
"\xa9day", "cpil", "tmpo", "\xa9too",
"----", "covr", "\xa9lyr"]
order = dict(zip(order, range(len(order))))
last = len(order)
# If there's no key-based way to distinguish, order by length.
# If there's still no way, go by string comparison on the
# values, so we at least have something determinstic.
return (cmp(order.get(key1[:4], last), order.get(key2[:4], last)) or
cmp(len(v1), len(v2)) or cmp(v1, v2))
def save(self, filename):
"""Save the metadata to the given filename."""
values = []
items = self.items()
items.sort(self.__key_sort)
for key, value in items:
render = self.__atoms.get(
key[:4], (None, M4ATags.__render_text))[1]
values.append(render(self, key, value))
data = Atom.render("ilst", "".join(values))
# Find the old atoms.
fileobj = open(filename, "rb+")
try:
atoms = Atoms(fileobj)
moov = atoms["moov"]
if moov != atoms.atoms[-1]:
# "Free" the old moov block. Something in the mdat
# block is not happy when its offset changes and it
# won't play back. So, rather than try to figure that
# out, just move the moov atom to the end of the file.
offset = self.__move_moov(fileobj, moov)
else:
offset = 0
try:
path = atoms.path("moov", "udta", "meta", "ilst")
except KeyError:
self.__save_new(fileobj, atoms, data, offset)
else:
self.__save_existing(fileobj, atoms, path, data, offset)
finally:
fileobj.close()
def __move_moov(self, fileobj, moov):
fileobj.seek(moov.offset)
data = fileobj.read(moov.length)
fileobj.seek(moov.offset)
free = Atom.render("free", "\x00" * (moov.length - 8))
fileobj.write(free)
fileobj.seek(0, 2)
# Figure out how far we have to shift all our successive
# seek calls, relative to what the atoms say.
old_end = fileobj.tell()
fileobj.write(data)
return old_end - moov.offset
def __save_new(self, fileobj, atoms, ilst, offset):
hdlr = Atom.render("hdlr", "\x00" * 8 + "mdirappl" + "\x00" * 9)
meta = Atom.render("meta", "\x00\x00\x00\x00" + hdlr + ilst)
moov, udta = atoms.path("moov", "udta")
insert_bytes(fileobj, len(meta), udta.offset + offset + 8)
fileobj.seek(udta.offset + offset + 8)
fileobj.write(meta)
self.__update_parents(fileobj, [moov, udta], len(meta), offset)
def __save_existing(self, fileobj, atoms, path, data, offset):
# Replace the old ilst atom.
ilst = path.pop()
delta = len(data) - ilst.length
fileobj.seek(ilst.offset + offset)
if delta > 0:
insert_bytes(fileobj, delta, ilst.offset + offset)
elif delta < 0:
delete_bytes(fileobj, -delta, ilst.offset + offset)
fileobj.seek(ilst.offset + offset)
fileobj.write(data)
self.__update_parents(fileobj, path, delta, offset)
def __update_parents(self, fileobj, path, delta, offset):
# Update all parent atoms with the new size.
for atom in path:
fileobj.seek(atom.offset + offset)
size = cdata.uint_be(fileobj.read(4)) + delta
fileobj.seek(atom.offset + offset)
fileobj.write(cdata.to_uint_be(size))
def __render_data(self, key, flags, data):
data = struct.pack(">2I", flags, 0) + data
return Atom.render(key, Atom.render("data", data))
def __parse_freeform(self, atom, data):
try:
fileobj = StringIO(data)
mean_length = cdata.uint_be(fileobj.read(4))
# skip over 8 bytes of atom name, flags
mean = fileobj.read(mean_length - 4)[8:]
name_length = cdata.uint_be(fileobj.read(4))
name = fileobj.read(name_length - 4)[8:]
value_length = cdata.uint_be(fileobj.read(4))
# Name, flags, and reserved bytes
value = fileobj.read(value_length - 4)[12:]
except struct.error:
# Some ---- atoms have no data atom, I have no clue why
# they actually end up in the file.
pass
else:
self["%s:%s:%s" % (atom.name, mean, name)] = value
def __render_freeform(self, key, value):
dummy, mean, name = key.split(":", 2)
mean = struct.pack(">I4sI", len(mean) + 12, "mean", 0) + mean
name = struct.pack(">I4sI", len(name) + 12, "name", 0) + name
value = struct.pack(">I4s2I", len(value) + 16, "data", 0x1, 0) + value
final = mean + name + value
return Atom.render("----", final)
def __parse_pair(self, atom, data):
self[atom.name] = struct.unpack(">2H", data[18:22])
def __render_pair(self, key, value):
track, total = value
if 0 <= track < 1 << 16 and 0 <= total < 1 << 16:
data = struct.pack(">4H", 0, track, total, 0)
return self.__render_data(key, 0, data)
else:
raise M4AMetadataValueError("invalid numeric pair %r" % (value,))
def __render_pair_no_trailing(self, key, value):
    """Render a (number, total) pair without the trailing zero word.

    Raises M4AMetadataValueError if either number does not fit in an
    unsigned 16-bit integer.
    """
    track, total = value
    if not (0 <= track < 1 << 16 and 0 <= total < 1 << 16):
        raise M4AMetadataValueError("invalid numeric pair %r" % (value,))
    packed = struct.pack(">3H", 0, track, total)
    return self.__render_data(key, 0, packed)
def __parse_genre(self, atom, data):
    """Translate a numeric 'gnre' atom into a freeform '\\xa9gen' value.

    The stored number is one-based.  An existing '\\xa9gen' tag is never
    overwritten, and numbers outside the GENRES table are ignored.
    """
    genre = cdata.short_be(data[16:18])
    if "\xa9gen" in self:
        return
    index = genre - 1
    # Check the range explicitly: a stored value of 0 (or a negative one)
    # would otherwise be a valid *negative* list index and silently pick
    # a genre from the end of the table.
    if 0 <= index < len(GENRES):
        self["\xa9gen"] = GENRES[index]
def __parse_tempo(self, atom, data):
    """Store the big-endian 16-bit value at bytes 16-17 under the atom name."""
    tempo = cdata.short_be(data[16:18])
    self[atom.name] = tempo
def __render_tempo(self, key, value):
    """Render ``value`` as an unsigned 16-bit integer atom.

    Raises M4AMetadataValueError if ``value`` is out of range.
    """
    if not 0 <= value < 1 << 16:
        raise M4AMetadataValueError("invalid short integer %r" % value)
    return self.__render_data(key, 0x15, cdata.to_ushort_be(value))
def __parse_compilation(self, atom, data):
    """Store byte 16 of ``data`` as a boolean; a missing byte means False."""
    try:
        flag = bool(ord(data[16:17]))
    except TypeError:
        # ord() rejects the empty string produced when byte 16 is absent.
        flag = False
    self[atom.name] = flag
def __render_compilation(self, key, value):
    """Render the truthiness of ``value`` as a single byte (0 or 1)."""
    flag_byte = chr(bool(value))
    return self.__render_data(key, 0x15, flag_byte)
def __parse_cover(self, atom, data):
    """Parse a cover atom into an M4ACover.

    Raises M4AMetadataError if the first child is not a 'data' atom.
    Image formats other than JPEG and PNG are treated as JPEG.
    """
    length, child_name, imageformat = struct.unpack(">I4sI", data[:12])
    if child_name != "data":
        raise M4AMetadataError(
            "unexpected atom %r inside 'covr'" % child_name)
    known_formats = (M4ACover.FORMAT_JPEG, M4ACover.FORMAT_PNG)
    if imageformat not in known_formats:
        imageformat = M4ACover.FORMAT_JPEG
    self[atom.name] = M4ACover(data[16:length], imageformat)
def __render_cover(self, key, value):
    """Render cover data; values without ``imageformat`` are written as JPEG."""
    imageformat = getattr(value, "imageformat", M4ACover.FORMAT_JPEG)
    payload = struct.pack(">2I", imageformat, 0) + value
    return Atom.render(key, Atom.render("data", payload))
def __parse_text(self, atom, data):
    """Store the payload as UTF-8 text when the flags word equals 1.

    Atoms carrying any other flags value are ignored.
    """
    if cdata.uint_be(data[8:12]) != 1:
        return
    self[atom.name] = data[16:].decode('utf-8', 'replace')
def __render_text(self, key, value):
    """Render ``value`` as a UTF-8 encoded text atom (flags 0x1)."""
    encoded = value.encode('utf-8')
    return self.__render_data(key, 0x1, encoded)
def delete(self, filename):
    """Clear all tags held by this object and save the result to ``filename``."""
    self.clear()
    self.save(filename)
# Table keyed by four-character atom name.  Each value is a
# (parse function, render function) pair; 'gnre' has no render function.
__atoms = {
"----": (__parse_freeform, __render_freeform),
"trkn": (__parse_pair, __render_pair),
"disk": (__parse_pair, __render_pair_no_trailing),
"gnre": (__parse_genre, None),
"tmpo": (__parse_tempo, __render_tempo),
"cpil": (__parse_compilation, __render_compilation),
"covr": (__parse_cover, __render_cover),
}
def pprint(self):
    """Return one 'key=value' line per tag, joined by newlines.

    Values that cannot be formatted as text are shown as a byte count.
    """
    lines = []
    for raw_key, value in self.iteritems():
        key = raw_key.decode('latin1')
        try:
            line = "%s=%s" % (key, value)
        except UnicodeDecodeError:
            line = "%s=[%d bytes of data]" % (key, len(value))
        lines.append(line)
    return "\n".join(lines)
class M4AInfo(StreamInfo):
    """MPEG-4 stream information.

    Attributes:
    bitrate -- bitrate in bits per second, as an int
    length -- file length in seconds, as a float
    """

    bitrate = 0

    def __init__(self, atoms, fileobj):
        """Read length and (optionally) bitrate from the first track.

        Raises M4AStreamInfoError if the track's handler atom does not
        contain 'soun'.
        """
        handler = atoms["moov.trak.mdia.hdlr"]
        fileobj.seek(handler.offset)
        if "soun" not in fileobj.read(handler.length):
            raise M4AStreamInfoError("track has no audio data")

        header = atoms["moov.trak.mdia.mdhd"]
        fileobj.seek(header.offset)
        data = fileobj.read(header.length)
        # Byte 8 selects between the two header layouts: 0 means both
        # fields are 32-bit, anything else means a 64-bit second field
        # located further into the atom.
        if ord(data[8]) == 0:
            start, fmt = 20, ">2I"
        else:
            start, fmt = 28, ">IQ"
        stop = start + struct.calcsize(fmt)
        unit, length = struct.unpack(fmt, data[start:stop])
        self.length = float(length) / unit

        try:
            stsd = atoms["moov.trak.mdia.minf.stbl.stsd"]
            fileobj.seek(stsd.offset)
            data = fileobj.read(stsd.length)
            self.bitrate = cdata.uint_be(data[-17:-13])
        except (ValueError, KeyError):
            # Bitrate values are optional.
            pass

    def pprint(self):
        """Return a one-line human readable summary of the stream."""
        return "MPEG-4 audio, %.2f seconds, %d bps" % (
            self.length, self.bitrate)
class M4A(FileType):
    """An MPEG-4 audio file, probably containing AAC.

    If more than one track is present in the file, the first is used.
    Only audio ('soun') tracks will be read.
    """

    _mimes = ["audio/mp4", "audio/x-m4a", "audio/mpeg4", "audio/aac"]

    def load(self, filename):
        """Read stream information and tags from ``filename``.

        Failures while reading stream info are re-raised as
        M4AStreamInfoError.  A M4AMetadataError while reading tags leaves
        ``tags`` set to None; other tag failures are re-raised as
        M4AMetadataError.
        """
        self.filename = filename
        with open(filename, "rb") as fileobj:
            atoms = Atoms(fileobj)

            try:
                self.info = M4AInfo(atoms, fileobj)
            except StandardError as err:
                reraise(M4AStreamInfoError, err, sys.exc_info()[2])

            try:
                self.tags = M4ATags(atoms, fileobj)
            except M4AMetadataError:
                self.tags = None
            except StandardError as err:
                reraise(M4AMetadataError, err, sys.exc_info()[2])

    def add_tags(self):
        """Attach a new, empty M4ATags instance."""
        self.tags = M4ATags()

    @staticmethod
    def score(filename, fileobj, header):
        """Return how many of the markers 'ftyp' and 'mp4' occur in ``header``."""
        return sum(marker in header for marker in ("ftyp", "mp4"))
Open = M4A
def delete(filename):
    """Remove tags from a file."""
    audio = M4A(filename)
    audio.delete()
@@ -0,0 +1,86 @@
# A Monkey's Audio (APE) reader/tagger
#
# Copyright 2006 Lukas Lalinsky <lalinsky@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Monkey's Audio streams with APEv2 tags.
Monkey's Audio is a very efficient lossless audio compressor developed
by Matt Ashland.
For more information, see http://www.monkeysaudio.com/.
"""
__all__ = ["MonkeysAudio", "Open", "delete"]
import struct
from ._compat import endswith
from mutagen import StreamInfo
from mutagen.apev2 import APEv2File, error, delete
from mutagen._util import cdata
class MonkeysAudioHeaderError(error):
    """Raised when a file does not start with a valid Monkey's Audio header."""
class MonkeysAudioInfo(StreamInfo):
    """Monkey's Audio stream information.

    Attributes:

    * channels -- number of audio channels
    * length -- file length in seconds, as a float
    * sample_rate -- audio sampling rate in Hz
    * bits_per_sample -- bits per sample (0 when the header layout used by
      streams older than 3.98 is parsed, which this reader does not
      decode it from)
    * version -- Monkey's Audio stream version, as a float (eg: 3.99)
    """

    def __init__(self, fileobj):
        """Parse the first 76 bytes of ``fileobj``.

        Raises MonkeysAudioHeaderError if fewer than 76 bytes are available
        or the data does not start with b"MAC ".
        """
        header = fileobj.read(76)
        if len(header) != 76 or not header.startswith(b"MAC "):
            raise MonkeysAudioHeaderError("not a Monkey's Audio file")
        self.version = cdata.ushort_le(header[4:6])
        if self.version >= 3980:
            (blocks_per_frame, final_frame_blocks, total_frames,
             self.bits_per_sample, self.channels,
             self.sample_rate) = struct.unpack("<IIIHHI", header[56:76])
        else:
            compression_level = cdata.ushort_le(header[6:8])
            self.channels, self.sample_rate = struct.unpack(
                "<HI", header[10:16])
            total_frames, final_frame_blocks = struct.unpack(
                "<II", header[24:32])
            if self.version >= 3950:
                blocks_per_frame = 73728 * 4
            elif self.version >= 3900 or (self.version >= 3800 and
                                          compression_level == 4):
                blocks_per_frame = 73728
            else:
                blocks_per_frame = 9216
            # The documented attribute was previously left unset on this
            # branch, so reading it raised AttributeError.  0 marks it as
            # unknown.
            self.bits_per_sample = 0
        # The header stores the version multiplied by 1000.
        self.version /= 1000.0
        self.length = 0.0
        if self.sample_rate != 0 and total_frames > 0:
            # Every frame but the last holds ``blocks_per_frame`` blocks.
            total_blocks = ((total_frames - 1) * blocks_per_frame +
                            final_frame_blocks)
            self.length = float(total_blocks) / self.sample_rate

    def pprint(self):
        """Return a one-line human readable summary of the stream."""
        return "Monkey's Audio %.2f, %.2f seconds, %d Hz" % (
            self.version, self.length, self.sample_rate)
class MonkeysAudio(APEv2File):
    """A Monkey's Audio file tagged with APEv2."""

    _Info = MonkeysAudioInfo
    _mimes = ["audio/ape", "audio/x-ape"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score one point for the b"MAC " magic and one for an .ape name."""
        has_magic = header.startswith(b"MAC ")
        has_extension = endswith(filename.lower(), ".ape")
        return has_magic + has_extension
Open = MonkeysAudio
@@ -0,0 +1,283 @@
# MP3 stream header information support for Mutagen.
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""MPEG audio stream information and tags."""
import os
import struct
from ._compat import endswith
from mutagen import StreamInfo
from mutagen.id3 import ID3FileType, BitPaddedInt, delete
# "MP3" used to be listed twice; the second entry is replaced by the other
# public class this module defines, EasyMP3.
__all__ = ["MP3", "Open", "delete", "EasyMP3"]
class error(RuntimeError):
    """Base class for the errors raised by this module."""


class HeaderNotFoundError(error, IOError):
    """Raised when no MPEG frame header could be located."""


class InvalidMPEGHeader(error, IOError):
    """Error type for an invalid MPEG header."""
# Mode values.
STEREO, JOINTSTEREO, DUALCHANNEL, MONO = range(4)
class MPEGInfo(StreamInfo):
"""MPEG audio stream information
Parse information about an MPEG audio file. This also reads the
Xing VBR header format.
This code was implemented based on the format documentation at
http://mpgedit.org/mpgedit/mpeg_format/mpeghdr.htm.
Useful attributes:
* length -- audio length, in seconds
* bitrate -- audio bitrate, in bits per second
* sketchy -- if true, the file may not be valid MPEG audio
Useless attributes:
* version -- MPEG version (1, 2, 2.5)
* layer -- 1, 2, or 3
* mode -- One of STEREO, JOINTSTEREO, DUALCHANNEL, or MONO (0-3)
* protected -- whether or not the file is "protected"
* padding -- whether or not audio frames are padded
* sample_rate -- audio sample rate, in Hz
"""
# Map (version, layer) tuples to bitrates.
__BITRATE = {
(1, 1): range(0, 480, 32),
(1, 2): [0, 32, 48, 56, 64, 80, 96, 112, 128,
160, 192, 224, 256, 320, 384],
(1, 3): [0, 32, 40, 48, 56, 64, 80, 96, 112,
128, 160, 192, 224, 256, 320],
(2, 1): [0, 32, 48, 56, 64, 80, 96, 112, 128,
144, 160, 176, 192, 224, 256],
(2, 2): [0, 8, 16, 24, 32, 40, 48, 56, 64,
80, 96, 112, 128, 144, 160],
}
__BITRATE[(2, 3)] = __BITRATE[(2, 2)]
for i in range(1, 4):
__BITRATE[(2.5, i)] = __BITRATE[(2, i)]
# Map version to sample rates.
__RATES = {
1: [44100, 48000, 32000],
2: [22050, 24000, 16000],
2.5: [11025, 12000, 8000]
}
sketchy = False
def __init__(self, fileobj, offset=None):
    """Parse MPEG stream information from a file-like object.

    If an offset argument is given, it is used to start looking
    for stream information and Xing headers; otherwise, ID3v2 tags
    will be skipped automatically. A correct offset can make
    loading files significantly faster.
    """
    try:
        size = os.path.getsize(fileobj.name)
    except (IOError, OSError, AttributeError):
        # No usable name: measure the size by seeking to the end.
        fileobj.seek(0, 2)
        size = fileobj.tell()

    # If we don't get an offset, try to skip an ID3v2 tag.
    if offset is None:
        fileobj.seek(0, 0)
        tag_header = fileobj.read(10)
        try:
            id3, insize = struct.unpack('>3sxxx4s', tag_header)
        except struct.error:
            id3, insize = '', 0
        insize = BitPaddedInt(insize)
        offset = insize + 10 if (id3 == b'ID3' and insize > 0) else 0

    real_size = size - offset

    # Try to find two valid headers (meaning, very likely MPEG data)
    # at the given offset, 30% through the file, 60% through the file,
    # and 90% through the file.
    for start in (offset, 0.3 * size, 0.6 * size, 0.9 * size):
        try:
            self.__try(fileobj, int(start), real_size)
        except error:
            continue
        break
    else:
        # If we can't find any two consecutive frames, try to find just
        # one frame back at the original offset given.
        self.__try(fileobj, offset, real_size, False)
        self.sketchy = True
def __try(self, fileobj, offset, real_size, check_second=True):
    """Try to parse MPEG stream information starting at ``offset``.

    Scans up to 32 KiB for a plausible frame header, fills in the stream
    attributes from it, optionally verifies that a second frame sync
    follows, then refines length/bitrate from a Xing or VBRI header if
    one is present.

    fileobj -- seekable binary file object
    offset -- position to start scanning from
    real_size -- number of bytes used for the initial length estimate
    check_second -- require a second frame sync after the first frame

    Raises HeaderNotFoundError if no (or no second) frame can be found.
    """
    # This is going to be one really long function; bear with it,
    # because there's not really a sane point to cut it up.
    fileobj.seek(offset, 0)

    # We "know" we have an MPEG file if we find two frames that look like
    # valid MPEG data. If we can't find them in 32k of reads, something
    # is horribly wrong (the longest frame can only be about 4k). This
    # is assuming the offset didn't lie.
    data = fileobj.read(32768)

    frame_1 = data.find(b"\xff")
    while 0 <= frame_1 <= len(data) - 4:
        frame_data = struct.unpack(">I", data[frame_1:frame_1 + 4])[0]
        if (frame_data >> 16) & 0xE0 != 0xE0:
            frame_1 = data.find(b"\xff", frame_1 + 2)
        else:
            version = (frame_data >> 19) & 0x3
            layer = (frame_data >> 17) & 0x3
            protection = (frame_data >> 16) & 0x1
            bitrate = (frame_data >> 12) & 0xF
            sample_rate = (frame_data >> 10) & 0x3
            padding = (frame_data >> 9) & 0x1
            self.mode = (frame_data >> 6) & 0x3
            # The remaining header bits (private, mode extension,
            # copyright, original, emphasis) are not used.

            # Reject reserved/invalid field values and keep scanning.
            if (version == 1 or layer == 0 or sample_rate == 0x3 or
                    bitrate == 0 or bitrate == 0xF):
                frame_1 = data.find(b"\xff", frame_1 + 2)
            else:
                break
    else:
        raise HeaderNotFoundError("can't sync to an MPEG frame")

    # There is a serious problem here, which is that many flags
    # in an MPEG header are backwards.
    self.version = [2.5, None, 2, 1][version]
    self.layer = 4 - layer
    self.protected = not protection
    self.padding = bool(padding)

    self.bitrate = self.__BITRATE[(self.version, self.layer)][bitrate]
    self.bitrate *= 1000
    self.sample_rate = self.__RATES[self.version][sample_rate]

    if self.layer == 1:
        frame_length = (12 * self.bitrate / self.sample_rate + padding) * 4
        frame_size = 384
    elif self.version >= 2 and self.layer == 3:
        frame_length = 72 * self.bitrate / self.sample_rate + padding
        frame_size = 576
    else:
        frame_length = 144 * self.bitrate / self.sample_rate + padding
        frame_size = 1152

    if check_second:
        possible = int(frame_1 + frame_length)
        if possible > len(data) + 4:
            raise HeaderNotFoundError("can't sync to second MPEG frame")
        try:
            frame_data = struct.unpack(
                ">H", data[possible:possible + 2])[0]
        except struct.error:
            raise HeaderNotFoundError("can't sync to second MPEG frame")
        if frame_data & 0xFFE0 != 0xFFE0:
            raise HeaderNotFoundError("can't sync to second MPEG frame")

    self.length = 8 * real_size / float(self.bitrate)

    # Try to find/parse the Xing header, which trumps the above length
    # and bitrate calculation.
    fileobj.seek(offset, 0)
    data = fileobj.read(32768)
    try:
        xing = data[:-4].index(b"Xing")
    except ValueError:
        # Try to find/parse the VBRI header, which trumps the above length
        # calculation.
        try:
            vbri = data[:-24].index(b"VBRI")
        except ValueError:
            pass
        else:
            # If a VBRI header was found, this is definitely MPEG audio.
            self.sketchy = False
            vbri_version = struct.unpack('>H', data[vbri + 4:vbri + 6])[0]
            if vbri_version == 1:
                frame_count = struct.unpack(
                    '>I', data[vbri + 14:vbri + 18])[0]
                samples = float(frame_size * frame_count)
                self.length = (samples / self.sample_rate) or self.length
    else:
        # If a Xing header was found, this is definitely MPEG audio.
        self.sketchy = False
        flags = struct.unpack('>I', data[xing + 4:xing + 8])[0]
        # The optional fields follow the flags word back to back, each
        # present only if its flag bit is set, so the position of the
        # byte count depends on whether the frame count is there.
        field = xing + 8
        if flags & 0x1:
            frame_count = struct.unpack('>I', data[field:field + 4])[0]
            samples = float(frame_size * frame_count)
            self.length = (samples / self.sample_rate) or self.length
            field += 4
        if flags & 0x2:
            byte_count = struct.unpack('>I', data[field:field + 4])[0]
            # A zero length would make the division below fail; keep
            # the bitrate taken from the frame header in that case.
            if self.length:
                self.bitrate = int((byte_count * 8) // self.length)
def pprint(self):
    """Return a one-line summary, marked "(sketchy)" when ``sketchy`` is set."""
    summary = "MPEG %s layer %d, %d bps, %s Hz, %.2f seconds" % (
        self.version, self.layer, self.bitrate, self.sample_rate,
        self.length)
    suffix = " (sketchy)" if self.sketchy else ""
    return summary + suffix
class MP3(ID3FileType):
    """An MPEG audio (usually MPEG-1 Layer 3) file.

    :ivar info: :class:`MPEGInfo`
    :ivar tags: :class:`ID3 <mutagen.id3.ID3>`
    """

    _Info = MPEGInfo
    _mimes = ["audio/mpeg", "audio/mpg", "audio/x-mpeg"]

    @property
    def mime(self):
        """Layer-specific mime types followed by the inherited ones."""
        layer = self.info.layer
        specific = ["audio/mp%d" % layer, "audio/x-mp%d" % layer]
        return specific + super(MP3, self).mime

    @staticmethod
    def score(filename, fileobj, header):
        """Score two points for an ID3 header plus one per matching extension."""
        lowered = filename.lower()
        extensions = (b".mp3", b".mp2", b".mpg", b".mpeg")
        return (header.startswith(b"ID3") * 2 +
                sum(endswith(lowered, ext) for ext in extensions))
Open = MP3
class EasyMP3(MP3):
"""Like MP3, but uses EasyID3 for tags.
:ivar info: :class:`MPEGInfo`
:ivar tags: :class:`EasyID3 <mutagen.easyid3.EasyID3>`
"""
# The import is written inside the class so that EasyID3 is bound to the
# name ``ID3`` here rather than at module level (presumably overriding the
# tag class used by the parent -- confirm against ID3FileType).
from mutagen.easyid3 import EasyID3 as ID3
ID3 = ID3
@@ -0,0 +1,837 @@
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write MPEG-4 audio files with iTunes metadata.
This module will read MPEG-4 audio information and metadata,
as found in Apple's MP4 (aka M4A, M4B, M4P) files.
There is no official specification for this format. The source code
for TagLib, FAAD, and various MPEG specifications at
* http://developer.apple.com/documentation/QuickTime/QTFF/
* http://www.geocities.com/xhelmboyx/quicktime/formats/mp4-layout.txt
* http://standards.iso.org/ittf/PubliclyAvailableStandards/\
c041828_ISO_IEC_14496-12_2005(E).zip
* http://wiki.multimedia.cx/index.php?title=Apple_QuickTime
were all consulted.
"""
import struct
import sys
from mutagen import FileType, Metadata, StreamInfo
from mutagen._constants import GENRES
from mutagen._util import cdata, insert_bytes, DictProxy, utf8
from mutagen._compat import reraise, PY2, string_types, text_type, chr_
class error(IOError):
    """Base class for the errors raised by this module."""


class MP4MetadataError(error):
    """Raised when MP4 atoms or tag data cannot be parsed."""


class MP4StreamInfoError(error):
    """Error type for problems with MP4 stream information."""


class MP4MetadataValueError(ValueError, MP4MetadataError):
    """Raised when a tag value cannot be rendered."""
# This is not an exhaustive list of container atoms, but just the
# ones this module needs to peek inside.
_CONTAINERS = [b"moov", b"udta", b"trak", b"mdia", b"meta", b"ilst",
b"stbl", b"minf", b"moof", b"traf"]
_SKIP_SIZE = {b"meta": 4}
__all__ = ['MP4', 'Open', 'delete', 'MP4Cover', 'MP4FreeForm']
class MP4Cover(bytes):
    """A cover artwork.

    Attributes:

    * imageformat -- format of the image (either FORMAT_JPEG or FORMAT_PNG)
    """

    FORMAT_JPEG = 0x0D
    FORMAT_PNG = 0x0E

    def __new__(cls, data, *args, **kwargs):
        # Only the raw data takes part in building the bytes object; any
        # further arguments are consumed by __init__.
        return bytes.__new__(cls, data)

    def __init__(self, data, imageformat=FORMAT_JPEG):
        self.imageformat = imageformat
        # Only set ``format`` when the base type does not already provide
        # an attribute of that name.
        if not hasattr(self, "format"):
            self.format = imageformat
class MP4FreeForm(bytes):
    """A freeform value.

    Attributes:

    * dataformat -- format of the data (either FORMAT_TEXT or FORMAT_DATA)
    """

    FORMAT_DATA = 0x0
    FORMAT_TEXT = 0x1

    def __new__(cls, data, *args, **kwargs):
        # Only the raw data takes part in building the bytes object; the
        # format argument is consumed by __init__.
        return super(MP4FreeForm, cls).__new__(cls, data)

    def __init__(self, data, dataformat=FORMAT_TEXT):
        self.dataformat = dataformat
class Atom(object):
    """An individual atom.

    Attributes:
    children -- list child atoms (or None for non-container atoms)
    length -- length of this atom, including length and name
    name -- four byte name of the atom, as a str
    offset -- location in the constructor-given fileobj of this atom

    This structure should only be used internally by Mutagen.
    """

    children = None

    def __init__(self, fileobj, level=0):
        """Read one atom (and, for containers, its children) from fileobj.

        ``level`` is the nesting depth; zero-length atoms are only allowed
        at level 0.  Raises MP4MetadataError for invalid length fields.
        """
        self.offset = fileobj.tell()
        self.length, self.name = struct.unpack(">I4s", fileobj.read(8))

        if self.length == 1:
            # A length of 1 means the real length follows as 64 bits.
            self.length, = struct.unpack(">Q", fileobj.read(8))
            if self.length < 16:
                raise MP4MetadataError(
                    "64 bit atom length can only be 16 and higher")
        elif self.length == 0:
            if level != 0:
                raise MP4MetadataError(
                    "only a top-level atom can have zero length")
            # Only the last atom is supposed to have a zero-length, meaning it
            # extends to the end of file.
            fileobj.seek(0, 2)
            self.length = fileobj.tell() - self.offset
            fileobj.seek(self.offset + 8, 0)
        elif self.length < 8:
            raise MP4MetadataError(
                "atom length can only be 0, 1 or 8 and higher")

        if self.name not in _CONTAINERS:
            # Leaf atom: skip straight over its payload.
            fileobj.seek(self.offset + self.length, 0)
            return

        self.children = []
        # Some containers carry extra bytes before their first child.
        fileobj.seek(_SKIP_SIZE.get(self.name, 0), 1)
        end = self.offset + self.length
        while fileobj.tell() < end:
            self.children.append(Atom(fileobj, level + 1))

    @staticmethod
    def render(name, data):
        """Render raw atom data."""
        # this raises OverflowError if Py_ssize_t can't handle the atom data
        size = len(data) + 8
        if size > 0xFFFFFFFF:
            # Too large for the 32-bit field: write the marker value 1 and
            # a 64-bit size that also counts the 8 extra size bytes.
            return struct.pack(">I4sQ", 1, name, size + 8) + data
        return struct.pack(">I4s", size, name) + data

    def findall(self, name, recursive=False):
        """Recursively find all child atoms by specified name."""
        if self.children is None:
            return
        for child in self.children:
            if child.name == name:
                yield child
            if recursive:
                for descendant in child.findall(name, True):
                    yield descendant

    def __getitem__(self, remaining):
        """Look up a child atom, potentially recursively.

        e.g. atom['udta', 'meta'] => <Atom name='meta' ...>
        """
        if not remaining:
            return self
        if self.children is None:
            raise KeyError("%r is not a container" % self.name)
        wanted = remaining[0]
        for child in self.children:
            if child.name == wanted:
                return child[remaining[1:]]
        raise KeyError("%r not found" % wanted)

    def __repr__(self):
        klass = self.__class__.__name__
        if self.children is None:
            return "<%s name=%r length=%r offset=%r>" % (
                klass, self.name, self.length, self.offset)
        indented = []
        for child in self.children:
            for line in repr(child).splitlines():
                indented.append(" " + line)
        return "<%s name=%r length=%r offset=%r\n%s>" % (
            klass, self.name, self.length, self.offset, "\n".join(indented))
class Atoms(object):
    """Root atoms in a given file.

    Attributes:
    atoms -- a list of top-level atoms as Atom objects

    This structure should only be used internally by Mutagen.
    """

    def __init__(self, fileobj):
        """Read every top-level atom from the start to the end of fileobj."""
        self.atoms = []
        fileobj.seek(0, 2)
        end = fileobj.tell()
        fileobj.seek(0)
        # Stop once fewer than 8 bytes (one atom header) remain.
        while fileobj.tell() + 8 <= end:
            self.atoms.append(Atom(fileobj))

    def path(self, *names):
        """Look up and return the complete path of an atom.

        For example, atoms.path('moov', 'udta', 'meta') will return a
        list of three atoms, corresponding to the moov, udta, and meta
        atoms.
        """
        found = []
        node = self
        for name in names:
            node = node[name, ]
            found.append(node)
        return found

    def __contains__(self, names):
        try:
            self[names]
        except KeyError:
            return False
        return True

    def __getitem__(self, names):
        """Look up a child atom.

        'names' may be a list of atoms (['moov', 'udta']) or a string
        specifying the complete path ('moov.udta').
        """
        dotted_type = basestring if PY2 else bytes
        if isinstance(names, dotted_type):
            names = names.split(b".")
        first = names[0]
        for child in self.atoms:
            if child.name == first:
                return child[names[1:]]
        raise KeyError("%s not found" % first)

    def __repr__(self):
        return "\n".join(map(repr, self.atoms))
class MP4Tags(DictProxy, Metadata):
r"""Dictionary containing Apple iTunes metadata list key/values.
Keys are four byte identifiers, except for freeform ('----')
keys. Values are usually unicode strings, but some atoms have a
special structure:
Text values (multiple values per key are supported):
* '\\xa9nam' -- track title
* '\\xa9alb' -- album
* '\\xa9ART' -- artist
* 'aART' -- album artist
* '\\xa9wrt' -- composer
* '\\xa9day' -- year
* '\\xa9cmt' -- comment
* 'desc' -- description (usually used in podcasts)
* 'purd' -- purchase date
* '\\xa9grp' -- grouping
* '\\xa9gen' -- genre
* '\\xa9lyr' -- lyrics
* 'purl' -- podcast URL
* 'egid' -- podcast episode GUID
* 'catg' -- podcast category
* 'keyw' -- podcast keywords
* '\\xa9too' -- encoded by
* 'cprt' -- copyright
* 'soal' -- album sort order
* 'soaa' -- album artist sort order
* 'soar' -- artist sort order
* 'sonm' -- title sort order
* 'soco' -- composer sort order
* 'sosn' -- show sort order
* 'tvsh' -- show name
Boolean values:
* 'cpil' -- part of a compilation
* 'pgap' -- part of a gapless album
* 'pcst' -- podcast (iTunes reads this only on import)
Tuples of ints (multiple values per key are supported):
* 'trkn' -- track number, total tracks
* 'disk' -- disc number, total discs
Others:
* 'tmpo' -- tempo/BPM, 16 bit int
* 'covr' -- cover artwork, list of MP4Cover objects (which are
tagged strs)
* 'gnre' -- ID3v1 genre. Not supported, use '\\xa9gen' instead.
The freeform '----' frames use a key in the format '----:mean:name'
where 'mean' is usually 'com.apple.iTunes' and 'name' is a unique
identifier for this frame. The value is a str, but is probably
text that can be decoded as UTF-8. Multiple values per key are
supported.
MP4 tag data cannot exist outside of the structure of an MP4 file,
so this class should not be manually instantiated.
Unknown non-text tags are removed.
"""
def load(self, atoms, fileobj):
    """Populate the tags from the children of moov.udta.meta.ilst.

    Raises MP4MetadataError if the ilst atom is missing or an atom's data
    cannot be read completely.
    """
    try:
        ilst = atoms[b"moov.udta.meta.ilst"]
    except KeyError as key:
        raise MP4MetadataError(key)

    for atom in ilst.children:
        payload_size = atom.length - 8
        fileobj.seek(atom.offset + 8)
        data = fileobj.read(payload_size)
        if len(data) != payload_size:
            raise MP4MetadataError("Not enough data")

        if atom.name not in self.__atoms:
            # unknown atom, try as text and skip if it fails
            # FIXME: keep them somehow
            try:
                self.__parse_text(atom, data)
            except MP4MetadataError:
                pass
            continue

        info = self.__atoms[atom.name]
        parse = info[0]
        # Entries may carry extra arguments after the parse/render pair.
        parse(self, atom, data, *info[2:])
@classmethod
def _can_load(cls, atoms):
    """Return whether ``atoms`` contains a moov.udta.meta.ilst atom."""
    ilst_path = b"moov.udta.meta.ilst"
    return ilst_path in atoms
@staticmethod
def __key_sort(item):
    """Return the sort key for a (key, value) tag item."""
    key, value = item
    # iTunes always writes the tags in order of "relevance", try
    # to copy it as closely as possible.
    relevance = [b"\xa9nam", b"\xa9ART", b"\xa9wrt", b"\xa9alb",
                 b"\xa9gen", b"gnre", b"trkn", b"disk",
                 b"\xa9day", b"cpil", b"pgap", b"pcst", b"tmpo",
                 b"\xa9too", b"----", b"covr", b"\xa9lyr"]
    rank = dict((name, position) for position, name in enumerate(relevance))
    # Keys not in the list sort after all known ones.
    unknown_rank = len(rank)
    # If there's no key-based way to distinguish, order by length.
    # If there's still no way, go by string comparison on the
    # values, so we at least have something deterministic.
    shown = repr(value)
    return (rank.get(key[:4], unknown_rank), len(shown), shown)
def save(self, filename):
    """Save the metadata to the given filename."""
    rendered = []
    for key, value in sorted(self.items(), key=self.__key_sort):
        if not PY2 and not isinstance(key, bytes):
            raise MP4MetadataValueError("keys have to be bytes")
        # Keys without a table entry are rendered as text.
        info = self.__atoms.get(key[:4], (None, type(self).__render_text))
        render = info[1]
        try:
            rendered.append(render(self, key, value, *info[2:]))
        except (TypeError, ValueError) as s:
            reraise(MP4MetadataValueError, s, sys.exc_info()[2])
    data = Atom.render(b"ilst", b"".join(rendered))

    # Find the old atoms.
    with open(filename, "rb+") as fileobj:
        atoms = Atoms(fileobj)
        try:
            path = atoms.path(b"moov", b"udta", b"meta", b"ilst")
        except KeyError:
            self.__save_new(fileobj, atoms, data)
        else:
            self.__save_existing(fileobj, atoms, path, data)
def __pad_ilst(self, data, length=None):
    """Render a zero-filled 'free' atom to follow ``data``.

    Without an explicit ``length`` the payload is sized so that
    ``len(data)`` plus the payload is a multiple of 1024.
    """
    if length is None:
        # Distance from len(data) up to the next multiple of 1024.
        length = -len(data) % 1024
    return Atom.render(b"free", b"\x00" * length)
def __save_new(self, fileobj, atoms, ilst):
    """Insert a new meta atom holding ``ilst`` into a file without one.

    The atom goes at the start of moov.udta; if there is no udta atom a
    new one is created at the start of moov.
    """
    hdlr = Atom.render(b"hdlr", b"\x00" * 8 + b"mdirappl" + b"\x00" * 9)
    meta_body = b"\x00\x00\x00\x00" + hdlr + ilst + self.__pad_ilst(ilst)
    new_atom = Atom.render(b"meta", meta_body)
    try:
        path = atoms.path(b"moov", b"udta")
    except KeyError:
        # moov.udta not found -- create one
        path = atoms.path(b"moov")
        new_atom = Atom.render(b"udta", new_atom)

    # Insert right after the 8-byte header of the innermost parent.
    offset = path[-1].offset + 8
    added = len(new_atom)
    insert_bytes(fileobj, added, offset)
    fileobj.seek(offset)
    fileobj.write(new_atom)
    self.__update_parents(fileobj, path, added)
    self.__update_offsets(fileobj, atoms, added, offset)
def __save_existing(self, fileobj, atoms, path, data):
    """Replace the existing ilst atom (the last entry of ``path``) by ``data``.

    "free" atoms directly before or after the old ilst are absorbed into
    the region being rewritten.  If the new data does not fit, the file is
    grown and padded; if it is smaller, the gap is filled with a "free"
    atom so nothing has to move.
    """
    # Replace the old ilst atom.
    ilst = path.pop()
    offset = ilst.offset
    length = ilst.length

    # Check for padding "free" atoms
    meta = path[-1]
    siblings = meta.children
    index = siblings.index(ilst)
    # Test the bounds explicitly: indexing with ``index - 1`` when ilst is
    # the first child would wrap around to the *last* child and treat a
    # trailing "free" atom as if it preceded ilst.
    if index > 0:
        before = siblings[index - 1]
        if before.name == b"free":
            offset = before.offset
            length += before.length
    if index + 1 < len(siblings):
        after = siblings[index + 1]
        if after.name == b"free":
            length += after.length

    delta = len(data) - length
    if delta > 0 or (delta < 0 and delta > -8):
        # Either too big, or the leftover gap is too small to hold even
        # the 8-byte header of a "free" atom: pad and grow the file.
        data += self.__pad_ilst(data)
        delta = len(data) - length
        insert_bytes(fileobj, delta, offset)
    elif delta < 0:
        # Fill the gap exactly (8 bytes go to the "free" atom header).
        data += self.__pad_ilst(data, -delta - 8)
        delta = 0

    fileobj.seek(offset)
    fileobj.write(data)
    self.__update_parents(fileobj, path, delta)
    self.__update_offsets(fileobj, atoms, delta, offset)
def __update_parents(self, fileobj, path, delta):
    """Update all parent atoms with the new size."""
    for parent in path:
        fileobj.seek(parent.offset)
        size = cdata.uint_be(fileobj.read(4))
        if size != 1:
            # 32bit size stored directly in the header
            fileobj.seek(parent.offset)
            fileobj.write(cdata.to_uint_be(size + delta))
            continue
        # 64bit: skip name (4B) and read size (8B)
        size = cdata.ulonglong_be(fileobj.read(12)[4:])
        fileobj.seek(parent.offset + 8)
        fileobj.write(cdata.to_ulonglong_be(size + delta))
def __update_offset_table(self, fileobj, fmt, atom, delta, offset):
    """Update offset table in the specified atom."""
    if atom.offset > offset:
        atom.offset += delta
    fileobj.seek(atom.offset + 12)
    table = fileobj.read(atom.length - 12)
    # The first four bytes hold the number of entries in the table.
    entry_count = cdata.uint_be(table[:4])
    fmt = fmt % entry_count
    shifted = []
    for entry in struct.unpack(fmt, table[4:]):
        # Only entries located after the changed position move.
        shifted.append(entry + delta if offset < entry else entry)
    fileobj.seek(atom.offset + 16)
    fileobj.write(struct.pack(fmt, *shifted))
def __update_tfhd(self, fileobj, atom, delta, offset):
    """Shift the 64-bit offset stored in a 'tfhd' atom, if it has one.

    The offset is only present when bit 0 of the atom's flags is set, and
    only changes when it lies beyond ``offset``.
    """
    if atom.offset > offset:
        atom.offset += delta
    fileobj.seek(atom.offset + 9)
    data = fileobj.read(atom.length - 9)
    # The flags are three bytes wide; pad them to four for decoding.
    flags = cdata.uint_be(b"\x00" + data[:3])
    if not flags & 1:
        return
    stored = cdata.ulonglong_be(data[7:15])
    if stored > offset:
        stored += delta
    fileobj.seek(atom.offset + 16)
    fileobj.write(cdata.to_ulonglong_be(stored))
def __update_offsets(self, fileobj, atoms, delta, offset):
    """Update offset tables in all 'stco' and 'co64' atoms."""
    if delta == 0:
        return
    moov = atoms[b"moov"]
    # 'stco' tables hold 32-bit entries, 'co64' tables 64-bit ones.
    for table_name, fmt in ((b'stco', ">%dI"), (b'co64', ">%dQ")):
        for atom in moov.findall(table_name, True):
            self.__update_offset_table(fileobj, fmt, atom, delta, offset)
    try:
        for atom in atoms[b"moof"].findall(b'tfhd', True):
            self.__update_tfhd(fileobj, atom, delta, offset)
    except KeyError:
        pass
def __parse_data(self, atom, data):
    """Yield (flags, payload) for every 'data' child found in ``data``.

    Raises MP4MetadataError if a child is not a 'data' atom or declares a
    length shorter than an atom header.
    """
    pos = 0
    while pos < atom.length - 8:
        length, name, flags = struct.unpack(">I4sI", data[pos:pos+12])
        if name != b"data":
            raise MP4MetadataError(
                "unexpected atom %r inside %r" % (name, atom.name))
        if length < 8:
            # An atom cannot be shorter than its own 8-byte header; a
            # declared length of 0 would also never advance ``pos`` and
            # make this loop spin forever on a malformed file.
            raise MP4MetadataError(
                "invalid atom length %r inside %r" % (length, atom.name))
        yield flags, data[pos+16:pos+length]
        pos += length
def __render_data(self, key, flags, value):
    """Render each item of ``value`` as a 'data' atom inside atom ``key``.

    Every item is prefixed with ``flags`` and a zero word.
    """
    prefix = struct.pack(">2I", flags, 0)
    children = [Atom.render(b"data", prefix + item) for item in value]
    return Atom.render(key, b"".join(children))
def __parse_freeform(self, atom, data):
    """Parse a freeform atom into a list of MP4FreeForm values.

    The result is stored under ``atomname:mean:name``; nothing is stored
    if the atom has no 'data' children.  Raises MP4MetadataError for an
    unexpected child atom, an unsupported version or an invalid length.
    """
    # First child: the "mean" part of the key, after a 12-byte header.
    length = cdata.uint_be(data[:4])
    mean = data[12:length]
    pos = length
    # Second child: the "name" part of the key.
    length = cdata.uint_be(data[pos:pos+4])
    name = data[pos+12:pos+length]
    pos += length
    value = []
    while pos < atom.length - 8:
        length, atom_name = struct.unpack(">I4s", data[pos:pos+8])
        if atom_name != b"data":
            raise MP4MetadataError(
                "unexpected atom %r inside %r" % (atom_name, atom.name))
        if length < 8:
            # An atom cannot be shorter than its own 8-byte header; a
            # declared length of 0 would also never advance ``pos`` and
            # make this loop spin forever on a malformed file.
            raise MP4MetadataError(
                "invalid atom length %r inside %r" % (length, atom.name))
        version = ord(data[pos+8:pos+8+1])
        if version != 0:
            raise MP4MetadataError("Unsupported version: %r" % version)
        # The flags are three bytes wide; pad them to four for decoding.
        flags = struct.unpack(">I", b"\x00" + data[pos+9:pos+12])[0]
        value.append(MP4FreeForm(data[pos+16:pos+length],
                                 dataformat=flags))
        pos += length
    if value:
        self[atom.name + b":" + mean + b":" + name] = value
def __render_freeform(self, key, value):
    """Render a freeform tag whose key has the form b'----:mean:name'.

    ``value`` may be a single bytes object or an iterable of them; items
    that are not MP4FreeForm instances are written with the text format.
    """
    _, mean, name = key.split(b":", 2)
    mean_atom = struct.pack(">I4sI", len(mean) + 12, b"mean", 0) + mean
    name_atom = struct.pack(">I4sI", len(name) + 12, b"name", 0) + name
    if isinstance(value, bytes):
        value = [value]
    pieces = []
    for item in value:
        if isinstance(item, MP4FreeForm):
            flags = item.dataformat
        else:
            flags = MP4FreeForm.FORMAT_TEXT
        pieces.append(
            struct.pack(">I4s2I", len(item) + 16, b"data", flags, 0))
        pieces.append(item)
    return Atom.render(b"----", mean_atom + name_atom + b"".join(pieces))
def __parse_pair(self, atom, data):
    """Store one tuple of two 16-bit integers per 'data' child."""
    pairs = []
    for flags, payload in self.__parse_data(atom, data):
        # The pair follows two leading bytes.
        pairs.append(struct.unpack(">2H", payload[2:6]))
    self[atom.name] = pairs
def __render_pair(self, key, value):
    """Render (number, total) pairs, each followed by a trailing zero word.

    Raises MP4MetadataValueError if a number does not fit in an unsigned
    16-bit integer.
    """
    rendered = []
    for track, total in value:
        if not (0 <= track < 1 << 16 and 0 <= total < 1 << 16):
            raise MP4MetadataValueError(
                "invalid numeric pair %r" % ((track, total),))
        rendered.append(struct.pack(">4H", 0, track, total, 0))
    return self.__render_data(key, 0, rendered)
def __render_pair_no_trailing(self, key, value):
    """Render (number, total) pairs without the trailing zero word.

    Raises MP4MetadataValueError if a number does not fit in an unsigned
    16-bit integer.
    """
    rendered = []
    for track, total in value:
        if not (0 <= track < 1 << 16 and 0 <= total < 1 << 16):
            raise MP4MetadataValueError(
                "invalid numeric pair %r" % ((track, total),))
        rendered.append(struct.pack(">3H", 0, track, total))
    return self.__render_data(key, 0, rendered)
def __parse_genre(self, atom, data):
    """Translate a numeric 'gnre' atom into a freeform b'\\xa9gen' value.

    The stored number is one-based.  An existing b'\\xa9gen' tag is never
    overwritten, and numbers outside the GENRES table are ignored.
    """
    genre = cdata.short_be(data[16:18])
    if b"\xa9gen" in self:
        return
    index = genre - 1
    # Check the range explicitly: a stored value of 0 (or a negative one)
    # would otherwise be a valid *negative* list index and silently pick
    # a genre from the end of the table.
    if 0 <= index < len(GENRES):
        self[b"\xa9gen"] = [GENRES[index]]
def __parse_tempo(self, atom, data):
    """Store one unsigned 16-bit integer per 'data' child."""
    tempos = []
    for flags, payload in self.__parse_data(atom, data):
        tempos.append(cdata.ushort_be(payload))
    self[atom.name] = tempos
def __render_tempo(self, key, value):
    """Render a list of unsigned 16-bit integers.

    Raises MP4MetadataValueError if a number is out of range or ``value``
    cannot be handled as a list of integers.
    """
    try:
        if len(value) == 0:
            return self.__render_data(key, 0x15, b"")

        out_of_range = min(value) < 0 or max(value) >= 2**16
        if out_of_range:
            raise MP4MetadataValueError(
                "invalid 16 bit integers: %r" % value)
    except TypeError:
        raise MP4MetadataValueError(
            "tmpo must be a list of 16 bit integers")

    packed = [cdata.to_ushort_be(tempo) for tempo in value]
    return self.__render_data(key, 0x15, packed)
def __parse_bool(self, atom, data):
    """Store byte 16 of ``data`` as a boolean; a missing byte means False."""
    try:
        flag = bool(ord(data[16:17]))
    except TypeError:
        # ord() rejects the empty slice produced when byte 16 is absent.
        flag = False
    self[atom.name] = flag
def __render_bool(self, key, value):
    """Render the truthiness of ``value`` as a single-byte atom."""
    flag_byte = chr_(bool(value))
    return self.__render_data(key, 0x15, [flag_byte])
def __parse_cover(self, atom, data):
    """Parse a cover atom into a list of MP4Cover objects.

    'name' children are skipped; any other child that is not a 'data'
    atom raises MP4MetadataError, as does a child with an invalid length.
    Image formats other than JPEG and PNG are treated as JPEG.
    """
    self[atom.name] = []
    pos = 0
    while pos < atom.length - 8:
        length, name, imageformat = struct.unpack(">I4sI",
                                                  data[pos:pos+12])
        if length < 8:
            # An atom cannot be shorter than its own 8-byte header; a
            # declared length of 0 would also never advance ``pos`` and
            # make this loop spin forever on a malformed file.
            raise MP4MetadataError(
                "invalid atom length %r inside 'covr'" % length)
        if name != b"data":
            if name == b"name":
                pos += length
                continue
            raise MP4MetadataError(
                "unexpected atom %r inside 'covr'" % name)
        if imageformat not in (MP4Cover.FORMAT_JPEG, MP4Cover.FORMAT_PNG):
            imageformat = MP4Cover.FORMAT_JPEG
        cover = MP4Cover(data[pos+16:pos+length], imageformat)
        self[atom.name].append(cover)
        pos += length
def __render_cover(self, key, value):
    """Render a list of covers; items without ``imageformat`` count as JPEG."""

    def render_one(cover):
        imageformat = getattr(cover, "imageformat", MP4Cover.FORMAT_JPEG)
        header = struct.pack(">2I", imageformat, 0)
        return Atom.render(b"data", header + cover)

    return Atom.render(key, b"".join([render_one(c) for c in value]))
def __parse_text(self, atom, data, expected_flags=1):
    """Decode the text entries of an atom as UTF-8.

    Only entries whose flags equal `expected_flags` are kept; invalid
    byte sequences are replaced.  Nothing is stored when no entry
    matches.
    """
    decoded = []
    for flags, text in self.__parse_data(atom, data):
        if flags != expected_flags:
            continue
        decoded.append(text.decode('utf-8', 'replace'))
    if decoded:
        self[atom.name] = decoded
def __render_text(self, key, value, flags=1):
    """Render one string or a list of strings as UTF-8 data entries."""
    items = [value] if isinstance(value, string_types) else value
    encoded = [utf8(item) for item in items]
    return self.__render_data(key, flags, encoded)
def delete(self, filename):
    """Clear all tags held by this object and save that to `filename`."""
    # Empty the in-memory tags first, then persist the empty set.
    self.clear()
    self.save(filename)
# Dispatch table keyed by atom name.  Each value is a tuple of
# (parse function, render function) with an optional third element.
# NOTE(review): the third element (0 for "purl"/"egid") is presumably
# passed on as the flags argument of the text parser/renderer --
# confirm against the code that consumes this table.
__atoms = {
    b"----": (__parse_freeform, __render_freeform),
    b"trkn": (__parse_pair, __render_pair),
    b"disk": (__parse_pair, __render_pair_no_trailing),
    # "gnre" has a parser only; its render slot is None.
    b"gnre": (__parse_genre, None),
    b"tmpo": (__parse_tempo, __render_tempo),
    b"cpil": (__parse_bool, __render_bool),
    b"pgap": (__parse_bool, __render_bool),
    b"pcst": (__parse_bool, __render_bool),
    b"covr": (__parse_cover, __render_cover),
    b"purl": (__parse_text, __render_text, 0),
    b"egid": (__parse_text, __render_text, 0),
}

# the text atoms we know about which should make loading fail if parsing
# any of them fails
for name in [b"\xa9nam", b"\xa9alb", b"\xa9ART", b"aART", b"\xa9wrt",
             b"\xa9day", b"\xa9cmt", b"desc", b"purd", b"\xa9grp",
             b"\xa9gen", b"\xa9lyr", b"catg", b"keyw", b"\xa9too",
             b"cprt", b"soal", b"soaa", b"soar", b"sonm", b"soco",
             b"sosn", b"tvsh"]:
    # All of these share the plain text parser/renderer pair.
    __atoms[name] = (__parse_text, __render_text)
def pprint(self):
    """Return the tags as 'key=value' lines joined by newlines.

    Cover art is summarised by its size in bytes, list values are
    joined with ' / ', and any other value is formatted as is.
    """
    lines = []
    for raw_key, value in self.iteritems():
        key = raw_key.decode('latin1', "replace")
        if key == "covr":
            sizes = ["[%d bytes of data]" % len(image) for image in value]
            rendered = ", ".join(sizes)
        elif isinstance(value, list):
            rendered = " / ".join(map(text_type, value))
        else:
            rendered = value
        lines.append("%s=%s" % (key, rendered))
    return "\n".join(lines)
class MP4Info(StreamInfo):
    """MPEG-4 stream information.

    Attributes:

    * bitrate -- bitrate in bits per second, as an int
    * length -- file length in seconds, as a float
    * channels -- number of audio channels
    * sample_rate -- audio sampling rate in Hz
    * bits_per_sample -- bits per sample
    """

    bitrate = 0
    channels = 0
    sample_rate = 0
    bits_per_sample = 0

    def __init__(self, atoms, fileobj):
        """Read stream information for the first 'soun' track.

        Raises MP4StreamInfoError when no track has a sound handler.
        """
        audio_trak = None
        for candidate in list(atoms[b"moov"].findall(b"trak")):
            hdlr = candidate[b"mdia", b"hdlr"]
            fileobj.seek(hdlr.offset)
            if fileobj.read(hdlr.length)[16:20] == b"soun":
                audio_trak = candidate
                break
        if audio_trak is None:
            raise MP4StreamInfoError("track has no audio data")

        mdhd = audio_trak[b"mdia", b"mdhd"]
        fileobj.seek(mdhd.offset)
        header = fileobj.read(mdhd.length)
        # The version byte selects the field layout: the second field
        # is 32 bit for version 0 and 64 bit otherwise.
        if ord(header[8:9]) == 0:
            offset, fmt = 20, ">2I"
        else:
            offset, fmt = 28, ">IQ"
        end = offset + struct.calcsize(fmt)
        unit, length = struct.unpack(fmt, header[offset:end])
        self.length = float(length) / unit

        try:
            stsd = audio_trak[b"mdia", b"minf", b"stbl", b"stsd"]
            fileobj.seek(stsd.offset)
            data = fileobj.read(stsd.length)
            if data[20:24] == b"mp4a":
                self.__parse_mp4a(data)
        except (ValueError, KeyError):
            # stsd atoms are optional
            pass

    def __parse_mp4a(self, data):
        """Fill in channel, sample and bitrate info from an mp4a entry."""
        # Decoded as in the original code; the value is not used.
        entry_size = cdata.uint_be(data[16:20])
        (self.channels, self.bits_per_sample, _,
         self.sample_rate) = struct.unpack(">3HI", data[40:50])

        # ES descriptor type
        if not (data[56:60] == b"esds" and ord(data[64:65]) == 0x03):
            return
        pos = 65
        # skip extended descriptor type tag, length, ES ID
        # and stream priority
        if data[pos:pos + 3] == b"\x80\x80\x80":
            pos += 3
        pos += 4

        # decoder config descriptor type
        if ord(data[pos:pos + 1]) != 0x04:
            return
        pos += 1
        # skip extended descriptor type tag, length,
        # object type ID, stream type, buffer size
        # and maximum bitrate
        if data[pos:pos + 3] == b"\x80\x80\x80":
            pos += 3
        pos += 10
        # average bitrate
        self.bitrate = cdata.uint_be(data[pos:pos + 4])

    def pprint(self):
        """Return a one-line summary of length and bitrate."""
        summary = (self.length, self.bitrate)
        return "MPEG-4 audio, %.2f seconds, %d bps" % summary
class MP4(FileType):
    """An MPEG-4 audio file, probably containing AAC.

    If more than one track is present in the file, the first is used.
    Only audio ('soun') tracks will be read.

    :ivar info: :class:`MP4Info`
    :ivar tags: :class:`MP4Tags`
    """

    MP4Tags = MP4Tags

    _mimes = ["audio/mp4", "audio/x-m4a", "audio/mpeg4", "audio/aac"]

    def load(self, filename):
        """Read stream info and, if present, tags from `filename`.

        Errors derived from `error` pass through unchanged; any other
        exception is re-raised as MP4StreamInfoError (while reading the
        stream info) or MP4MetadataError (while reading the tags).
        """
        self.filename = filename
        with open(filename, "rb") as fileobj:
            atoms = Atoms(fileobj)
            # ftyp is always the first atom in a valid MP4 file
            starts_with_ftyp = (
                bool(atoms.atoms) and atoms.atoms[0].name == b"ftyp")
            if not starts_with_ftyp:
                raise error("Not a MP4 file")

            try:
                self.info = MP4Info(atoms, fileobj)
            except error:
                raise
            except Exception as err:
                reraise(MP4StreamInfoError, err, sys.exc_info()[2])

            if MP4Tags._can_load(atoms):
                try:
                    self.tags = self.MP4Tags(atoms, fileobj)
                except error:
                    raise
                except Exception as err:
                    reraise(MP4MetadataError, err, sys.exc_info()[2])
            else:
                self.tags = None

    def add_tags(self):
        """Attach an empty tag object; fails if tags already exist."""
        if self.tags is not None:
            raise error("an MP4 tag already exists")
        self.tags = self.MP4Tags()

    @staticmethod
    def score(filename, fileobj, header):
        """Score 0-2: one point each for 'ftyp' and 'mp4' in the header."""
        has_ftyp = b"ftyp" in header
        has_mp4 = b"mp4" in header
        return has_ftyp + has_mp4
Open = MP4


def delete(filename):
    """Remove tags from a file."""
    mp4_file = MP4(filename)
    mp4_file.delete()
@@ -0,0 +1,260 @@
# A Musepack reader/tagger
#
# Copyright 2006 Lukas Lalinsky <lalinsky@gmail.com>
# Copyright 2012 Christoph Reiter <christoph.reiter@gmx.at>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Musepack audio streams with APEv2 tags.
Musepack is an audio format originally based on the MPEG-1 Layer-2
algorithms. Stream versions 4 through 7 are supported.
For more information, see http://www.musepack.net/.
"""
__all__ = ["Musepack", "Open", "delete"]
import struct
from ._compat import endswith
from mutagen import StreamInfo
from mutagen.apev2 import APEv2File, error, delete
from mutagen.id3 import BitPaddedInt
from mutagen._util import cdata
from ._compat import xrange
class MusepackHeaderError(error):
    """Raised when a Musepack stream header cannot be parsed."""
# Sample rates in Hz; indexed by the sample frequency bits read from
# the SV7 flags word and from the SV8 stream header.
RATES = [44100, 48000, 37800, 32000]
def _parse_sv8_int(fileobj, limit=9):
    """Read a variable-length integer of at most `limit` bytes.

    Bytes are consumed until one has its most significant bit cleared.
    The lower 7 bits of every byte are merged, first byte most
    significant, into one unsigned integer.

    Raises ValueError if no byte with a zero MSB is found within
    `limit` bytes, or EOFError if the file ends before that.

    Returns (parsed number, number of bytes read).
    """
    value = 0
    consumed = 0
    while consumed < limit:
        byte = fileobj.read(1)
        if len(byte) != 1:
            raise EOFError
        consumed += 1
        octet = ord(byte)
        value = (value << 7) | (octet & 0x7F)
        if not octet & 0x80:
            return value, consumed
    if limit > 0:
        raise ValueError
    return 0, 0
def _calc_sv8_gain(gain):
    """Convert a raw SV8 gain field into a gain value."""
    # 64.82 taken from mpcdec
    scaled = gain / 256.0
    return 64.82 - scaled
def _calc_sv8_peak(peak):
    """Convert a raw SV8 peak field into a fraction of 65535."""
    exponent = peak / (256.0 * 20.0)
    return 10 ** exponent / 65535.0
class MusepackInfo(StreamInfo):
    """Musepack stream information.

    Attributes:

    * channels -- number of audio channels
    * length -- file length in seconds, as a float
    * sample_rate -- audio sampling rate in Hz
    * bitrate -- audio bitrate, in bits per second
    * version -- Musepack stream version

    Optional Attributes:

    * title_gain, title_peak -- Replay Gain and peak data for this song
    * album_gain, album_peak -- Replay Gain and peak data for this album

    These attributes are only available in stream version 7/8. The
    gains are a float, +/- some dB. The peaks are a percentage [0..1] of
    the maximum amplitude. This means to get a number comparable to
    VorbisGain, you must multiply the peak by 2.
    """

    def __init__(self, fileobj):
        """Parse the stream header found at the start of `fileobj`.

        A leading ID3v2 tag is skipped.  Raises MusepackHeaderError if
        the data is not a recognisable Musepack stream.
        """
        header = fileobj.read(4)
        if len(header) != 4:
            raise MusepackHeaderError("not a Musepack file")

        # Skip ID3v2 tags
        if header[:3] == b"ID3":
            header = fileobj.read(6)
            if len(header) != 6:
                raise MusepackHeaderError("not a Musepack file")
            size = 10 + BitPaddedInt(header[2:6])
            fileobj.seek(size)
            header = fileobj.read(4)
            if len(header) != 4:
                raise MusepackHeaderError("not a Musepack file")

        if header.startswith(b"MPCK"):
            self.__parse_sv8(fileobj)
        else:
            self.__parse_sv467(fileobj)

        # No bitrate from the header: estimate it from the file size.
        if not self.bitrate and self.length != 0:
            fileobj.seek(0, 2)
            self.bitrate = int(round(fileobj.tell() * 8 / self.length))

    def __parse_sv8(self, fileobj):
        """Walk the SV8 packets until both mandatory ones were parsed.

        Raises MusepackHeaderError on invalid keys or sizes, on a
        repeated mandatory packet, or when a mandatory packet is
        missing before the audio ("AP") or stream end ("SE") packet.
        """
        # SV8 http://trac.musepack.net/trac/wiki/SV8Specification

        key_size = 2
        mandatory_packets = [b"SH", b"RG"]

        def check_frame_key(key):
            # Validate the key that was passed in rather than relying
            # on the enclosing scope's current frame_type.
            if len(key) != key_size or not b'AA' <= key <= b'ZZ':
                raise MusepackHeaderError("Invalid frame key.")

        def claim_mandatory(key):
            # A second SH/RG packet would make list.remove() raise a
            # bare ValueError; report it as a header error instead.
            if key not in mandatory_packets:
                raise MusepackHeaderError("Duplicate %r packet." % key)
            mandatory_packets.remove(key)

        frame_type = fileobj.read(key_size)
        check_frame_key(frame_type)

        while frame_type not in (b"AP", b"SE") and mandatory_packets:
            try:
                frame_size, slen = _parse_sv8_int(fileobj)
            except (EOFError, ValueError):
                raise MusepackHeaderError("Invalid packet size.")
            data_size = frame_size - key_size - slen
            # The packet size covers the key and the size field, so the
            # payload size cannot be negative.  A negative value would
            # seek backwards below and could loop forever.
            if data_size < 0:
                raise MusepackHeaderError("Invalid packet size.")

            if frame_type == b"SH":
                claim_mandatory(frame_type)
                self.__parse_stream_header(fileobj, data_size)
            elif frame_type == b"RG":
                claim_mandatory(frame_type)
                self.__parse_replaygain_packet(fileobj, data_size)
            else:
                fileobj.seek(data_size, 1)

            frame_type = fileobj.read(key_size)
            check_frame_key(frame_type)

        if mandatory_packets:
            raise MusepackHeaderError("Missing mandatory packets: %s." %
                                      ", ".join(map(repr, mandatory_packets)))

        self.length = float(self.samples) / self.sample_rate
        self.bitrate = 0

    def __parse_stream_header(self, fileobj, data_size):
        """Parse an SH packet: version, sample counts, rate, channels."""
        # The first four payload bytes are skipped.
        fileobj.seek(4, 1)
        try:
            self.version = ord(fileobj.read(1))
        except TypeError:
            raise MusepackHeaderError("SH packet ended unexpectedly.")

        try:
            samples, l1 = _parse_sv8_int(fileobj)
            samples_skip, l2 = _parse_sv8_int(fileobj)
        except (EOFError, ValueError):
            raise MusepackHeaderError(
                "SH packet: Invalid sample counts.")

        left_size = data_size - 5 - l1 - l2
        if left_size != 2:
            raise MusepackHeaderError("Invalid SH packet size.")

        data = fileobj.read(left_size)
        if len(data) != left_size:
            raise MusepackHeaderError("SH packet ended unexpectedly.")

        # The rate index is 3 bits wide but RATES has fewer entries.
        try:
            self.sample_rate = RATES[ord(data[-2:-1]) >> 5]
        except IndexError:
            raise MusepackHeaderError("SH packet: Invalid sample rate.")
        self.channels = (ord(data[-1:]) >> 4) + 1
        self.samples = samples - samples_skip

    def __parse_replaygain_packet(self, fileobj, data_size):
        """Parse an RG packet; zero-valued fields are left unset."""
        # Validate the size before reading the payload.
        if data_size != 9:
            raise MusepackHeaderError("Invalid RG packet size.")
        data = fileobj.read(data_size)
        if len(data) != data_size:
            raise MusepackHeaderError("RG packet ended unexpectedly.")
        title_gain = cdata.short_be(data[1:3])
        title_peak = cdata.short_be(data[3:5])
        album_gain = cdata.short_be(data[5:7])
        album_peak = cdata.short_be(data[7:9])
        if title_gain:
            self.title_gain = _calc_sv8_gain(title_gain)
        if title_peak:
            self.title_peak = _calc_sv8_peak(title_peak)
        if album_gain:
            self.album_gain = _calc_sv8_gain(album_gain)
        if album_peak:
            self.album_peak = _calc_sv8_peak(album_peak)

    def __parse_sv467(self, fileobj):
        """Parse the fixed 32 byte header of stream versions 4 to 7."""
        # Rewind over the four bytes __init__ already consumed.
        fileobj.seek(-4, 1)
        header = fileobj.read(32)
        if len(header) != 32:
            raise MusepackHeaderError("not a Musepack file")

        # SV7
        if header.startswith(b"MP+"):
            self.version = ord(header[3:4]) & 0xF
            if self.version < 7:
                raise MusepackHeaderError("not a Musepack file")
            frames = cdata.uint_le(header[4:8])
            flags = cdata.uint_le(header[8:12])

            self.title_peak, self.title_gain = struct.unpack(
                "<Hh", header[12:16])
            self.album_peak, self.album_gain = struct.unpack(
                "<Hh", header[16:20])
            self.title_gain /= 100.0
            self.album_gain /= 100.0
            self.title_peak /= 65535.0
            self.album_peak /= 65535.0

            self.sample_rate = RATES[(flags >> 16) & 0x0003]
            self.bitrate = 0
        # SV4-SV6
        else:
            header_dword = cdata.uint_le(header[0:4])
            self.version = (header_dword >> 11) & 0x03FF
            if self.version < 4 or self.version > 6:
                raise MusepackHeaderError("not a Musepack file")
            self.bitrate = (header_dword >> 23) & 0x01FF
            self.sample_rate = 44100
            if self.version >= 5:
                frames = cdata.uint_le(header[4:8])
            else:
                frames = cdata.ushort_le(header[6:8])
            if self.version < 6:
                frames -= 1
        self.channels = 2
        self.length = float(frames * 1152 - 576) / self.sample_rate

    def pprint(self):
        """Return a one-line summary including any Replay Gain data."""
        rg_data = []
        if hasattr(self, "title_gain"):
            rg_data.append("%+0.2f (title)" % self.title_gain)
        if hasattr(self, "album_gain"):
            rg_data.append("%+0.2f (album)" % self.album_gain)
        rg_text = (", Gain: " + ", ".join(rg_data)) if rg_data else ""

        return "Musepack SV%d, %.2f seconds, %d Hz, %d bps%s" % (
            self.version, self.length, self.sample_rate, self.bitrate,
            rg_text)
class Musepack(APEv2File):
    """A Musepack file with APEv2 tags."""

    _Info = MusepackInfo
    _mimes = ["audio/x-musepack", "audio/x-mpc"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score 0-3: SV7 magic, SV8 magic and the '.mpc' extension."""
        sv7_magic = header.startswith(b"MP+")
        sv8_magic = header.startswith(b"MPCK")
        mpc_suffix = endswith(filename.lower(), b".mpc")
        return sv7_magic + sv8_magic + mpc_suffix


Open = Musepack
@@ -0,0 +1,506 @@
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg bitstreams and pages.
This module reads and writes a subset of the Ogg bitstream format
version 0. It does *not* read or write Ogg Vorbis files! For that,
you should use mutagen.oggvorbis.
This implementation is based on the RFC 3533 standard found at
http://www.xiph.org/ogg/doc/rfc3533.txt.
"""
import struct
import sys
import zlib
from mutagen import FileType
from mutagen._util import cdata, insert_bytes, delete_bytes
from ._compat import cBytesIO, reraise, chr_
class error(IOError):
    """Ogg stream parsing errors."""
class OggPage(object):
"""A single Ogg page (not necessarily a single encoded packet).
A page is a header of 26 bytes, followed by the length of the
data, followed by the data.
The constructor is givin a file-like object pointing to the start
of an Ogg page. After the constructor is finished it is pointing
to the start of the next page.
Attributes:
* version -- stream structure version (currently always 0)
* position -- absolute stream position (default -1)
* serial -- logical stream serial number (default 0)
* sequence -- page sequence number within logical stream (default 0)
* offset -- offset this page was read from (default None)
* complete -- if the last packet on this page is complete (default True)
* packets -- list of raw packet data (default [])
Note that if 'complete' is false, the next page's 'continued'
property must be true (so set both when constructing pages).
If a file-like object is supplied to the constructor, the above
attributes will be filled in based on it.
"""
version = 0
__type_flags = 0
position = 0
serial = 0
sequence = 0
offset = None
complete = True
def __init__(self, fileobj=None):
self.packets = []
if fileobj is None:
return
self.offset = fileobj.tell()
header = fileobj.read(27)
if len(header) == 0:
raise EOFError
try:
(oggs, self.version, self.__type_flags, self.position,
self.serial, self.sequence, crc, segments) = struct.unpack(
"<4sBBqIIiB", header)
except struct.error:
raise error("unable to read full header; got %r" % header)
if oggs != b"OggS":
raise error("read %r, expected %r, at 0x%x" % (
oggs, b"OggS", fileobj.tell() - 27))
if self.version != 0:
raise error("version %r unsupported" % self.version)
total = 0
lacings = []
lacing_bytes = fileobj.read(segments)
if len(lacing_bytes) != segments:
raise error("unable to read %r lacing bytes" % segments)
for c in bytearray(lacing_bytes):
total += c
if c < 255:
lacings.append(total)
total = 0
if total:
lacings.append(total)
self.complete = False
self.packets = [fileobj.read(l) for l in lacings]
if [len(p) for p in self.packets] != lacings:
raise error("unable to read full data")
def __eq__(self, other):
"""Two Ogg pages are the same if they write the same data."""
try:
return (self.write() == other.write())
except AttributeError:
return False
__hash__ = object.__hash__
def __repr__(self):
attrs = ['version', 'position', 'serial', 'sequence', 'offset',
'complete', 'continued', 'first', 'last']
values = ["%s=%r" % (attr, getattr(self, attr)) for attr in attrs]
return "<%s %s, %d bytes in %d packets>" % (
type(self).__name__, " ".join(values), sum(map(len, self.packets)),
len(self.packets))
def write(self):
"""Return a string encoding of the page header and data.
A ValueError is raised if the data is too big to fit in a
single page.
"""
data = [
struct.pack("<4sBBqIIi", b"OggS", self.version, self.__type_flags,
self.position, self.serial, self.sequence, 0)
]
lacing_data = []
for datum in self.packets:
quot, rem = divmod(len(datum), 255)
lacing_data.append(b"\xff" * quot + chr_(rem))
lacing_data = b"".join(lacing_data)
if not self.complete and lacing_data.endswith(b"\x00"):
lacing_data = lacing_data[:-1]
data.append(chr_(len(lacing_data)))
data.append(lacing_data)
data.extend(self.packets)
data = b"".join(data)
# Python's CRC is swapped relative to Ogg's needs.
# crc32 returns uint prior to py2.6 on some platforms, so force uint
crc = (~zlib.crc32(data.translate(cdata.bitswap), -1)) & 0xffffffff
# Although we're using to_uint_be, this actually makes the CRC
# a proper le integer, since Python's CRC is byteswapped.
crc = cdata.to_uint_be(crc).translate(cdata.bitswap)
data = data[:22] + crc + data[26:]
return data
@property
def size(self):
"""Total frame size."""
size = 27 # Initial header size
for datum in self.packets:
quot, rem = divmod(len(datum), 255)
size += quot + 1
if not self.complete and rem == 0:
# Packet contains a multiple of 255 bytes and is not
# terminated, so we don't have a \x00 at the end.
size -= 1
size += sum(map(len, self.packets))
return size
def __set_flag(self, bit, val):
mask = 1 << bit
if val:
self.__type_flags |= mask
else:
self.__type_flags &= ~mask
continued = property(
lambda self: cdata.test_bit(self.__type_flags, 0),
lambda self, v: self.__set_flag(0, v),
doc="The first packet is continued from the previous page.")
first = property(
lambda self: cdata.test_bit(self.__type_flags, 1),
lambda self, v: self.__set_flag(1, v),
doc="This is the first page of a logical bitstream.")
last = property(
lambda self: cdata.test_bit(self.__type_flags, 2),
lambda self, v: self.__set_flag(2, v),
doc="This is the last page of a logical bitstream.")
@classmethod
def renumber(klass, fileobj, serial, start):
"""Renumber pages belonging to a specified logical stream.
fileobj must be opened with mode r+b or w+b.
Starting at page number 'start', renumber all pages belonging
to logical stream 'serial'. Other pages will be ignored.
fileobj must point to the start of a valid Ogg page; any
occuring after it and part of the specified logical stream
will be numbered. No adjustment will be made to the data in
the pages nor the granule position; only the page number, and
so also the CRC.
If an error occurs (e.g. non-Ogg data is found), fileobj will
be left pointing to the place in the stream the error occured,
but the invalid data will be left intact (since this function
does not change the total file size).
"""
number = start
while True:
try:
page = OggPage(fileobj)
except EOFError:
break
else:
if page.serial != serial:
# Wrong stream, skip this page.
continue
# Changing the number can't change the page size,
# so seeking back based on the current size is safe.
fileobj.seek(-page.size, 1)
page.sequence = number
fileobj.write(page.write())
fileobj.seek(page.offset + page.size, 0)
number += 1
@classmethod
def to_packets(klass, pages, strict=False):
"""Construct a list of packet data from a list of Ogg pages.
If strict is true, the first page must start a new packet,
and the last page must end the last packet.
"""
serial = pages[0].serial
sequence = pages[0].sequence
packets = []
if strict:
if pages[0].continued:
raise ValueError("first packet is continued")
if not pages[-1].complete:
raise ValueError("last packet does not complete")
elif pages and pages[0].continued:
packets.append([b""])
for page in pages:
if serial != page.serial:
raise ValueError("invalid serial number in %r" % page)
elif sequence != page.sequence:
raise ValueError("bad sequence number in %r" % page)
else:
sequence += 1
if page.continued:
packets[-1].append(page.packets[0])
else:
packets.append([page.packets[0]])
packets.extend([[p] for p in page.packets[1:]])
return [b"".join(p) for p in packets]
@classmethod
def from_packets(klass, packets, sequence=0,
default_size=4096, wiggle_room=2048):
"""Construct a list of Ogg pages from a list of packet data.
The algorithm will generate pages of approximately
default_size in size (rounded down to the nearest multiple of
255). However, it will also allow pages to increase to
approximately default_size + wiggle_room if allowing the
wiggle room would finish a packet (only one packet will be
finished in this way per page; if the next packet would fit
into the wiggle room, it still starts on a new page).
This method reduces packet fragmentation when packet sizes are
slightly larger than the default page size, while still
ensuring most pages are of the average size.
Pages are numbered started at 'sequence'; other information is
uninitialized.
"""
chunk_size = (default_size // 255) * 255
pages = []
page = OggPage()
page.sequence = sequence
for packet in packets:
page.packets.append(b"")
while packet:
data, packet = packet[:chunk_size], packet[chunk_size:]
if page.size < default_size and len(page.packets) < 255:
page.packets[-1] += data
else:
# If we've put any packet data into this page yet,
# we need to mark it incomplete. However, we can
# also have just started this packet on an already
# full page, in which case, just start the new
# page with this packet.
if page.packets[-1]:
page.complete = False
if len(page.packets) == 1:
page.position = -1
else:
page.packets.pop(-1)
pages.append(page)
page = OggPage()
page.continued = not pages[-1].complete
page.sequence = pages[-1].sequence + 1
page.packets.append(data)
if len(packet) < wiggle_room:
page.packets[-1] += packet
packet = b""
if page.packets:
pages.append(page)
return pages
@classmethod
def replace(klass, fileobj, old_pages, new_pages):
"""Replace old_pages with new_pages within fileobj.
old_pages must have come from reading fileobj originally.
new_pages are assumed to have the 'same' data as old_pages,
and so the serial and sequence numbers will be copied, as will
the flags for the first and last pages.
fileobj will be resized and pages renumbered as necessary. As
such, it must be opened r+b or w+b.
"""
# Number the new pages starting from the first old page.
first = old_pages[0].sequence
for page, seq in zip(new_pages, range(first, first + len(new_pages))):
page.sequence = seq
page.serial = old_pages[0].serial
new_pages[0].first = old_pages[0].first
new_pages[0].last = old_pages[0].last
new_pages[0].continued = old_pages[0].continued
new_pages[-1].first = old_pages[-1].first
new_pages[-1].last = old_pages[-1].last
new_pages[-1].complete = old_pages[-1].complete
if not new_pages[-1].complete and len(new_pages[-1].packets) == 1:
new_pages[-1].position = -1
new_data = b"".join(map(klass.write, new_pages))
# Make room in the file for the new data.
delta = len(new_data)
fileobj.seek(old_pages[0].offset, 0)
insert_bytes(fileobj, delta, old_pages[0].offset)
fileobj.seek(old_pages[0].offset, 0)
fileobj.write(new_data)
new_data_end = old_pages[0].offset + delta
# Go through the old pages and delete them. Since we shifted
# the data down the file, we need to adjust their offsets. We
# also need to go backwards, so we don't adjust the deltas of
# the other pages.
old_pages.reverse()
for old_page in old_pages:
adj_offset = old_page.offset + delta
delete_bytes(fileobj, old_page.size, adj_offset)
# Finally, if there's any discrepency in length, we need to
# renumber the pages for the logical stream.
if len(old_pages) != len(new_pages):
fileobj.seek(new_data_end, 0)
serial = new_pages[-1].serial
sequence = new_pages[-1].sequence + 1
klass.renumber(fileobj, serial, sequence)
@classmethod
def find_last(klass, fileobj, serial):
"""Find the last page of the stream 'serial'.
If the file is not multiplexed this function is fast. If it is,
it must read the whole the stream.
This finds the last page in the actual file object, or the last
page in the stream (with eos set), whichever comes first.
"""
# For non-muxed streams, look at the last page.
try:
fileobj.seek(-256*256, 2)
except IOError:
# The file is less than 64k in length.
fileobj.seek(0)
data = fileobj.read()
try:
index = data.rindex(b"OggS")
except ValueError:
raise error("unable to find final Ogg header")
stringobj = cBytesIO(data[index:])
best_page = None
try:
page = OggPage(stringobj)
except error:
pass
else:
if page.serial == serial:
if page.last:
return page
else:
best_page = page
else:
best_page = None
# The stream is muxed, so use the slow way.
fileobj.seek(0)
try:
page = OggPage(fileobj)
while not page.last:
page = OggPage(fileobj)
while page.serial != serial:
page = OggPage(fileobj)
best_page = page
return page
except error:
return best_page
except EOFError:
return best_page
class OggFileType(FileType):
    """An generic Ogg file."""

    # Filled in by subclasses: the stream info class, the tag class
    # and the exception type that Ogg-level errors are re-raised as.
    _Info = None
    _Tags = None
    _Error = None
    _mimes = ["application/ogg", "application/x-ogg"]

    def load(self, filename):
        """Load file information from a filename."""
        self.filename = filename
        with open(filename, "rb") as fileobj:
            try:
                self.info = self._Info(fileobj)
                self.tags = self._Tags(fileobj, self.info)
                self.info._post_tags(fileobj)
            except error as e:
                reraise(self._Error, e, sys.exc_info()[2])
            except EOFError:
                raise self._Error("no appropriate stream found")

    def delete(self, filename=None):
        """Remove tags from a file.

        If no filename is given, the one most recently loaded is used.
        """
        if filename is None:
            filename = self.filename

        self.tags.clear()
        with open(filename, "rb+") as fileobj:
            try:
                self.tags._inject(fileobj)
            except error as e:
                reraise(self._Error, e, sys.exc_info()[2])
            except EOFError:
                raise self._Error("no appropriate stream found")

    def save(self, filename=None):
        """Save a tag to a file.

        If no filename is given, the one most recently loaded is used.
        """
        if filename is None:
            filename = self.filename
        with open(filename, "rb+") as fileobj:
            try:
                self.tags._inject(fileobj)
            except error as e:
                reraise(self._Error, e, sys.exc_info()[2])
            except EOFError:
                raise self._Error("no appropriate stream found")
@@ -0,0 +1,148 @@
# Ogg FLAC support.
#
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg FLAC comments.
This module handles FLAC files wrapped in an Ogg bitstream. The first
FLAC stream found is used. For 'naked' FLACs, see mutagen.flac.
This module is based off the specification at
http://flac.sourceforge.net/ogg_mapping.html.
"""
__all__ = ["OggFLAC", "Open", "delete"]
import struct
from ._compat import cBytesIO
from mutagen import flac
from mutagen.flac import VCFLACDict, StrictFileObject
from mutagen.ogg import OggPage, OggFileType, error as OggError
class error(OggError):
    """Base class for Ogg FLAC errors."""
class OggFLACHeaderError(error):
    """Raised when the Ogg FLAC header is invalid or unsupported."""
class OggFLACStreamInfo(flac.StreamInfo):
    """Ogg FLAC general header and stream info.

    This encompasses the Ogg wrapper for the FLAC STREAMINFO metadata
    block, as well as the Ogg codec setup that precedes it.

    Attributes (in addition to StreamInfo's):

    * packets -- number of metadata packets
    * serial -- Ogg logical stream serial number
    """

    packets = 0
    serial = 0

    def load(self, data):
        """Locate the first Ogg FLAC header page and parse it.

        Raises OggFLACHeaderError for a bad FLAC marker or an
        unsupported mapping version.
        """
        # Ogg expects file objects that don't raise on read
        if isinstance(data, StrictFileObject):
            data = data._fileobj

        page = OggPage(data)
        while not page.packets[0].startswith(b"\x7FFLAC"):
            page = OggPage(data)
        # `marker` avoids shadowing the imported `flac` module.
        major, minor, self.packets, marker = struct.unpack(
            ">BBH4s", page.packets[0][5:13])
        if marker != b"fLaC":
            raise OggFLACHeaderError("invalid FLAC marker (%r)" % marker)
        elif (major, minor) != (1, 0):
            raise OggFLACHeaderError(
                "unknown mapping version: %d.%d" % (major, minor))
        self.serial = page.serial

        # Skip over the block header.
        stringobj = StrictFileObject(cBytesIO(page.packets[0][17:]))
        super(OggFLACStreamInfo, self).load(stringobj)

    def _post_tags(self, fileobj):
        """Derive the length from the last page if it is still unknown.

        Raises OggFLACHeaderError if no page of this stream is found.
        """
        if self.length:
            return
        page = OggPage.find_last(fileobj, self.serial)
        # find_last() returns None when it cannot find any page of the
        # stream; report that as a header error instead of failing with
        # an AttributeError on `page.position`.
        if page is None:
            raise OggFLACHeaderError(
                "unable to find the last page of stream %r" % self.serial)
        self.length = page.position / float(self.sample_rate)

    def pprint(self):
        """Return the FLAC summary prefixed with 'Ogg '."""
        return u"Ogg " + super(OggFLACStreamInfo, self).pprint()
class OggFLACVComment(VCFLACDict):
    """Vorbis comments stored in an Ogg FLAC stream."""

    def load(self, data, info, errors='replace'):
        """Read the comment packet of the stream described by `info`."""
        # data should be pointing at the start of an Ogg page, after
        # the first FLAC page.
        pages = []
        while True:
            page = OggPage(data)
            if page.serial != info.serial:
                continue
            pages.append(page)
            # Stop once the comment packet is known to be finished.
            if page.complete or len(page.packets) > 1:
                break
        payload = OggPage.to_packets(pages)[0][4:]
        comment = cBytesIO(payload)
        super(OggFLACVComment, self).load(comment, errors=errors)

    def _inject(self, fileobj):
        """Write tag data into the FLAC Vorbis comment packet/page."""
        # Ogg FLAC has no convenient data marker like Vorbis, but the
        # second packet - and second page - must be the comment data.
        fileobj.seek(0)
        page = OggPage(fileobj)
        while not page.packets[0].startswith(b"\x7FFLAC"):
            page = OggPage(fileobj)

        header_page = page
        while not (page.sequence == 1 and
                   page.serial == header_page.serial):
            page = OggPage(fileobj)

        old_pages = [page]
        while not (old_pages[-1].complete or
                   len(old_pages[-1].packets) > 1):
            page = OggPage(fileobj)
            if page.serial == header_page.serial:
                old_pages.append(page)

        packets = OggPage.to_packets(old_pages, strict=False)

        # Set the new comment block: keep the first byte of the old
        # packet, then a 3 byte big-endian length, then the data.
        body = self.write()
        length_field = struct.pack(">I", len(body))[-3:]
        packets[0] = packets[0][:1] + length_field + body

        new_pages = OggPage.from_packets(packets, old_pages[0].sequence)
        OggPage.replace(fileobj, old_pages, new_pages)
class OggFLAC(OggFileType):
    """An Ogg FLAC file."""

    _Info = OggFLACStreamInfo
    _Tags = OggFLACVComment
    _Error = OggFLACHeaderError
    _mimes = ["audio/x-oggflac"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score 0-2; non-zero only if the header starts with 'OggS'."""
        is_ogg = header.startswith(b"OggS")
        flac_markers = (b"FLAC" in header) + (b"fLaC" in header)
        return is_ogg * flac_markers
Open = OggFLAC


def delete(filename):
    """Remove tags from a file."""
    flac_file = OggFLAC(filename)
    flac_file.delete()
@@ -0,0 +1,126 @@
# Copyright 2012, 2013 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg Opus comments.
This module handles Opus files wrapped in an Ogg bitstream. The
first Opus stream found is used.
Based on http://tools.ietf.org/html/draft-terriberry-oggopus-01
"""
__all__ = ["OggOpus", "Open", "delete"]
import struct
from mutagen import StreamInfo
from mutagen._vorbis import VCommentDict
from mutagen.ogg import OggPage, OggFileType, error as OggError
class error(OggError):
    """Base class for Ogg Opus errors."""
class OggOpusHeaderError(error):
    """Raised when the Ogg Opus header is invalid or unsupported."""
class OggOpusInfo(StreamInfo):
    """Ogg Opus stream information.

    Attributes:

    * length - file length in seconds, as a float
    * channels - number of channels
    """

    length = 0

    def __init__(self, fileobj):
        """Locate the first 'OpusHead' page and parse the ID header.

        Raises OggOpusHeaderError if that page does not start a stream
        or if the major version is unsupported.
        """
        page = OggPage(fileobj)
        while not page.packets[0].startswith(b"OpusHead"):
            page = OggPage(fileobj)

        self.serial = page.serial

        if not page.first:
            raise OggOpusHeaderError(
                "page has ID header, but doesn't start a stream")

        (version, self.channels, pre_skip, orig_sample_rate, output_gain,
         channel_map) = struct.unpack("<BBHIhB", page.packets[0][8:19])

        self.__pre_skip = pre_skip

        # only the higher 4 bits change on incompatible changes
        major, minor = version >> 4, version & 0xF
        if major != 0:
            raise OggOpusHeaderError("version %r unsupported" % major)

    def _post_tags(self, fileobj):
        """Compute the length from the position of the last page.

        Raises OggOpusHeaderError if no page of this stream is found.
        """
        page = OggPage.find_last(fileobj, self.serial)
        # find_last() returns None when it cannot find any page of the
        # stream; report that as a header error instead of failing with
        # an AttributeError on `page.position`.
        if page is None:
            raise OggOpusHeaderError(
                "unable to find the last page of stream %r" % self.serial)
        self.length = (page.position - self.__pre_skip) / float(48000)

    def pprint(self):
        """Return a one-line summary with the length in seconds."""
        return u"Ogg Opus, %.2f seconds" % (self.length)
class OggOpusVComment(VCommentDict):
    """Opus comments embedded in an Ogg bitstream."""

    def __get_comment_pages(self, fileobj, info):
        """Return the pages that hold the 'OpusTags' packet."""
        # find the first tags page with the right serial
        page = OggPage(fileobj)
        while not (info.serial == page.serial and
                   page.packets[0].startswith(b"OpusTags")):
            page = OggPage(fileobj)

        # get all comment pages
        pages = [page]
        while True:
            newest = pages[-1]
            if newest.complete or len(newest.packets) > 1:
                break
            page = OggPage(fileobj)
            if page.serial == pages[0].serial:
                pages.append(page)

        return pages

    def __init__(self, fileobj, info):
        """Read the comments of the stream described by `info`."""
        pages = self.__get_comment_pages(fileobj, info)
        tag_packet = OggPage.to_packets(pages)[0]
        data = tag_packet[8:]  # Strip OpusTags
        super(OggOpusVComment, self).__init__(data, framing=False)

    def _inject(self, fileobj):
        """Replace the comment packet in `fileobj` with these tags."""
        fileobj.seek(0)
        info = OggOpusInfo(fileobj)
        old_pages = self.__get_comment_pages(fileobj, info)

        packets = OggPage.to_packets(old_pages)
        packets[0] = b"OpusTags" + self.write(framing=False)
        first_sequence = old_pages[0].sequence
        new_pages = OggPage.from_packets(packets, first_sequence)
        OggPage.replace(fileobj, old_pages, new_pages)
class OggOpus(OggFileType):
    """An Ogg Opus file."""

    _Info = OggOpusInfo
    _Tags = OggOpusVComment
    _Error = OggOpusHeaderError
    _mimes = ["audio/ogg", "audio/ogg; codecs=opus"]

    @staticmethod
    def score(filename, fileobj, header):
        """Return 1 if header starts with "OggS" and contains "OpusHead"."""
        is_ogg = header.startswith(b"OggS")
        has_opus_head = b"OpusHead" in header
        return is_ogg * has_opus_head


# Module-level alias for the file type class.
Open = OggOpus
def delete(filename):
    """Remove tags from a file."""
    opus_file = OggOpus(filename)
    opus_file.delete()
@@ -0,0 +1,138 @@
# Ogg Speex support.
#
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg Speex comments.
This module handles Speex files wrapped in an Ogg bitstream. The
first Speex stream found is used.
Read more about Ogg Speex at http://www.speex.org/. This module is
based on the specification at http://www.speex.org/manual2/node7.html
and clarifications after personal communication with Jean-Marc,
http://lists.xiph.org/pipermail/speex-dev/2006-July/004676.html.
"""
__all__ = ["OggSpeex", "Open", "delete"]
from mutagen import StreamInfo
from mutagen._vorbis import VCommentDict
from mutagen.ogg import OggPage, OggFileType, error as OggError
from mutagen._util import cdata
class error(OggError):
    """Base error for Ogg Speex handling; OggSpeexHeaderError derives from it."""
class OggSpeexHeaderError(error):
    """Raised when the Speex identification header cannot be accepted."""
class OggSpeexInfo(StreamInfo):
    """Ogg Speex stream information.

    Attributes:

    * bitrate - nominal bitrate in bits per second
    * channels - number of channels
    * length - file length in seconds, as a float

    The reference encoder does not set the bitrate; in this case,
    the bitrate will be 0.
    """

    length = 0

    def __init__(self, fileobj):
        """Read pages from fileobj until the Speex header packet is found.

        Raises OggSpeexHeaderError if the page holding the header does
        not start a stream.
        """
        page = OggPage(fileobj)
        while not page.packets[0].startswith(b"Speex "):
            page = OggPage(fileobj)
        if not page.first:
            raise OggSpeexHeaderError(
                "page has ID header, but doesn't start a stream")
        header = page.packets[0]
        self.sample_rate = cdata.uint_le(header[36:40])
        self.channels = cdata.uint_le(header[48:52])
        # Negative stored bitrates are reported as 0.
        self.bitrate = max(0, cdata.int_le(header[52:56]))
        self.serial = page.serial

    def _post_tags(self, fileobj):
        """Compute length from the position of the last page of the stream.

        When the header declares a sample rate of 0 the length cannot be
        computed; it is left at the class default instead of raising
        ZeroDivisionError.
        """
        page = OggPage.find_last(fileobj, self.serial)
        if self.sample_rate:
            self.length = page.position / float(self.sample_rate)

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return u"Ogg Speex, %.2f seconds" % self.length
class OggSpeexVComment(VCommentDict):
    """Speex comments embedded in an Ogg bitstream."""

    def __init__(self, fileobj, info):
        """Collect the comment packet for info.serial and parse it."""
        pages = []
        while True:
            page = OggPage(fileobj)
            if page.serial != info.serial:
                continue
            pages.append(page)
            if page.complete or len(page.packets) > 1:
                break
        # A single byte is appended to the packet before it is parsed
        # with framing disabled.
        packet = OggPage.to_packets(pages)[0]
        super(OggSpeexVComment, self).__init__(packet + b"\x01", framing=False)

    def _inject(self, fileobj):
        """Write tag data into the Speex comment packet/page."""
        fileobj.seek(0)

        # The first header page carries the stream info; it gives us the
        # serial number of the stream.
        while True:
            page = OggPage(fileobj)
            if page.packets[0].startswith(b"Speex "):
                break
        serial = page.serial

        # The next page with that serial starts the comment packet.
        while True:
            page = OggPage(fileobj)
            if page.serial == serial:
                break

        # Gather every page that belongs to the comment packet.
        old_pages = [page]
        tail = page
        while not tail.complete and len(tail.packets) <= 1:
            page = OggPage(fileobj)
            if page.serial == old_pages[0].serial:
                old_pages.append(page)
                tail = page

        packets = OggPage.to_packets(old_pages, strict=False)

        # Swap in the freshly rendered comment packet.
        packets[0] = self.write(framing=False)

        first_sequence = old_pages[0].sequence
        new_pages = OggPage.from_packets(packets, first_sequence)
        OggPage.replace(fileobj, old_pages, new_pages)
class OggSpeex(OggFileType):
    """An Ogg Speex file."""

    _Info = OggSpeexInfo
    _Tags = OggSpeexVComment
    _Error = OggSpeexHeaderError
    _mimes = ["audio/x-speex"]

    @staticmethod
    def score(filename, fileobj, header):
        """Return 1 if header starts with "OggS" and contains "Speex "."""
        is_ogg = header.startswith(b"OggS")
        has_speex = b"Speex " in header
        return is_ogg * has_speex


# Module-level alias for the file type class.
Open = OggSpeex
def delete(filename):
    """Remove tags from a file."""
    speex_file = OggSpeex(filename)
    speex_file.delete()
@@ -0,0 +1,131 @@
# Ogg Theora support.
#
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg Theora comments.
This module handles Theora files wrapped in an Ogg bitstream. The
first Theora stream found is used.
Based on the specification at http://theora.org/doc/Theora_I_spec.pdf.
"""
__all__ = ["OggTheora", "Open", "delete"]
import struct
from mutagen import StreamInfo
from mutagen._vorbis import VCommentDict
from mutagen._util import cdata
from mutagen.ogg import OggPage, OggFileType, error as OggError
class error(OggError):
    """Base error for Ogg Theora handling; OggTheoraHeaderError derives from it."""
class OggTheoraHeaderError(error):
    """Raised when the Theora identification header cannot be accepted."""
class OggTheoraInfo(StreamInfo):
    """Ogg Theora stream information.

    Attributes:

    * length - file length in seconds, as a float
    * fps - video frames per second, as a float
    """

    length = 0

    def __init__(self, fileobj):
        """Read pages from fileobj until the Theora ID header is found.

        Raises OggTheoraHeaderError if the header page does not start a
        stream, if the version is not 3.2, or if the frame rate numerator
        or denominator is zero.
        """
        page = OggPage(fileobj)
        while not page.packets[0].startswith(b"\x80theora"):
            page = OggPage(fileobj)
        if not page.first:
            raise OggTheoraHeaderError(
                "page has ID header, but doesn't start a stream")
        data = page.packets[0]
        vmaj, vmin = struct.unpack("2B", data[7:9])
        if (vmaj, vmin) != (3, 2):
            raise OggTheoraHeaderError(
                "found Theora version %d.%d != 3.2" % (vmaj, vmin))
        fps_num, fps_den = struct.unpack(">2I", data[22:30])
        # A zero denominator would fail right here and a zero numerator
        # would make _post_tags divide by zero, so reject both as a
        # malformed header.
        if not fps_num or not fps_den:
            raise OggTheoraHeaderError("fps_num or fps_den equals zero")
        self.fps = fps_num / float(fps_den)
        # The bitrate is a 3-byte big-endian value; pad it to 4 bytes.
        self.bitrate = cdata.uint_be(b"\x00" + data[37:40])
        self.granule_shift = (cdata.ushort_be(data[40:42]) >> 5) & 0x1F
        self.serial = page.serial

    def _post_tags(self, fileobj):
        """Compute length from the position of the last page of the stream."""
        page = OggPage.find_last(fileobj, self.serial)
        position = page.position
        # The position is split at granule_shift; the frame count is the
        # sum of the upper and the lower part.
        mask = (1 << self.granule_shift) - 1
        frames = (position >> self.granule_shift) + (position & mask)
        self.length = frames / float(self.fps)

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return "Ogg Theora, %.2f seconds, %d bps" % (self.length, self.bitrate)
class OggTheoraCommentDict(VCommentDict):
    """Theora comments embedded in an Ogg bitstream."""

    def __init__(self, fileobj, info):
        """Collect the comment packet for info.serial and parse it."""
        pages = []
        while True:
            page = OggPage(fileobj)
            if page.serial != info.serial:
                continue
            pages.append(page)
            if page.complete or len(page.packets) > 1:
                break
        # Drop the 7-byte packet prefix and append one byte before parsing.
        payload = OggPage.to_packets(pages)[0][7:]
        super(OggTheoraCommentDict, self).__init__(payload + b"\x01")

    def _inject(self, fileobj):
        """Write tag data into the Theora comment packet/page."""
        fileobj.seek(0)

        # Locate the page on which the comment packet starts.
        while True:
            page = OggPage(fileobj)
            if page.packets[0].startswith(b"\x81theora"):
                break

        # Gather every page that belongs to the comment packet.
        old_pages = [page]
        tail = page
        while not tail.complete and len(tail.packets) <= 1:
            page = OggPage(fileobj)
            if page.serial == old_pages[0].serial:
                old_pages.append(page)
                tail = page

        packets = OggPage.to_packets(old_pages, strict=False)
        packets[0] = b"\x81theora" + self.write(framing=False)

        first_sequence = old_pages[0].sequence
        new_pages = OggPage.from_packets(packets, first_sequence)
        OggPage.replace(fileobj, old_pages, new_pages)
class OggTheora(OggFileType):
    """An Ogg Theora file."""

    _Info = OggTheoraInfo
    _Tags = OggTheoraCommentDict
    _Error = OggTheoraHeaderError
    _mimes = ["video/x-theora"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score 0-2: one point per Theora header marker found in an Ogg header."""
        is_ogg = header.startswith(b"OggS")
        has_id_header = b"\x80theora" in header
        has_comment_header = b"\x81theora" in header
        return is_ogg * (has_id_header + has_comment_header)


# Module-level alias for the file type class.
Open = OggTheora
def delete(filename):
    """Remove tags from a file."""
    theora_file = OggTheora(filename)
    theora_file.delete()
@@ -0,0 +1,138 @@
# Ogg Vorbis support.
#
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Read and write Ogg Vorbis comments.
This module handles Vorbis files wrapped in an Ogg bitstream. The
first Vorbis stream found is used.
Read more about Ogg Vorbis at http://vorbis.com/. This module is based
on the specification at http://www.xiph.org/vorbis/doc/Vorbis_I_spec.html.
"""
__all__ = ["OggVorbis", "Open", "delete"]
import struct
from mutagen import StreamInfo
from mutagen._vorbis import VCommentDict
from mutagen.ogg import OggPage, OggFileType, error as OggError
class error(OggError):
    """Base error for Ogg Vorbis handling; OggVorbisHeaderError derives from it."""
class OggVorbisHeaderError(error):
    """Raised when the Vorbis identification header cannot be accepted."""
class OggVorbisInfo(StreamInfo):
    """Ogg Vorbis stream information.

    Attributes:

    * length - file length in seconds, as a float
    * bitrate - nominal ('average') bitrate in bits per second, as an int
    """

    length = 0

    def __init__(self, fileobj):
        """Read pages from fileobj until the Vorbis ID header is found.

        Raises OggVorbisHeaderError if the header page does not start a
        stream or if the header declares a sample rate of zero.
        """
        page = OggPage(fileobj)
        while not page.packets[0].startswith(b"\x01vorbis"):
            page = OggPage(fileobj)
        if not page.first:
            raise OggVorbisHeaderError(
                "page has ID header, but doesn't start a stream")
        # The sample rate is read as unsigned so that large values cannot
        # turn negative; the three bitrate fields stay signed and are
        # clamped below.
        (self.channels, self.sample_rate, max_bitrate, nominal_bitrate,
         min_bitrate) = struct.unpack("<BI3i", page.packets[0][11:28])
        if self.sample_rate == 0:
            # _post_tags divides by the sample rate.
            raise OggVorbisHeaderError("sample rate can't be zero")
        self.serial = page.serial

        max_bitrate = max(0, max_bitrate)
        min_bitrate = max(0, min_bitrate)
        nominal_bitrate = max(0, nominal_bitrate)

        if nominal_bitrate == 0:
            self.bitrate = (max_bitrate + min_bitrate) // 2
        elif max_bitrate and max_bitrate < nominal_bitrate:
            # If the max bitrate is less than the nominal, we know
            # the nominal is wrong.
            self.bitrate = max_bitrate
        elif min_bitrate > nominal_bitrate:
            self.bitrate = min_bitrate
        else:
            self.bitrate = nominal_bitrate

    def _post_tags(self, fileobj):
        """Compute length from the position of the last page of the stream."""
        page = OggPage.find_last(fileobj, self.serial)
        self.length = page.position / float(self.sample_rate)

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return u"Ogg Vorbis, %.2f seconds, %d bps" % (self.length, self.bitrate)
class OggVCommentDict(VCommentDict):
    """Vorbis comments embedded in an Ogg bitstream."""

    def __init__(self, fileobj, info):
        """Collect the comment packet for info.serial and parse it."""
        pages = []
        while True:
            page = OggPage(fileobj)
            if page.serial != info.serial:
                continue
            pages.append(page)
            if page.complete or len(page.packets) > 1:
                break
        packet = OggPage.to_packets(pages)[0]
        # Strip off "\x03vorbis".
        super(OggVCommentDict, self).__init__(packet[7:])

    def _inject(self, fileobj):
        """Write tag data into the Vorbis comment packet/page."""
        # The old pages have to be found first: they get replaced, and any
        # stray setup packet data inside them must be carried over.
        fileobj.seek(0)
        while True:
            page = OggPage(fileobj)
            if page.packets[0].startswith(b"\x03vorbis"):
                break

        old_pages = [page]
        tail = page
        while not tail.complete and len(tail.packets) <= 1:
            page = OggPage(fileobj)
            if page.serial == old_pages[0].serial:
                old_pages.append(page)
                tail = page

        packets = OggPage.to_packets(old_pages, strict=False)

        # Swap in the freshly rendered comment packet.
        packets[0] = b"\x03vorbis" + self.write()

        first_sequence = old_pages[0].sequence
        new_pages = OggPage.from_packets(packets, first_sequence)
        OggPage.replace(fileobj, old_pages, new_pages)
class OggVorbis(OggFileType):
    """An Ogg Vorbis file."""

    _Info = OggVorbisInfo
    _Tags = OggVCommentDict
    _Error = OggVorbisHeaderError
    _mimes = ["audio/vorbis", "audio/x-vorbis"]

    @staticmethod
    def score(filename, fileobj, header):
        """Return 1 if header starts with "OggS" and contains "\\x01vorbis"."""
        is_ogg = header.startswith(b"OggS")
        has_vorbis_id = b"\x01vorbis" in header
        return is_ogg * has_vorbis_id


# Module-level alias for the file type class.
Open = OggVorbis
def delete(filename):
    """Remove tags from a file."""
    vorbis_file = OggVorbis(filename)
    vorbis_file.delete()
@@ -0,0 +1,74 @@
# OptimFROG reader/tagger
#
# Copyright 2006 Lukas Lalinsky <lalinsky@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""OptimFROG audio streams with APEv2 tags.
OptimFROG is a lossless audio compression program. Its main goal is to
reduce the size of audio files as much as possible, while permitting
bit-identical restoration for all input. It is similar to ZIP
compression, but it is highly specialized to compress audio data.
Only versions 4.5 and higher are supported.
For more information, see http://www.losslessaudio.org/
"""
__all__ = ["OptimFROG", "Open", "delete"]
import struct
from ._compat import endswith
from mutagen import StreamInfo
from mutagen.apev2 import APEv2File, error, delete
class OptimFROGHeaderError(error):
    """Raised when a file does not begin with a supported OptimFROG header."""
class OptimFROGInfo(StreamInfo):
    """OptimFROG stream information.

    Attributes:

    * channels - number of audio channels
    * length - file length in seconds, as a float
    * sample_rate - audio sampling rate in Hz
    """

    def __init__(self, fileobj):
        """Parse the first 76 bytes of fileobj as an OptimFROG header.

        Raises OptimFROGHeaderError when fewer than 76 bytes are available,
        the data does not start with "OFR ", or the 32-bit value at offset
        4 is neither 12 nor 15.
        """
        header = fileobj.read(76)
        if len(header) != 76 or not header.startswith(b"OFR "):
            raise OptimFROGHeaderError("not an OptimFROG file")
        (header_field,) = struct.unpack("<I", header[4:8])
        if header_field not in [12, 15]:
            raise OptimFROGHeaderError("not an OptimFROG file")

        (samples_low, samples_high, _sample_type, stored_channels,
         rate) = struct.unpack("<IHBBI", header[8:20])

        # The channel count is stored minus one.
        self.channels = stored_channels + 1
        self.sample_rate = rate

        # The sample count is split into a 32-bit low and a 16-bit high part.
        total_samples = samples_low + (samples_high << 32)
        if rate:
            self.length = float(total_samples) / (self.channels * rate)
        else:
            self.length = 0.0

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return "OptimFROG, %.2f seconds, %d Hz" % (self.length,
                                                   self.sample_rate)
class OptimFROG(APEv2File):
    """An OptimFROG file; stream info is read by OptimFROGInfo."""

    _Info = OptimFROGInfo

    @staticmethod
    def score(filename, fileobj, header):
        """Score by the "OFR" header prefix and the .ofr / .ofs extensions."""
        lowered = filename.lower()
        has_magic = header.startswith(b"OFR")
        is_ofr = endswith(lowered, b".ofr")
        is_ofs = endswith(lowered, b".ofs")
        return has_magic + is_ofr + is_ofs


# Module-level alias for the file type class.
Open = OptimFROG
@@ -0,0 +1,83 @@
# True Audio support for Mutagen
# Copyright 2006 Joe Wreschnig
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
"""True Audio audio stream information and tags.
True Audio is a lossless format designed for real-time encoding and
decoding. This module is based on the documentation at
http://www.true-audio.com/TTA_Lossless_Audio_Codec\_-_Format_Description
True Audio files use ID3 tags.
"""
__all__ = ["TrueAudio", "Open", "delete", "EasyTrueAudio"]
from ._compat import endswith
from mutagen import StreamInfo
from mutagen.id3 import ID3FileType, delete
from mutagen._util import cdata
class error(RuntimeError):
    """Base error for True Audio handling; TrueAudioHeaderError derives from it."""
class TrueAudioHeaderError(error, IOError):
    """Raised when no valid TTA header is found at the expected offset."""
class TrueAudioInfo(StreamInfo):
    """True Audio stream information.

    Attributes:

    * length - audio length, in seconds
    * sample_rate - audio sample rate, in Hz
    """

    def __init__(self, fileobj, offset):
        """Parse the 18-byte TTA header found at offset in fileobj.

        A false offset (None or 0) means the start of the file. Raises
        TrueAudioHeaderError if fewer than 18 bytes can be read or the
        data does not start with "TTA".
        """
        fileobj.seek(offset or 0)
        header = fileobj.read(18)
        if len(header) != 18 or not header.startswith(b"TTA"):
            raise TrueAudioHeaderError("TTA header not found")
        self.sample_rate = cdata.int_le(header[10:14])
        samples = cdata.uint_le(header[14:18])
        # A header with a zero sample rate used to raise ZeroDivisionError;
        # report a zero length for it instead.
        self.length = 0.0
        if self.sample_rate != 0:
            self.length = float(samples) / self.sample_rate

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return "True Audio, %.2f seconds, %d Hz." % (
            self.length, self.sample_rate)
class TrueAudio(ID3FileType):
    """A True Audio file.

    :ivar info: :class:`TrueAudioInfo`
    :ivar tags: :class:`ID3 <mutagen.id3.ID3>`
    """

    _Info = TrueAudioInfo
    _mimes = ["audio/x-tta"]

    @staticmethod
    def score(filename, fileobj, header):
        """Score by ID3/TTA header prefixes; the .tta extension counts double."""
        has_id3 = header.startswith(b"ID3")
        has_tta = header.startswith(b"TTA")
        extension_points = endswith(filename.lower(), b".tta") * 2
        return has_id3 + has_tta + extension_points


# Module-level alias for the file type class.
Open = TrueAudio
class EasyTrueAudio(TrueAudio):
    """Like TrueAudio, but uses EasyID3 for tags.

    :ivar info: :class:`TrueAudioInfo`
    :ivar tags: :class:`EasyID3 <mutagen.easyid3.EasyID3>`
    """

    # The class-level import binds EasyID3 to the ``ID3`` class attribute.
    from mutagen.easyid3 import EasyID3 as ID3
@@ -0,0 +1,124 @@
# A WavPack reader/tagger
#
# Copyright 2006 Joe Wreschnig
# 2014 Christoph Reiter
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""WavPack reading and writing.
WavPack is a lossless format that uses APEv2 tags. Read
* http://www.wavpack.com/
* http://www.wavpack.com/file_format.txt
for more information.
"""
__all__ = ["WavPack", "Open", "delete"]
from mutagen import StreamInfo
from mutagen.apev2 import APEv2File, error, delete
from mutagen._util import cdata
class WavPackHeaderError(error):
    """Raised when a WavPack block header cannot be read or is invalid."""
# Sample rates in Hz; WavPackInfo indexes this list with a 4-bit field taken
# from the header flags. The list has 15 entries, so index 15 has no entry.
RATES = [6000, 8000, 9600, 11025, 12000, 16000, 22050, 24000, 32000, 44100,
48000, 64000, 88200, 96000, 192000]
class _WavPackHeader(object):
    """The fields of one 32-byte WavPack block header."""

    def __init__(self, block_size, version, track_no, index_no, total_samples,
                 block_index, block_samples, flags, crc):
        self.block_size = block_size
        self.version = version
        self.track_no = track_no
        self.index_no = index_no
        self.total_samples = total_samples
        self.block_index = block_index
        self.block_samples = block_samples
        self.flags = flags
        self.crc = crc

    @classmethod
    def from_fileobj(cls, fileobj):
        """A new _WavPackHeader or raises WavPackHeaderError

        Reads 32 bytes from the current position of fileobj. The header
        must start with "wvpk"; all fields after it are little-endian.
        A total sample count of 2**32 - 1 is returned as -1.
        """
        # Local import: this module does not import struct at file level.
        import struct

        header = fileobj.read(32)
        if len(header) != 32 or not header.startswith(b"wvpk"):
            raise WavPackHeaderError("not a WavPack header: %r" % header)

        # One unpack covers the 28 bytes that follow the 4-byte magic.
        (block_size, version, track_no, index_no, samples, block_index,
         block_samples, flags, crc) = struct.unpack("<IHBBIIIII", header[4:32])
        if samples == 2 ** 32 - 1:
            samples = -1

        # Build through cls so that subclasses get instances of themselves.
        return cls(block_size, version, track_no, index_no,
                   samples, block_index, block_samples, flags, crc)
class WavPackInfo(StreamInfo):
    """WavPack stream information.

    Attributes:

    * channels - number of audio channels (1 or 2)
    * length - file length in seconds, as a float
    * sample_rate - audio sampling rate in Hz
    * version - WavPack stream version
    """

    def __init__(self, fileobj):
        """Read the first block header of fileobj and derive stream info.

        Raises WavPackHeaderError if no valid header is found or if the
        sample rate index in the flags has no entry in RATES.
        """
        try:
            header = _WavPackHeader.from_fileobj(fileobj)
        except WavPackHeaderError:
            raise WavPackHeaderError("not a WavPack file")

        self.version = header.version
        # Flag bit 4 set means one channel; report a real int rather than
        # the bool the expression ``bool(...) or 2`` used to yield.
        self.channels = 1 if header.flags & 4 else 2

        # The 4-bit index can be 15, for which RATES has no entry; fail
        # with the module's own error type instead of a bare IndexError.
        rate_index = (header.flags >> 23) & 0xF
        if rate_index >= len(RATES):
            raise WavPackHeaderError("unsupported non-standard sample rate")
        self.sample_rate = RATES[rate_index]

        if header.total_samples == -1 or header.block_index != 0:
            # TODO: we could make this faster by using the tag size
            # and search backwards for the last block, then do
            # last.block_index + last.block_samples - initial.block_index
            samples = header.block_samples
            while 1:
                # Skip the rest of the current block, then try to read
                # the next header; stop at the first thing that isn't one.
                fileobj.seek(header.block_size - 32 + 8, 1)
                try:
                    header = _WavPackHeader.from_fileobj(fileobj)
                except WavPackHeaderError:
                    break
                samples += header.block_samples
        else:
            samples = header.total_samples

        self.length = float(samples) / self.sample_rate

    def pprint(self):
        """Return a short human-readable summary of the stream."""
        return "WavPack, %.2f seconds, %d Hz" % (self.length, self.sample_rate)
class WavPack(APEv2File):
    """A WavPack file; stream info is read by WavPackInfo."""

    _Info = WavPackInfo
    _mimes = ["audio/x-wavpack"]

    @staticmethod
    def score(filename, fileobj, header):
        """Return 2 if header starts with "wvpk", else 0."""
        has_magic = header.startswith(b"wvpk")
        return has_magic * 2


# Module-level alias for the file type class.
Open = WavPack
Binary file not shown.

After

Width:  |  Height:  |  Size: 89 KiB

+4
View File
@@ -0,0 +1,4 @@
License
-------
If the software submitted to this repository accesses or calls any software provided by Plex (“Interfacing Software”), then as a condition for receiving services from Plex in response to such accesses or calls, you agree to grant and do hereby grant to Plex and its affiliates worldwide a worldwide, nonexclusive, and royalty-free right and license to use (including testing, hosting and linking to), copy, publicly perform, publicly display, reproduce in copies for distribution, and distribute the copies of any Interfacing Software made by you or with your assistance; provided, however, that you may notify Plex at legal@plex.tv if you do not wish for Plex to use, distribute, copy, publicly perform, publicly display, reproduce in copies for distribution, or distribute copies of an Interfacing Software that was created by you, and Plex will reasonable efforts to comply with such a request within a reasonable time.
-1
View File
@@ -1 +0,0 @@
include LICENSE HISTORY.rst requirements.txt
+150
View File
@@ -0,0 +1,150 @@
Sub-Zero for Plex, 1.1-RC5.2
=================
![logo](https://raw.githubusercontent.com/pannal/Sub-Zero/master/Sub-Zero.bundle/Contents/Resources/subzero.gif)
##### Subtitles done right
Originally based on @bramwalet's awesome [Subliminal.bundle](https://github.com/bramwalet/Subliminal.bundle)
Plex forum thread: https://forums.plex.tv/discussion/186575
### Quick installation
* go to ```Library/Application Support/Plex Media Server/Plug-ins/```
* ```rm -r Sub-Zero.bundle LocalMediaExtended.bundle```
* ```wget https://github.com/pannal/Sub-Zero/releases/download/1.1-rc5.2/Sub-Zero-1.1-rc5.2.zip```
* ```unzip Sub-Zero-1.1-rc5.2.zip```
* for more in-depth instructions, see ```Installation``` below
### Usage
Use the following agent order:
1. Sub-Zero TV/Movie Subtitles
2. Local Media Assets Extended
3. anything else
4. again, **DISABLE Local Media Assets**!
### Encountered a bug?
* be sure to post your logs: ```Library/Application Support/Plex Media Server/Logs/PMS Plugin Logs/com.plexapp.agents.subzero.log```; there may be multiple logs (com.plexapp.agents.subzero.log.*) depending on the amount of Videos you're refreshing
* **Remember: before you open a bug-ticket please double-check, that you've deleted the Sub-Zero.bundle folder BEFORE every update** (to avoid .pyc leftovers)
## Changelog
RC-5.2
- revert back to /plexinc-agents/LocalMedia.bundle/tree/dist instead of /plexinc-agents/LocalMedia.bundle/tree/master, as the current public PMS version is too old for that
RC-5.1
- make hearing_impaired option more configurable and clear (see #configuration-)
RC-5
- fix wrong video type matching by hinting video type to guessit
- update to newest LocalMediaExtended.bundle (incorporated plex-inc's changes)
- show page links for subtitles in log file instead of subtitle ID
- add custom language setting in addition to the three hardcoded ones
- if a subtitle doesn't match our hearing_impaired setting, ignore it
- add an optional boost for addic7ed subtitles, if their series, season, episode, year, and format (e.g. WEB-DL) matches
RC-4
- rename project to Sub-Zero
- incorporate LocalMediaExtended.bundle
- making this a multi-bundle plugin
- update default scores
- add icon
RC-3
- addic7ed/tvsubtitles: punctuation fixes (correctly get show ids for series like "Mr. Poopster" now)
- podnapisi: fix logging
- opensubtitles: add login credentials (for VIPs)
- add retry functionality to retry failed subtitle downloads, including configurable amount of retries until discarding of provider
- move possibly not needed setting "Restrict to one language" to the bottom
- more detailed logging
- some cleanup
RC-2
- fix empty custom subtitle folder creation
- fix detection of existing embedded subtitles (switch to https://github.com/tonswieb/enzyme)
- better logging
- set default TV score to 15; movie score to 30
RC-1
- fix subliminal's logging error on min_score not met (fixes #15)
- separated tv and movies subtitle scores settings (fixes #16)
- add option to save only one subtitle per video (skipping the ".lang." naming scheme plex supports) (fixes #3)
beta5
- fix storing subtitles besides the actual video file, not subfolder (fixes #14)
- "custom folder" setting now always used if given (properly overrides "subtitle folder" setting)
- also scan (custom) given subtitle folders for existing subtitles instead of redownloading them on every refresh (fixes #9, #2)
beta4
- ~~increased score of addic7ed subtitles a bit~~ (not existing currently)
- **support for newest Subliminal ([1.0.1](27a6e51cd36ffb2910cd9a7add6d797a2c6469b7)) and guessit ([0.11.0](2814f57e8999dcc31575619f076c0c1a63ce78f2))**
- **plugin now also [works with com.plexapp.agents.thetvdbdvdorder](924470d2c0db3a71529278bce4b7247eaf2f85b8)**
- providers fixed for subliminal 1.0.1 ([at least addic7ed](131504e7eed8b3400c457fbe49beea3b115bc916))
- providers [don't simply fail and get excluded on non-detected language](1a779020792e0201ad689eefbf5a126155e89c97)
- support for addic7ed languages: [French (Canadian)](b11a051c233fd72033f0c3b5a8c1965260e7e19f)
- support for additional languages: [pt-br (Portuguese (Brasil)), fa (Persian (Farsi))](131504e7eed8b3400c457fbe49beea3b115bc916)
- support for [three (two optional) subtitle languages](e543c927cf49c264eaece36640c99d67a99c7da2)
- optionally use [random user agent for addic7ed provider](83ace14faf75fbd75313f0ceda9b78161895fbcf) (should not be needed)
Description
------------
Plex Metadata agent plugin based on Subliminal. This agent will search on the following sites for the best matching subtitles:
- OpenSubtitles
- TheSubDB
- Podnapisi.NET
- Addic7ed
- TVsubtitles.net
All providers can be disabled or enabled on a per provider setting. Certain preferences change the behaviour of subliminal, for instance the minimum score of subtitles to download, or whether to download hearing impaired subtitles or not. The agent stores the subtitles as metadata, but can be configured (See Configuration) to store it next to the media files.
Installation
------------
See [article](https://support.plex.tv/hc/en-us/articles/201187656-How-do-I-manually-install-a-channel-) on Plex website.
Configuration
-------------
Several options are provided in the preferences of this agent.
* Addic7ed username/password: Provide your addic7ed username here, otherwise the provider won't work. Please make sure your account is activated, before using the agent.
* Subtitle language (1)/(2)/(3): Your preferred languages to download subtitles for.
* Additional Subtitle Languages: Additional languages to download (comma-separated; use ISO-639-1 codes)
* Provider: Enable ...: Enable/disable this provider. Affects both movies and series.
* Addic7ed: boost over hash score if requirements met: if an Addic7ed subtitle matches the video's series, season, episode, year, and format (e.g. WEB-DL), boost its score, possibly over OpenSubtitles/TheSubDB direct hash match
* Scan: Include embedded subtitles: When enabled, subliminal finds embedded subtitles that are already present within the media file.
* Scan: Include external subtitles: When enabled, subliminal finds subtitles located near the media file on the filesystem.
* Minimum score for download: When configured, what is the minimum score for subtitles to download them? Lower scored subtitles are not downloaded.
* Download hearing impaired subtitles:
* "prefer": score subtitles for hearing impaired higher
* "don't prefer": score subtitles for hearing impaired lower
* "force HI": skip subtitles if the hearing impaired flag isn't set
* "force non-HI": skip subtitles if the hearing impaired flag is set
* Store subtitles next to media files (instead of metadata): See Store as metadata or on filesystem
* Subtitle folder: See Store as metadata or on filesystem
* Custom Subtitle folder: See Store as metadata or on filesystem
Store as metadata or on filesystem
----------------------------------
By default, Plex stores posters, fan art and subtitles as metadata in a separate folder which is not managed by the user. This is the default behaviour of this agent. However, expert users can enable 'Store subtitles next to media files'. The agent will write the subtitle files in the media folder. The setting 'Subtitle folder' configures in which folder (current folder or other subfolder) the subtitles are stored. The expert user can also supply 'Custom Subtitle folder' which can also be an absolute path.
Please note that you need a way to pick up external subtitles to show up in the Plex Media server. When the subtitles are stored next to your media folders, it is sufficient to enable Local Media agent and place it below the Subliminal agent in the agent priorities. When a subfolder (either custom or predefined) is used, you need [LocalMediaExtended](https://github.com/pannal/LocalMediaExtended.bundle).
License
-------
MIT
Libraries
---------
Uses the following libraries and their LICENSE:
- [babelfish](https://pypi.python.org/pypi/babelfish/) (BSD-3-Clause)
- [beautifulsoup4](https://pypi.python.org/pypi/beautifulsoup4/) (MIT)
- [chardet](https://pypi.python.org/pypi/chardet/) (LGPL)
- [dogpile.core](https://pypi.python.org/pypi/dogpile.core/) (BSD)
- [dogpile.cache](https://pypi.python.org/pypi/dogpile.cache/) (BSD)
- [enzyme](https://pypi.python.org/pypi/enzyme/) (Apache 2.0)
- [guessit](https://pypi.python.org/pypi/guessit/) (LGPLv3)
- [html5lib](https://pypi.python.org/pypi/html5lib/) (MIT)
- [pysrt](https://pypi.python.org/pypi/pysrt/) (GPLv3)
- [requests](https://pypi.python.org/pypi/requests/) (Apache 2.0)
- [stevedore](https://pypi.python.org/pypi/stevedore/) (Apache)
- [subliminal](https://pypi.python.org/pypi/subliminal/) (MIT)
- [xdg](https://pypi.python.org/pypi/pyxdg/) (LGPLv2)
- [setuptools](https://pypi.python.org/pypi/setuptools/) (PSF ZPL)
-82
View File
@@ -1,82 +0,0 @@
Subliminal
==========
Subtitles, faster than your thoughts.
.. image:: https://img.shields.io/pypi/v/subliminal.svg
:target: https://pypi.python.org/pypi/subliminal
:alt: Latest Version
.. image:: https://travis-ci.org/Diaoul/subliminal.svg?branch=develop
:target: https://travis-ci.org/Diaoul/subliminal
:alt: Travis CI build status
.. image:: https://readthedocs.org/projects/subliminal/badge/?version=latest
:target: https://subliminal.readthedocs.org/
:alt: Documentation Status
.. image:: https://coveralls.io/repos/Diaoul/subliminal/badge.svg?branch=develop&service=github
:target: https://coveralls.io/github/Diaoul/subliminal?branch=develop
:alt: Code coverage
.. image:: https://img.shields.io/github/license/Diaoul/subliminal.svg
:target: https://github.com/Diaoul/subliminal/blob/master/LICENSE
:alt: License
.. image:: https://img.shields.io/badge/gitter-join%20chat-1dce73.svg
:alt: Join the chat at https://gitter.im/Diaoul/subliminal
:target: https://gitter.im/Diaoul/subliminal
:Project page: https://github.com/Diaoul/subliminal
:Documentation: https://subliminal.readthedocs.org/
Usage
-----
CLI
^^^
Download English subtitles::
$ subliminal download -l en The.Big.Bang.Theory.S05E18.HDTV.x264-LOL.mp4
Collecting videos [####################################] 100%
1 video collected / 0 video ignored / 0 error
Downloading subtitles [####################################] 100%
Downloaded 1 subtitle
Library
^^^^^^^
Download best subtitles in French and English for videos less than two weeks old in a video folder:
.. code:: python
from datetime import timedelta
from babelfish import Language
from subliminal import download_best_subtitles, region, save_subtitles, scan_videos
# configure the cache
region.configure('dogpile.cache.dbm', arguments={'filename': 'cachefile.dbm'})
# scan for videos newer than 2 weeks and their existing subtitles in a folder
videos = scan_videos('/video/folder', age=timedelta(weeks=2))
# download best subtitles
subtitles = download_best_subtitles(videos, {Language('eng'), Language('fra')})
# save them to disk, next to the video
for v in videos:
save_subtitles(v, subtitles[v])
Installation
------------
Subliminal can be installed as a regular python module by running::
$ [sudo] pip install subliminal
For a better isolation with your system you should use a dedicated virtualenv or install for your user only using
the ``--user`` flag.
Nautilus/Nemo integration
-------------------------
See the dedicated `project page <https://github.com/Diaoul/nautilus-subliminal>`_ for more information.
+211
View File
@@ -0,0 +1,211 @@
# hdbits.org
import string, os, urllib, zipfile, re, copy
from babelfish import Language
from datetime import timedelta
import subliminal
import subliminal_patch
import logger
OS_PLEX_USERAGENT = 'plexapp.com v9.0'
DEPENDENCY_MODULE_NAMES = ['subliminal', 'subliminal_patch', 'enzyme', 'guessit', 'requests']
def Start():
    """Plugin start hook: set HTTP defaults, logging and the subliminal cache."""
    # Disable HTTP caching and identify with the fixed user agent string.
    HTTP.CacheTime = 0
    HTTP.Headers['User-agent'] = OS_PLEX_USERAGENT

    Log.Debug("START CALLED")
    logger.registerLoggingHander(DEPENDENCY_MODULE_NAMES)

    # The cache region is kept in memory, as per
    # https://github.com/Diaoul/subliminal/issues/303
    subliminal.region.configure('dogpile.cache.memory')
def ValidatePrefs():
    """Log that preference validation was requested; nothing is validated."""
    Log.Debug("Validate Prefs called.")
    return None
# Prepare a list of languages we want subs for
def getLangList():
    """Return the set of languages subtitles are wanted for.

    The first language preference is always included. When
    'subtitles.only_one' is set, only that language is returned.
    Otherwise the second and third preferences (unless "None") and every
    parseable entry of the comma-separated custom preference are added.
    Custom entries are tried as IETF codes first and as language names
    second; entries matching neither are logged and skipped.
    """
    langList = {Language.fromietf(Prefs["langPref1"])}

    if Prefs['subtitles.only_one']:
        return langList

    for pref_key in ("langPref2", "langPref3"):
        if Prefs[pref_key] != "None":
            langList.add(Language.fromietf(Prefs[pref_key]))

    # Tolerate an unset custom preference instead of failing on .strip().
    langCustom = (Prefs["langPrefCustom"] or u"").strip()
    if langCustom and langCustom != "None":
        for lang in langCustom.split(u","):
            lang = lang.strip()
            if not lang:
                # e.g. a trailing comma; nothing to parse
                continue
            try:
                real_lang = Language.fromietf(lang)
            except Exception:
                # Not an IETF code; fall back to a lookup by name.
                # `except Exception` (rather than a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                try:
                    real_lang = Language.fromname(lang)
                except Exception:
                    Log.Debug("Ignoring unknown custom language: %s" % lang)
                    continue
            langList.add(real_lang)

    return langList
def getSubtitleDestinationFolder():
    """Return the configured subtitle folder, or None.

    None is returned when subtitles are not saved to the filesystem or
    when the folder preference is "current folder" and no custom folder
    is given. A non-empty custom folder takes precedence over the
    regular folder preference.
    """
    if not Prefs["subtitles.save.filesystem"]:
        return None

    custom_pref = Prefs["subtitles.save.subFolder.Custom"]
    if custom_pref:
        custom_folder = custom_pref.strip()
        if custom_folder:
            return custom_folder

    sub_folder = Prefs["subtitles.save.subFolder"]
    if sub_folder != "current folder":
        return sub_folder
    return None
def initSubliminalPatches():
    """Push the current preferences into the subliminal_patch module settings."""
    # custom subtitle destination folder, used when scanning pre-existing subs
    destination = getSubtitleDestinationFolder()
    custom_paths = []
    if destination:
        custom_paths.append(destination)
    subliminal_patch.patch_video.CUSTOM_PATHS = custom_paths

    download_tries = int(Prefs['subtitles.try_downloads'])
    subliminal_patch.patch_provider_pool.DOWNLOAD_TRIES = download_tries

    use_boost = bool(Prefs['provider.addic7ed.boost'])
    subliminal_patch.patch_providers.addic7ed.USE_BOOST = use_boost
def getProviders():
    """Return the names of all providers whose "enabled" preference is truthy."""
    enabled_by_name = {
        'opensubtitles': Prefs['provider.opensubtitles.enabled'],
        'thesubdb': Prefs['provider.thesubdb.enabled'],
        'podnapisi': Prefs['provider.podnapisi.enabled'],
        'addic7ed': Prefs['provider.addic7ed.enabled'],
        'tvsubtitles': Prefs['provider.tvsubtitles.enabled'],
    }
    return [name for name in enabled_by_name if enabled_by_name[name]]
def getProviderSettings():
    """Return the per-provider configuration dict built from the preferences."""
    addic7ed = {
        'username': Prefs['provider.addic7ed.username'],
        'password': Prefs['provider.addic7ed.password'],
        'use_random_agents': Prefs['provider.addic7ed.use_random_agents'],
    }
    opensubtitles = {
        'username': Prefs['provider.opensubtitles.username'],
        'password': Prefs['provider.opensubtitles.password'],
    }
    return {'addic7ed': addic7ed, 'opensubtitles': opensubtitles}
def scanTvMedia(media):
    """Scan every part of every episode of a TV show.

    :param media: Plex media object exposing ``seasons`` -> ``episodes`` -> ``items`` -> ``parts``
    :return: dict mapping each scanned video to the media part it came from
    """
    videos = {}
    for season in media.seasons:
        episodes = media.seasons[season].episodes
        for episode in episodes:
            for item in episodes[episode].items:
                for part in item.parts:
                    scannedVideo = scanVideo(part, "episode")
                    # scanVideo returns None when the file could not be guessed;
                    # a None key would later be treated like a video and crash
                    if scannedVideo is None:
                        continue
                    videos[scannedVideo] = part
    return videos
def scanMovieMedia(media):
    """Scan every part of every item of a movie.

    :param media: Plex media object exposing ``items`` -> ``parts``
    :return: dict mapping each scanned video to the media part it came from
    """
    videos = {}
    for item in media.items:
        for part in item.parts:
            scannedVideo = scanVideo(part, "movie")
            # scanVideo returns None when the file could not be guessed;
            # a None key would later be treated like a video and crash
            if scannedVideo is None:
                continue
            videos[scannedVideo] = part
    return videos
def scanVideo(part, video_type):
    """Scan a single media part with subliminal.

    :param part: media part; its ``file`` attribute is the path handed to subliminal
    :param video_type: type hint passed through to ``scan_video`` ("episode" or "movie" in this file)
    :return: the result of ``subliminal.video.scan_video``, or None when it raises ValueError
    """
    scan_embedded = Prefs['subtitles.scan.embedded']
    scan_external = Prefs['subtitles.scan.external']
    Log.Debug("Scanning video: %s, subtitles=%s, embedded_subtitles=%s" % (part.file, scan_external, scan_embedded))

    try:
        scanned = subliminal.video.scan_video(
            part.file,
            subtitles=scan_external,
            embedded_subtitles=scan_embedded,
            video_type=video_type,
        )
    except ValueError:
        Log.Warn("File could not be guessed by subliminal")
        return None
    return scanned
def downloadBestSubtitles(videos, min_score=0):
    """Download the best subtitles for *videos* if any wanted language is missing.

    :param videos: iterable of scanned videos (each exposing ``subtitle_languages``)
    :param min_score: minimum score handed to subliminal
    :return: the result of ``subliminal.api.download_best_subtitles``, or None
        when no languages are configured or nothing is missing
    """
    hearing_impaired = Prefs['subtitles.search.hearingImpaired']
    languages = getLangList()
    if not languages:
        return

    # stop at the first video that lacks at least one wanted language
    for video in videos:
        if languages - video.subtitle_languages:
            break
        Log.Debug('All languages %r exist for %s', languages, video)
    else:
        Log.Debug("All languages for all requested videos exist. Doing nothing.")
        return

    Log.Debug("Download best subtitles using settings: min_score: %s, hearing_impaired: %s" %(min_score, hearing_impaired))
    return subliminal.api.download_best_subtitles(
        videos, languages, min_score, hearing_impaired,
        providers=getProviders(),
        provider_configs=getProviderSettings(),
        only_one=Prefs['subtitles.only_one'],
    )
def saveSubtitles(videos, subtitles):
    """Store downloaded subtitles on the filesystem or in metadata, per preference."""
    if not Prefs['subtitles.save.filesystem']:
        Log.Debug("Using metadata as subtitle storage")
        saveSubtitlesToMetadata(videos, subtitles)
        return

    Log.Debug("Using filesystem as subtitle storage")
    saveSubtitlesToFile(subtitles)
def saveSubtitlesToFile(subtitles):
    """Write subtitles to disk, next to the video or into the configured sub folder.

    :param subtitles: dict mapping a video to its downloaded subtitles; videos
        with no subtitles are skipped
    """
    raw_custom = Prefs["subtitles.save.subFolder.Custom"]
    custom_folder = raw_custom.strip() if bool(raw_custom) else None

    for video, found in subtitles.items():
        if not found:
            continue

        target_dir = None
        if custom_folder:
            if custom_folder.startswith("/"):
                # absolute folder
                target_dir = custom_folder
            else:
                target_dir = os.path.join(os.path.split(video.name)[0], custom_folder)
        elif Prefs["subtitles.save.subFolder"] != "current folder":
            target_dir = os.path.join(os.path.split(video.name)[0], Prefs["subtitles.save.subFolder"])

        # a specific folder was requested: create it if it doesn't exist
        if target_dir is not None and not os.path.exists(target_dir):
            os.makedirs(target_dir)

        subliminal.api.save_subtitles(video, found, directory=target_dir, single=Prefs['subtitles.only_one'])
def saveSubtitlesToMetadata(videos, subtitles):
    """Attach downloaded subtitles to the media parts they belong to.

    :param videos: dict mapping a scanned video to its media part
    :param subtitles: dict mapping a video to its downloaded subtitles
    """
    for video, found in subtitles.items():
        part = videos[video]
        for sub in found:
            media_proxy = Proxy.Media(sub.content, ext="srt")
            language_key = Locale.Language.Match(sub.language.alpha2)
            part.subtitles[language_key][sub.page_link] = media_proxy
class SubZeroSubtitlesAgentMovies(Agent.Movies):
    """Secondary movie agent that fetches subtitles during metadata updates."""

    name = 'Sub-Zero Movie Subtitles'
    languages = [Locale.Language.English]
    primary_provider = False
    contributes_to = ['com.plexapp.agents.imdb']

    def search(self, results, media, lang):
        """Append a single placeholder result with score 100."""
        Log.Debug("MOVIE SEARCH CALLED")
        placeholder = MetadataSearchResult(id='null', score=100)
        results.Append(placeholder)

    def update(self, metadata, media, lang):
        """Scan the movie's parts, download missing subtitles and store them."""
        Log.Debug("MOVIE UPDATE CALLED")
        initSubliminalPatches()
        scanned = scanMovieMedia(media)
        found = downloadBestSubtitles(scanned.keys(), min_score=int(Prefs["subtitles.search.minimumMovieScore"]))
        if not found:
            return
        saveSubtitles(scanned, found)
class SubZeroSubtitlesAgentTvShows(Agent.TV_Shows):
    """Secondary TV agent that fetches subtitles during metadata updates."""

    name = 'Sub-Zero TV Subtitles'
    languages = [Locale.Language.English]
    primary_provider = False
    contributes_to = ['com.plexapp.agents.thetvdb', 'com.plexapp.agents.thetvdbdvdorder']

    def search(self, results, media, lang):
        """Append a single placeholder result with score 100."""
        Log.Debug("TV SEARCH CALLED")
        placeholder = MetadataSearchResult(id='null', score=100)
        results.Append(placeholder)

    def update(self, metadata, media, lang):
        """Scan the show's parts, download missing subtitles and store them."""
        Log.Debug("TvUpdate. Lang %s" % lang)
        initSubliminalPatches()
        scanned = scanTvMedia(media)
        found = downloadBestSubtitles(scanned.keys(), min_score=int(Prefs["subtitles.search.minimumTVScore"]))
        if not found:
            return
        saveSubtitles(scanned, found)
+33
View File
@@ -0,0 +1,33 @@
import logging
def registerLoggingHander(dependencies):
    """Attach one shared PlexLoggerHandler to the logger of each named dependency.

    :param dependencies: iterable of logger names; each logger is set to DEBUG
    """
    shared_handler = PlexLoggerHandler()
    for logger_name in dependencies:
        Log.Debug("Registering LoggerHandler for dependency: %s" % logger_name)
        dependency_logger = logging.getLogger(logger_name)
        dependency_logger.setLevel('DEBUG')
        dependency_logger.addHandler(shared_handler)
class PlexLoggerHandler(logging.StreamHandler):
    """Logging handler that forwards standard ``logging`` records to the Plex ``Log`` API."""

    # standard level -> name of the Plex Log method to call.
    # logging.FATAL is an alias of logging.CRITICAL, so it needs no entry of its own.
    _LOG_METHODS = {
        logging.DEBUG: 'Debug',
        logging.INFO: 'Info',
        logging.WARNING: 'Warn',
        logging.ERROR: 'Error',
        logging.CRITICAL: 'Critical',
    }

    def __init__(self, level=0):
        """Create the handler.

        :param level: minimum level handled; the default 0 (NOTSET) handles everything
        """
        # StreamHandler's only positional argument is the stream, not the level,
        # so the level has to be applied separately.
        super(PlexLoggerHandler, self).__init__()
        self.setLevel(level)

    def getFormattedString(self, record):
        """Return "<logger name>: <message>" for *record*."""
        return record.name + ": " + record.getMessage()

    def emit(self, record):
        """Send *record* to the Plex Log method matching its level.

        Records with a non-standard level are reported through ``Log.Error``.
        """
        method_name = self._LOG_METHODS.get(record.levelno)
        if method_name is None:
            Log.Error("UNKNOWN LEVEL: %s", record.getMessage())
            return
        # Log is resolved at call time; it is provided by the Plex runtime
        getattr(Log, method_name)(self.getFormattedString(record))
+163
View File
@@ -0,0 +1,163 @@
[
{ "id": "subtitles.try_downloads",
"label": "How many download tries per subtitle (on timeout or error)",
"type": "enum",
"values": ["1", "2", "3", "4"],
"default": "2"
},
{
"id": "provider.addic7ed.username",
"label": "Addic7ed Username",
"type": "text",
"default": "Username"
},
{
"id": "provider.addic7ed.password",
"label": "Addic7ed Password",
"type": "text",
"option": "hidden",
"default": "",
"secure": "true"
},
{
"id": "provider.opensubtitles.username",
"label": "Opensubtitles Username (VIP)",
"type": "text",
"default": ""
},
{
"id": "provider.opensubtitles.password",
"label": "Opensubtitles Password",
"type": "text",
"option": "hidden",
"default": "",
"secure": "true"
},
{
"id": "provider.addic7ed.use_random_agents",
"label": "Addic7ed: Use random user agents (should not be necessary)",
"type": "bool",
"default": "false"
},
{
"id": "langPref1",
"label": "Subtitle Language (1)",
"type": "enum",
"values": ["sq","ar","be","bs","bg","ca","zh","cs","da","nl","en","et","fi","fr","de","el","he","hi","hu","is","id","it","ja","ko","lv","lt","mk","ms","no","fa","pl","pt","pt-br","ro","ru","sr","sk","sl","es","sv","th","tr","uk","vi","hr"],
"default": "en"
},
{
"id": "langPref2",
"label": "Subtitle Language (2)",
"type": "enum",
"values": ["None", "sq","ar","be","bs","bg","ca","zh","cs","da","nl","en","et","fi","fr","de","el","he","hi","hu","is","id","it","ja","ko","lv","lt","mk","ms","no","fa","pl","pt","pt-br","ro","ru","sr","sk","sl","es","sv","th","tr","uk","vi","hr"],
"default": "None"
},
{
"id": "langPref3",
"label": "Subtitle Language (3)",
"type": "enum",
"values": ["None", "sq","ar","be","bs","bg","ca","zh","cs","da","nl","en","et","fi","fr","de","el","he","hi","hu","is","id","it","ja","ko","lv","lt","mk","ms","no","fa","pl","pt","pt-br","ro","ru","sr","sk","sl","es","sv","th","tr","uk","vi","hr"],
"default": "None"
},
{
"id": "langPrefCustom",
"label": "Additional Subtitle Languages (use ISO-639-1 codes; comma-separated)",
"type": "text",
"default": "None"
},
{
"id": "provider.opensubtitles.enabled",
"label": "Provider: Enable OpenSubtitles",
"type": "bool",
"default": "true"
},
{
"id": "provider.thesubdb.enabled",
"label": "Provider: Enable TheSubDB",
"type": "bool",
"default": "true"
},
{
"id": "provider.podnapisi.enabled",
"label": "Provider: Enable Podnapisi.NET",
"type": "bool",
"default": "true"
},
{
"id": "provider.addic7ed.enabled",
"label": "Provider: Enable Addic7ed",
"type": "bool",
"default": "true"
},
{
"id": "provider.addic7ed.boost",
"label": "Addic7ed: boost over hash score if requirements met (prefer over other providers)",
"type": "bool",
"default": "false"
},
{
"id": "provider.tvsubtitles.enabled",
"label": "Provider: Enable TVsubtitles.net",
"type": "bool",
"default": "true"
},
{
"id": "subtitles.scan.embedded",
"label": "Scan: include embedded subtitles (skip if existing)",
"type": "bool",
"default": "true"
},
{
"id": "subtitles.scan.external",
"label": "Scan: include external subtitles (skip if existing)",
"type": "bool",
"default": "true"
},
{
"id": "subtitles.search.minimumTVScore",
"label": "Minimum score for TV subtitles to download",
"type": "enum",
"values": ["100","95","90","85","80","75","70","65","60","55","50","45","40","35","30","25","20","15","10","5","0"],
"default": "40"
},
{
"id": "subtitles.search.minimumMovieScore",
"label": "Minimum score for movie subtitles to download",
"type": "enum",
"values": ["100","95","90","85","80","75","70","65","60","55","50","45","40","35","30","25","20","15","10","5","0"],
"default": "20"
},
{
"id": "subtitles.search.hearingImpaired",
"label": "Download hearing impaired subtitles.",
"type": "enum",
"values": ["prefer", "don't prefer", "force HI", "force non-HI"],
"default": "don't prefer"
},
{
"id": "subtitles.save.filesystem",
"label": "Store subtitles next to media files (instead of metadata)",
"type": "bool",
"default": "true"
},
{
"id": "subtitles.save.subFolder",
"label": "Subtitle Folder (\"current folder\" is the folder the current media file lives in) - needs LocalMediaExtended agent",
"type": "enum",
"values": ["current folder", "sub", "subs", "subtitle", "subtitles"],
"default": "subs"
},
{
"id": "subtitles.save.subFolder.Custom",
"label": "Custom Subtitle folder (overrides \"Subtitle Folder\"; computes to real paths; use for example \"bla\" as a subfolder of the current media file folder or an absolute path) - needs LocalMediaExtended agent",
"type": "text",
"default": ""
},
{
"id": "subtitles.only_one",
"label": "Restrict to one language (skips adding \".lang.\" to the subtitle filename; only uses \"Subtitle Language (1)\")",
"type": "bool",
"default": "false"
}
]
+47
View File
@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>English</string>
<key>CFBundleIdentifier</key>
<string>com.plexapp.agents.subzero</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleShortVersionString</key>
<string>1.0.9</string>
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundleVersion</key>
<string>1.0.9.7</string>
<key>PlexFrameworkVersion</key>
<string>2</string>
<key>PlexPluginClass</key>
<string>Agent</string>
<key>PlexPluginMode</key>
<string>Daemon</string>
<key>PlexPluginConsoleLogging</key>
<string>1</string>
<key>PlexPluginDevMode</key>
<string>1</string>
<key>PlexPluginCodePolicy</key>
<!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
<string>Elevated</string>
<key>PlexAgentAttributionText</key>
<string>&lt;div style=&quot;white-space: pre;&quot;&gt;&lt;img src=&quot;https://raw.githubusercontent.com/pannal/Sub-Zero/master/Sub-Zero.bundle/Contents/Resources/subzero.gif&quot; /&gt;
&lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;
Version 1.1-rc5.2
Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;
&lt;strong&gt;Need help?&lt;/strong&gt;
Plex thread: &lt;a href=&quot;https://forums.plex.tv/discussion/186575&quot;&gt;https://forums.plex.tv/discussion/186575&lt;/a&gt;
Github: &lt;a href=&quot;https://github.com/pannal/Sub-Zero&quot;&gt;https://github.com/pannal/Sub-Zero&lt;/a&gt;
panni, 2015
&lt;/div&gt;
</string>
</dict>
</plist>
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
__title__ = 'babelfish'
__version__ = '0.5.5-dev'
__author__ = 'Antoine Bertin'
__license__ = 'BSD'
__copyright__ = 'Copyright 2015 the BabelFish authors'
import sys
if sys.version_info[0] >= 3:
basestr = str
else:
basestr = basestring
from .converters import (LanguageConverter, LanguageReverseConverter, LanguageEquivalenceConverter, CountryConverter,
CountryReverseConverter)
from .country import country_converters, COUNTRIES, COUNTRY_MATRIX, Country
from .exceptions import Error, LanguageConvertError, LanguageReverseError, CountryConvertError, CountryReverseError
from .language import language_converters, LANGUAGES, LANGUAGE_MATRIX, Language
from .script import SCRIPTS, SCRIPT_MATRIX, Script
@@ -0,0 +1,287 @@
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
import collections
from pkg_resources import iter_entry_points, EntryPoint
from ..exceptions import LanguageConvertError, LanguageReverseError
# from https://github.com/kennethreitz/requests/blob/master/requests/structures.py
try:
    # Python 3.3+: the ABCs live in collections.abc (the collections aliases
    # were removed in Python 3.10)
    from collections.abc import Mapping as _Mapping, MutableMapping as _MutableMapping
except ImportError:  # Python 2
    from collections import Mapping as _Mapping, MutableMapping as _MutableMapping


class CaseInsensitiveDict(_MutableMapping):
    """A case-insensitive ``dict``-like object.

    Implements all methods and operations of ``MutableMapping`` as well as
    dict's ``copy``. Also provides ``lower_items``.

    All keys are expected to be strings. The structure remembers the
    case of the last key to be set, and ``iter(instance)``,
    ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()``
    will contain case-sensitive keys. However, querying and contains
    testing is case insensitive:

        cid = CaseInsensitiveDict()
        cid['English'] = 'eng'
        cid['ENGLISH'] == 'eng'  # True
        list(cid) == ['English']  # True

    If the constructor, ``.update``, or equality comparison
    operations are given keys that have equal ``.lower()``s, the
    behavior is undefined.
    """

    def __init__(self, data=None, **kwargs):
        # lowercased key -> (key as last set, value)
        self._store = dict()
        if data is None:
            data = {}
        self.update(data, **kwargs)

    def __setitem__(self, key, value):
        # Use the lowercased key for lookups, but store the actual
        # key alongside the value.
        self._store[key.lower()] = (key, value)

    def __getitem__(self, key):
        return self._store[key.lower()][1]

    def __delitem__(self, key):
        del self._store[key.lower()]

    def __iter__(self):
        return (casedkey for casedkey, mappedvalue in self._store.values())

    def __len__(self):
        return len(self._store)

    def lower_items(self):
        """Like iteritems(), but with all lowercase keys."""
        return (
            (lowerkey, keyval[1])
            for (lowerkey, keyval)
            in self._store.items()
        )

    def __eq__(self, other):
        if isinstance(other, _Mapping):
            other = CaseInsensitiveDict(other)
        else:
            return NotImplemented
        # Compare insensitively
        return dict(self.lower_items()) == dict(other.lower_items())

    # Copy is required
    def copy(self):
        """Return a new CaseInsensitiveDict built from the stored (key, value) pairs."""
        return CaseInsensitiveDict(self._store.values())

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, dict(self.items()))
class LanguageConverter(object):
    """Base class for converters turning an alpha3 language code, together with an
    optional alpha2 country code and script code, into a custom code

    .. attribute:: codes

        Set of possible custom codes

    """
    def convert(self, alpha3, country=None, script=None):
        """Return the custom code for an alpha3 language code, an alpha2 country code
        and a script code; must be implemented by subclasses

        :param string alpha3: ISO-639-3 language code
        :param country: ISO-3166 country code, if any
        :type country: string or None
        :param script: ISO-15924 script code, if any
        :type script: string or None
        :return: the corresponding custom code
        :rtype: string
        :raise: :class:`~babelfish.exceptions.LanguageConvertError`

        """
        raise NotImplementedError()
class LanguageReverseConverter(LanguageConverter):
    """A :class:`LanguageConverter` that can additionally turn a custom code back into
    an alpha3 ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code

    """
    def reverse(self, code):
        """Return the alpha3, country and script code for a custom code; must be
        implemented by subclasses

        :param string code: custom code to reverse
        :return: the corresponding alpha3 ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code
        :rtype: tuple
        :raise: :class:`~babelfish.exceptions.LanguageReverseError`

        """
        raise NotImplementedError()
class LanguageEquivalenceConverter(LanguageReverseConverter):
    """Utility base class that builds a :class:`LanguageReverseConverter` from a single
    dict mapping alpha3 codes to their corresponding symbols.

    Subclasses provide that dict as a class variable named SYMBOLS.

    Setting the class variable CASE_SENSITIVE to ``True`` makes the reverse conversion
    case-sensitive (it is case-insensitive by default).

    Example::

        class MyCodeConverter(babelfish.LanguageEquivalenceConverter):
            CASE_SENSITIVE = True
            SYMBOLS = {'fra': 'mycode1', 'eng': 'mycode2'}

    """
    CASE_SENSITIVE = False

    def __init__(self):
        self.codes = set()
        self.to_symbol = {}
        # plain dict for case-sensitive reverse lookups, CaseInsensitiveDict otherwise
        self.from_symbol = {} if self.CASE_SENSITIVE else CaseInsensitiveDict()

        for alpha3, symbol in self.SYMBOLS.items():
            self.to_symbol[alpha3] = symbol
            self.from_symbol[symbol] = (alpha3, None, None)
            self.codes.add(symbol)

    def convert(self, alpha3, country=None, script=None):
        if alpha3 not in self.to_symbol:
            raise LanguageConvertError(alpha3, country, script)
        return self.to_symbol[alpha3]

    def reverse(self, code):
        if code not in self.from_symbol:
            raise LanguageReverseError(code)
        return self.from_symbol[code]
class CountryConverter(object):
    """Base class for converters turning an alpha2 country code into a custom code

    .. attribute:: codes

        Set of possible custom codes

    """
    def convert(self, alpha2):
        """Return the custom code for an alpha2 country code; must be implemented
        by subclasses

        :param string alpha2: ISO-3166-1 language code
        :return: the corresponding custom code
        :rtype: string
        :raise: :class:`~babelfish.exceptions.CountryConvertError`

        """
        raise NotImplementedError()
class CountryReverseConverter(CountryConverter):
    """A :class:`CountryConverter` that can additionally turn a custom code back into
    an alpha2 ISO-3166-1 country code

    """
    def reverse(self, code):
        """Return the alpha2 code for a custom code; must be implemented by subclasses

        :param string code: custom code to reverse
        :return: the corresponding alpha2 ISO-3166-1 country code
        :rtype: string
        :raise: :class:`~babelfish.exceptions.CountryReverseError`

        """
        raise NotImplementedError()
class ConverterManager(object):
    """Dict-like manager for babelfish converters that loads them lazily

    Lookup order:

        * Entry point converters
        * Registered converters
        * Internal converters

    .. attribute:: entry_point

        The entry point where to look for converters

    .. attribute:: internal_converters

        Internal converters with entry point syntax

    """
    entry_point = ''
    internal_converters = []

    def __init__(self):
        #: Registered converters with entry point syntax
        self.registered_converters = []

        #: Loaded converters
        self.converters = {}

    def __getitem__(self, name):
        """Get a converter, lazy loading it if necessary"""
        if name in self.converters:
            return self.converters[name]

        for ep in iter_entry_points(self.entry_point):
            if ep.name != name:
                continue
            self.converters[ep.name] = ep.load()()
            return self.converters[ep.name]

        declared = self.registered_converters + self.internal_converters
        for ep in (EntryPoint.parse(spec) for spec in declared):
            if ep.name != name:
                continue
            # `require` argument of ep.load() is deprecated in newer versions of setuptools
            if hasattr(ep, 'resolve'):
                factory = ep.resolve()
            elif hasattr(ep, '_load'):
                factory = ep._load()
            else:
                factory = ep.load(require=False)
            self.converters[ep.name] = factory()
            return self.converters[ep.name]

        raise KeyError(name)

    def __setitem__(self, name, converter):
        """Load a converter"""
        self.converters[name] = converter

    def __delitem__(self, name):
        """Unload a converter"""
        del self.converters[name]

    def __iter__(self):
        """Iterator over loaded converters"""
        return iter(self.converters)

    def register(self, entry_point):
        """Register a converter; it is placed ahead of previously registered ones

        :param string entry_point: converter to register (entry point syntax)
        :raise: ValueError if already registered

        """
        already_known = entry_point in self.registered_converters
        if already_known:
            raise ValueError('Already registered')
        self.registered_converters.insert(0, entry_point)

    def unregister(self, entry_point):
        """Unregister a converter

        :param string entry_point: converter to unregister (entry point syntax)

        """
        self.registered_converters.remove(entry_point)

    def __contains__(self, name):
        """Tell whether a converter named *name* is already loaded"""
        return name in self.converters
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha2Converter(LanguageEquivalenceConverter):
    """Case-sensitive converter between alpha3 codes and alpha2 codes.

    Only languages of LANGUAGE_MATRIX that have an alpha2 code are included.
    """
    CASE_SENSITIVE = True
    # built with a comprehension so no loop variable is left behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.alpha2
               for iso_language in LANGUAGE_MATRIX if iso_language.alpha2}
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha3BConverter(LanguageEquivalenceConverter):
    """Case-sensitive converter between alpha3 codes and alpha3b codes.

    Only languages of LANGUAGE_MATRIX that have an alpha3b code are included.
    """
    CASE_SENSITIVE = True
    # built with a comprehension so no loop variable is left behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.alpha3b
               for iso_language in LANGUAGE_MATRIX if iso_language.alpha3b}
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha3TConverter(LanguageEquivalenceConverter):
    """Case-sensitive converter between alpha3 codes and alpha3t codes.

    Only languages of LANGUAGE_MATRIX that have an alpha3t code are included.
    """
    CASE_SENSITIVE = True
    # built with a comprehension so no loop variable is left behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.alpha3t
               for iso_language in LANGUAGE_MATRIX if iso_language.alpha3t}
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import CountryReverseConverter, CaseInsensitiveDict
from ..country import COUNTRY_MATRIX
from ..exceptions import CountryConvertError, CountryReverseError
class CountryNameConverter(CountryReverseConverter):
    """Converter between alpha2 country codes and the country names of COUNTRY_MATRIX.

    Reverse lookups by name are case-insensitive.
    """
    def __init__(self):
        self.codes = set()
        self.to_name = {}
        self.from_name = CaseInsensitiveDict()
        for entry in COUNTRY_MATRIX:
            self.codes.add(entry.name)
            self.to_name[entry.alpha2] = entry.name
            self.from_name[entry.name] = entry.alpha2

    def convert(self, alpha2):
        try:
            return self.to_name[alpha2]
        except KeyError:
            raise CountryConvertError(alpha2)

    def reverse(self, name):
        try:
            return self.from_name[name]
        except KeyError:
            raise CountryReverseError(name)
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class NameConverter(LanguageEquivalenceConverter):
    """Case-insensitive converter between alpha3 codes and language names.

    Only languages of LANGUAGE_MATRIX that have a name are included.
    """
    CASE_SENSITIVE = False
    # built with a comprehension so no loop variable is left behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.name
               for iso_language in LANGUAGE_MATRIX if iso_language.name}
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageReverseConverter, CaseInsensitiveDict
from ..exceptions import LanguageReverseError
from ..language import language_converters
class OpenSubtitlesConverter(LanguageReverseConverter):
    """Converter for OpenSubtitles language codes.

    Codes are alpha3b codes, except for the special cases listed in
    ``to_opensubtitles``. Reversing accepts the special codes of
    ``from_opensubtitles`` (case-insensitively), then alpha3b, then alpha2 codes.
    """
    def __init__(self):
        self.alpha3b_converter = language_converters['alpha3b']
        self.alpha2_converter = language_converters['alpha2']
        # keyed by (alpha3b, country)
        self.to_opensubtitles = {('por', 'BR'): 'pob', ('gre', None): 'ell', ('srp', None): 'scc', ('srp', 'ME'): 'mne'}
        self.from_opensubtitles = CaseInsensitiveDict({'pob': ('por', 'BR'), 'pb': ('por', 'BR'), 'ell': ('ell', None),
                                                       'scc': ('srp', None), 'mne': ('srp', 'ME')})
        # derive the special codes from from_opensubtitles so that every code
        # convert() can emit ('ell' was missing from the hand-written list) is advertised
        self.codes = (self.alpha2_converter.codes | self.alpha3b_converter.codes | set(self.from_opensubtitles))

    def convert(self, alpha3, country=None, script=None):
        alpha3b = self.alpha3b_converter.convert(alpha3, country, script)
        if (alpha3b, country) in self.to_opensubtitles:
            return self.to_opensubtitles[(alpha3b, country)]
        return alpha3b

    def reverse(self, opensubtitles):
        if opensubtitles in self.from_opensubtitles:
            return self.from_opensubtitles[opensubtitles]
        # fall back to the generic converters, alpha3b first
        for conv in [self.alpha3b_converter, self.alpha2_converter]:
            try:
                return conv.reverse(opensubtitles)
            except LanguageReverseError:
                pass
        raise LanguageReverseError(opensubtitles)
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageConverter
from ..exceptions import LanguageConvertError
from ..language import LANGUAGE_MATRIX
class ScopeConverter(LanguageConverter):
    """Converter from an alpha3 code to the full name of the language's scope."""
    FULLNAME = {'I': 'individual', 'M': 'macrolanguage', 'S': 'special'}
    # alpha3 -> scope symbol; a comprehension leaves no loop variable behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.scope for iso_language in LANGUAGE_MATRIX}
    codes = set(SYMBOLS.values())

    def convert(self, alpha3, country=None, script=None):
        """Return the scope name for *alpha3*.

        :raise: :class:`~babelfish.exceptions.LanguageConvertError` when the
            scope symbol has no entry in FULLNAME; KeyError when *alpha3* is
            not in SYMBOLS
        """
        symbol = self.SYMBOLS[alpha3]
        if symbol in self.FULLNAME:
            return self.FULLNAME[symbol]
        raise LanguageConvertError(alpha3, country, script)
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageConverter
from ..exceptions import LanguageConvertError
from ..language import LANGUAGE_MATRIX
class LanguageTypeConverter(LanguageConverter):
    """Converter from an alpha3 code to the full name of the language's type."""
    FULLNAME = {'A': 'ancient', 'C': 'constructed', 'E': 'extinct', 'H': 'historical', 'L': 'living', 'S': 'special'}
    # alpha3 -> type symbol; a comprehension leaves no loop variable behind as a class attribute
    SYMBOLS = {iso_language.alpha3: iso_language.type for iso_language in LANGUAGE_MATRIX}
    codes = set(SYMBOLS.values())

    def convert(self, alpha3, country=None, script=None):
        """Return the type name for *alpha3*.

        :raise: :class:`~babelfish.exceptions.LanguageConvertError` when the
            type symbol has no entry in FULLNAME; KeyError when *alpha3* is
            not in SYMBOLS
        """
        symbol = self.SYMBOLS[alpha3]
        if symbol in self.FULLNAME:
            return self.FULLNAME[symbol]
        raise LanguageConvertError(alpha3, country, script)
@@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport
from .converters import ConverterManager
from . import basestr
# alpha2 code -> country name
COUNTRIES = {}
# every IsoCountry read from the data file, in file order
COUNTRY_MATRIX = []

#: The namedtuple used in the :data:`COUNTRY_MATRIX`
IsoCountry = namedtuple('IsoCountry', ['name', 'alpha2'])

f = resource_stream('babelfish', 'data/iso-3166-1.txt')
try:
    f.readline()  # skip the header line
    for line in f:
        # each data line is "<name>;<alpha2>"
        iso_country = IsoCountry(*line.decode('utf-8').strip().split(';'))
        COUNTRIES[iso_country.alpha2] = iso_country.name
        COUNTRY_MATRIX.append(iso_country)
finally:
    # release the resource even when a line fails to parse
    f.close()
class CountryConverterManager(ConverterManager):
    """The :class:`~babelfish.converters.ConverterManager` dedicated to country converters"""
    # entry point group searched for externally provided converters
    entry_point = 'babelfish.country_converters'
    # converters shipped with babelfish, in entry point syntax
    internal_converters = [
        'name = babelfish.converters.countryname:CountryNameConverter',
    ]


# module-wide manager instance
country_converters = CountryConverterManager()
class CountryMeta(type):
    """Metaclass of :class:`Country`

    Turns a missing class attribute ``frommycode`` into a call to
    :meth:`Country.fromcode` bound to the ``mycode`` `converter`

    """
    def __getattr__(cls, name):
        prefix = 'from'
        if not name.startswith(prefix):
            return type.__getattribute__(cls, name)
        converter_name = name[len(prefix):]
        return partial(cls.fromcode, converter=converter_name)
class Country(CountryMeta(str('CountryBase'), (object,), {})):
    """A country on Earth

    A country is represented by a 2-letter code from the ISO-3166 standard

    :param string country: 2-letter ISO-3166 country code
    :raise: ValueError if `country` is not a key of :data:`COUNTRIES`

    """
    def __init__(self, country):
        if country not in COUNTRIES:
            raise ValueError('%r is not a valid country' % country)

        #: ISO-3166 2-letter country code
        self.alpha2 = country

    @classmethod
    def fromcode(cls, code, converter):
        """Create a :class:`Country` by its `code` using `converter` to
        :meth:`~babelfish.converters.CountryReverseConverter.reverse` it

        :param string code: the code to reverse
        :param string converter: name of the :class:`~babelfish.converters.CountryReverseConverter` to use
        :return: the corresponding :class:`Country` instance
        :rtype: :class:`Country`

        """
        return cls(country_converters[converter].reverse(code))

    def __getstate__(self):
        return self.alpha2

    def __setstate__(self, state):
        self.alpha2 = state

    def __getattr__(self, name):
        """Resolve a missing attribute through the converter of the same name

        :raise: AttributeError if no converter is called `name`

        """
        # an unknown name must surface as AttributeError (not KeyError) so that
        # hasattr() and getattr() with a default keep working
        try:
            converter = country_converters[name]
        except KeyError:
            raise AttributeError(name)
        return converter.convert(self.alpha2)

    def __hash__(self):
        return hash(self.alpha2)

    def __eq__(self, other):
        # a plain string compares against the alpha2 code
        if isinstance(other, basestr):
            return str(self) == other
        if not isinstance(other, Country):
            return False
        return self.alpha2 == other.alpha2

    def __ne__(self, other):
        return not self == other

    def __repr__(self):
        return '<Country [%s]>' % self

    def __str__(self):
        return self.alpha2
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
import os.path
import tempfile
import zipfile
import requests
# directory of this script; downloaded data files are written next to it
DATA_DIR = os.path.dirname(__file__)


def _fetch(url):
    """Download *url* and return the raw response body; raise on an HTTP error status."""
    response = requests.get(url)
    response.raise_for_status()
    return response.content


def _save(filename, content):
    """Write the bytes *content* to *filename* inside DATA_DIR."""
    # binary mode: the payload is bytes and must not be newline-translated
    with open(os.path.join(DATA_DIR, filename), 'wb') as f:
        f.write(content)


def _extract(archive_content, member):
    """Extract *member* from the zip archive bytes *archive_content* into DATA_DIR."""
    with tempfile.TemporaryFile() as f:
        f.write(archive_content)
        with zipfile.ZipFile(f) as z:
            z.extract(member, DATA_DIR)


# Each download completes before its target is opened, so a failed request
# no longer truncates an existing data file.

# iso-3166-1.txt
print('Downloading ISO-3166-1 standard (ISO country codes)...')
_save('iso-3166-1.txt',
      _fetch('http://www.iso.org/iso/home/standards/country_codes/country_names_and_code_elements_txt.htm').strip())

# iso-639-3.tab
print('Downloading ISO-639-3 standard (ISO language codes)...')
_extract(_fetch('http://www-01.sil.org/iso639-3/iso-639-3_Code_Tables_20130531.zip'), 'iso-639-3.tab')

# iso-15924
print('Downloading ISO-15924 standard (ISO script codes)...')
_extract(_fetch('http://www.unicode.org/iso15924/iso15924.txt.zip'), 'iso15924-utf8-20131012.txt')

# opensubtitles supported languages
print('Downloading OpenSubtitles supported languages...')
_save('opensubtitles_languages.txt', _fetch('http://www.opensubtitles.org/addons/export_languages.php'))

print('Done!')
@@ -0,0 +1,250 @@
Country Name;ISO 3166-1-alpha-2 code
AFGHANISTAN;AF
ÅLAND ISLANDS;AX
ALBANIA;AL
ALGERIA;DZ
AMERICAN SAMOA;AS
ANDORRA;AD
ANGOLA;AO
ANGUILLA;AI
ANTARCTICA;AQ
ANTIGUA AND BARBUDA;AG
ARGENTINA;AR
ARMENIA;AM
ARUBA;AW
AUSTRALIA;AU
AUSTRIA;AT
AZERBAIJAN;AZ
BAHAMAS;BS
BAHRAIN;BH
BANGLADESH;BD
BARBADOS;BB
BELARUS;BY
BELGIUM;BE
BELIZE;BZ
BENIN;BJ
BERMUDA;BM
BHUTAN;BT
BOLIVIA, PLURINATIONAL STATE OF;BO
BONAIRE, SINT EUSTATIUS AND SABA;BQ
BOSNIA AND HERZEGOVINA;BA
BOTSWANA;BW
BOUVET ISLAND;BV
BRAZIL;BR
BRITISH INDIAN OCEAN TERRITORY;IO
BRUNEI DARUSSALAM;BN
BULGARIA;BG
BURKINA FASO;BF
BURUNDI;BI
CAMBODIA;KH
CAMEROON;CM
CANADA;CA
CAPE VERDE;CV
CAYMAN ISLANDS;KY
CENTRAL AFRICAN REPUBLIC;CF
CHAD;TD
CHILE;CL
CHINA;CN
CHRISTMAS ISLAND;CX
COCOS (KEELING) ISLANDS;CC
COLOMBIA;CO
COMOROS;KM
CONGO;CG
CONGO, THE DEMOCRATIC REPUBLIC OF THE;CD
COOK ISLANDS;CK
COSTA RICA;CR
CÔTE D'IVOIRE;CI
CROATIA;HR
CUBA;CU
CURAÇAO;CW
CYPRUS;CY
CZECH REPUBLIC;CZ
DENMARK;DK
DJIBOUTI;DJ
DOMINICA;DM
DOMINICAN REPUBLIC;DO
ECUADOR;EC
EGYPT;EG
EL SALVADOR;SV
EQUATORIAL GUINEA;GQ
ERITREA;ER
ESTONIA;EE
ETHIOPIA;ET
FALKLAND ISLANDS (MALVINAS);FK
FAROE ISLANDS;FO
FIJI;FJ
FINLAND;FI
FRANCE;FR
FRENCH GUIANA;GF
FRENCH POLYNESIA;PF
FRENCH SOUTHERN TERRITORIES;TF
GABON;GA
GAMBIA;GM
GEORGIA;GE
GERMANY;DE
GHANA;GH
GIBRALTAR;GI
GREECE;GR
GREENLAND;GL
GRENADA;GD
GUADELOUPE;GP
GUAM;GU
GUATEMALA;GT
GUERNSEY;GG
GUINEA;GN
GUINEA-BISSAU;GW
GUYANA;GY
HAITI;HT
HEARD ISLAND AND MCDONALD ISLANDS;HM
HOLY SEE (VATICAN CITY STATE);VA
HONDURAS;HN
HONG KONG;HK
HUNGARY;HU
ICELAND;IS
INDIA;IN
INDONESIA;ID
IRAN, ISLAMIC REPUBLIC OF;IR
IRAQ;IQ
IRELAND;IE
ISLE OF MAN;IM
ISRAEL;IL
ITALY;IT
JAMAICA;JM
JAPAN;JP
JERSEY;JE
JORDAN;JO
KAZAKHSTAN;KZ
KENYA;KE
KIRIBATI;KI
KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF;KP
KOREA, REPUBLIC OF;KR
KUWAIT;KW
KYRGYZSTAN;KG
LAO PEOPLE'S DEMOCRATIC REPUBLIC;LA
LATVIA;LV
LEBANON;LB
LESOTHO;LS
LIBERIA;LR
LIBYA;LY
LIECHTENSTEIN;LI
LITHUANIA;LT
LUXEMBOURG;LU
MACAO;MO
MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF;MK
MADAGASCAR;MG
MALAWI;MW
MALAYSIA;MY
MALDIVES;MV
MALI;ML
MALTA;MT
MARSHALL ISLANDS;MH
MARTINIQUE;MQ
MAURITANIA;MR
MAURITIUS;MU
MAYOTTE;YT
MEXICO;MX
MICRONESIA, FEDERATED STATES OF;FM
MOLDOVA, REPUBLIC OF;MD
MONACO;MC
MONGOLIA;MN
MONTENEGRO;ME
MONTSERRAT;MS
MOROCCO;MA
MOZAMBIQUE;MZ
MYANMAR;MM
NAMIBIA;NA
NAURU;NR
NEPAL;NP
NETHERLANDS;NL
NEW CALEDONIA;NC
NEW ZEALAND;NZ
NICARAGUA;NI
NIGER;NE
NIGERIA;NG
NIUE;NU
NORFOLK ISLAND;NF
NORTHERN MARIANA ISLANDS;MP
NORWAY;NO
OMAN;OM
PAKISTAN;PK
PALAU;PW
PALESTINE, STATE OF;PS
PANAMA;PA
PAPUA NEW GUINEA;PG
PARAGUAY;PY
PERU;PE
PHILIPPINES;PH
PITCAIRN;PN
POLAND;PL
PORTUGAL;PT
PUERTO RICO;PR
QATAR;QA
RÉUNION;RE
ROMANIA;RO
RUSSIAN FEDERATION;RU
RWANDA;RW
SAINT BARTHÉLEMY;BL
SAINT HELENA, ASCENSION AND TRISTAN DA CUNHA;SH
SAINT KITTS AND NEVIS;KN
SAINT LUCIA;LC
SAINT MARTIN (FRENCH PART);MF
SAINT PIERRE AND MIQUELON;PM
SAINT VINCENT AND THE GRENADINES;VC
SAMOA;WS
SAN MARINO;SM
SAO TOME AND PRINCIPE;ST
SAUDI ARABIA;SA
SENEGAL;SN
SERBIA;RS
SEYCHELLES;SC
SIERRA LEONE;SL
SINGAPORE;SG
SINT MAARTEN (DUTCH PART);SX
SLOVAKIA;SK
SLOVENIA;SI
SOLOMON ISLANDS;SB
SOMALIA;SO
SOUTH AFRICA;ZA
SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS;GS
SOUTH SUDAN;SS
SPAIN;ES
SRI LANKA;LK
SUDAN;SD
SURINAME;SR
SVALBARD AND JAN MAYEN;SJ
SWAZILAND;SZ
SWEDEN;SE
SWITZERLAND;CH
SYRIAN ARAB REPUBLIC;SY
TAIWAN, PROVINCE OF CHINA;TW
TAJIKISTAN;TJ
TANZANIA, UNITED REPUBLIC OF;TZ
THAILAND;TH
TIMOR-LESTE;TL
TOGO;TG
TOKELAU;TK
TONGA;TO
TRINIDAD AND TOBAGO;TT
TUNISIA;TN
TURKEY;TR
TURKMENISTAN;TM
TURKS AND CAICOS ISLANDS;TC
TUVALU;TV
UGANDA;UG
UKRAINE;UA
UNITED ARAB EMIRATES;AE
UNITED KINGDOM;GB
UNITED STATES;US
UNITED STATES MINOR OUTLYING ISLANDS;UM
URUGUAY;UY
UZBEKISTAN;UZ
VANUATU;VU
VENEZUELA, BOLIVARIAN REPUBLIC OF;VE
VIET NAM;VN
VIRGIN ISLANDS, BRITISH;VG
VIRGIN ISLANDS, U.S.;VI
WALLIS AND FUTUNA;WF
WESTERN SAHARA;EH
YEMEN;YE
ZAMBIA;ZM
ZIMBABWE;ZW
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,176 @@
#
# ISO 15924 - Codes for the representation of names of scripts
# Codes pour la représentation des noms d’écritures
# Format:
# Code;N°;English Name;Nom français;PVA;Date
#
Afak;439;Afaka;afaka;;2010-12-21
Aghb;239;Caucasian Albanian;aghbanien;;2012-10-16
Ahom;338;Ahom, Tai Ahom;âhom;;2012-11-01
Arab;160;Arabic;arabe;Arabic;2004-05-01
Armi;124;Imperial Aramaic;araméen impérial;Imperial_Aramaic;2009-06-01
Armn;230;Armenian;arménien;Armenian;2004-05-01
Avst;134;Avestan;avestique;Avestan;2009-06-01
Bali;360;Balinese;balinais;Balinese;2006-10-10
Bamu;435;Bamum;bamoum;Bamum;2009-06-01
Bass;259;Bassa Vah;bassa;;2010-03-26
Batk;365;Batak;batik;Batak;2010-07-23
Beng;325;Bengali;bengalî;Bengali;2004-05-01
Blis;550;Blissymbols;symboles Bliss;;2004-05-01
Bopo;285;Bopomofo;bopomofo;Bopomofo;2004-05-01
Brah;300;Brahmi;brahma;Brahmi;2010-07-23
Brai;570;Braille;braille;Braille;2004-05-01
Bugi;367;Buginese;bouguis;Buginese;2006-06-21
Buhd;372;Buhid;bouhide;Buhid;2004-05-01
Cakm;349;Chakma;chakma;Chakma;2012-02-06
Cans;440;Unified Canadian Aboriginal Syllabics;syllabaire autochtone canadien unifié;Canadian_Aboriginal;2004-05-29
Cari;201;Carian;carien;Carian;2007-07-02
Cham;358;Cham;cham (čam, tcham);Cham;2009-11-11
Cher;445;Cherokee;tchérokî;Cherokee;2004-05-01
Cirt;291;Cirth;cirth;;2004-05-01
Copt;204;Coptic;copte;Coptic;2006-06-21
Cprt;403;Cypriot;syllabaire chypriote;Cypriot;2004-05-01
Cyrl;220;Cyrillic;cyrillique;Cyrillic;2004-05-01
Cyrs;221;Cyrillic (Old Church Slavonic variant);cyrillique (variante slavonne);;2004-05-01
Deva;315;Devanagari (Nagari);dévanâgarî;Devanagari;2004-05-01
Dsrt;250;Deseret (Mormon);déseret (mormon);Deseret;2004-05-01
Dupl;755;Duployan shorthand, Duployan stenography;sténographie Duployé;;2010-07-18
Egyd;070;Egyptian demotic;démotique égyptien;;2004-05-01
Egyh;060;Egyptian hieratic;hiératique égyptien;;2004-05-01
Egyp;050;Egyptian hieroglyphs;hiéroglyphes égyptiens;Egyptian_Hieroglyphs;2009-06-01
Elba;226;Elbasan;elbasan;;2010-07-18
Ethi;430;Ethiopic (Geʻez);éthiopien (geʻez, guèze);Ethiopic;2004-10-25
Geor;240;Georgian (Mkhedruli);géorgien (mkhédrouli);Georgian;2004-05-29
Geok;241;Khutsuri (Asomtavruli and Nuskhuri);khoutsouri (assomtavrouli et nouskhouri);Georgian;2012-10-16
Glag;225;Glagolitic;glagolitique;Glagolitic;2006-06-21
Goth;206;Gothic;gotique;Gothic;2004-05-01
Gran;343;Grantha;grantha;;2009-11-11
Grek;200;Greek;grec;Greek;2004-05-01
Gujr;320;Gujarati;goudjarâtî (gujrâtî);Gujarati;2004-05-01
Guru;310;Gurmukhi;gourmoukhî;Gurmukhi;2004-05-01
Hang;286;Hangul (Hangŭl, Hangeul);hangûl (hangŭl, hangeul);Hangul;2004-05-29
Hani;500;Han (Hanzi, Kanji, Hanja);idéogrammes han (sinogrammes);Han;2009-02-23
Hano;371;Hanunoo (Hanunóo);hanounóo;Hanunoo;2004-05-29
Hans;501;Han (Simplified variant);idéogrammes han (variante simplifiée);;2004-05-29
Hant;502;Han (Traditional variant);idéogrammes han (variante traditionnelle);;2004-05-29
Hatr;127;Hatran;hatrénien;;2012-11-01
Hebr;125;Hebrew;hébreu;Hebrew;2004-05-01
Hira;410;Hiragana;hiragana;Hiragana;2004-05-01
Hluw;080;Anatolian Hieroglyphs (Luwian Hieroglyphs, Hittite Hieroglyphs);hiéroglyphes anatoliens (hiéroglyphes louvites, hiéroglyphes hittites);;2011-12-09
Hmng;450;Pahawh Hmong;pahawh hmong;;2004-05-01
Hrkt;412;Japanese syllabaries (alias for Hiragana + Katakana);syllabaires japonais (alias pour hiragana + katakana);Katakana_Or_Hiragana;2011-06-21
Hung;176;Old Hungarian (Hungarian Runic);runes hongroises (ancien hongrois);;2012-10-16
Inds;610;Indus (Harappan);indus;;2004-05-01
Ital;210;Old Italic (Etruscan, Oscan, etc.);ancien italique (étrusque, osque, etc.);Old_Italic;2004-05-29
Java;361;Javanese;javanais;Javanese;2009-06-01
Jpan;413;Japanese (alias for Han + Hiragana + Katakana);japonais (alias pour han + hiragana + katakana);;2006-06-21
Jurc;510;Jurchen;jurchen;;2010-12-21
Kali;357;Kayah Li;kayah li;Kayah_Li;2007-07-02
Kana;411;Katakana;katakana;Katakana;2004-05-01
Khar;305;Kharoshthi;kharochthî;Kharoshthi;2006-06-21
Khmr;355;Khmer;khmer;Khmer;2004-05-29
Khoj;322;Khojki;khojkî;;2011-06-21
Knda;345;Kannada;kannara (canara);Kannada;2004-05-29
Kore;287;Korean (alias for Hangul + Han);coréen (alias pour hangûl + han);;2007-06-13
Kpel;436;Kpelle;kpèllé;;2010-03-26
Kthi;317;Kaithi;kaithî;Kaithi;2009-06-01
Lana;351;Tai Tham (Lanna);taï tham (lanna);Tai_Tham;2009-06-01
Laoo;356;Lao;laotien;Lao;2004-05-01
Latf;217;Latin (Fraktur variant);latin (variante brisée);;2004-05-01
Latg;216;Latin (Gaelic variant);latin (variante gaélique);;2004-05-01
Latn;215;Latin;latin;Latin;2004-05-01
Lepc;335;Lepcha (Róng);lepcha (róng);Lepcha;2007-07-02
Limb;336;Limbu;limbou;Limbu;2004-05-29
Lina;400;Linear A;linéaire A;;2004-05-01
Linb;401;Linear B;linéaire B;Linear_B;2004-05-29
Lisu;399;Lisu (Fraser);lisu (Fraser);Lisu;2009-06-01
Loma;437;Loma;loma;;2010-03-26
Lyci;202;Lycian;lycien;Lycian;2007-07-02
Lydi;116;Lydian;lydien;Lydian;2007-07-02
Mahj;314;Mahajani;mahâjanî;;2012-10-16
Mand;140;Mandaic, Mandaean;mandéen;Mandaic;2010-07-23
Mani;139;Manichaean;manichéen;;2007-07-15
Maya;090;Mayan hieroglyphs;hiéroglyphes mayas;;2004-05-01
Mend;438;Mende Kikakui;mendé kikakui;;2013-10-12
Merc;101;Meroitic Cursive;cursif méroïtique;Meroitic_Cursive;2012-02-06
Mero;100;Meroitic Hieroglyphs;hiéroglyphes méroïtiques;Meroitic_Hieroglyphs;2012-02-06
Mlym;347;Malayalam;malayâlam;Malayalam;2004-05-01
Modi;323;Modi, Moḍī;modî;;2013-10-12
Moon;218;Moon (Moon code, Moon script, Moon type);écriture Moon;;2006-12-11
Mong;145;Mongolian;mongol;Mongolian;2004-05-01
Mroo;199;Mro, Mru;mro;;2010-12-21
Mtei;337;Meitei Mayek (Meithei, Meetei);meitei mayek;Meetei_Mayek;2009-06-01
Mult;323; Multani;multanî;;2012-11-01
Mymr;350;Myanmar (Burmese);birman;Myanmar;2004-05-01
Narb;106;Old North Arabian (Ancient North Arabian);nord-arabique;;2010-03-26
Nbat;159;Nabataean;nabatéen;;2010-03-26
Nkgb;420;Nakhi Geba ('Na-'Khi ²Ggŏ-¹baw, Naxi Geba);nakhi géba;;2009-02-23
Nkoo;165;NKo;nko;Nko;2006-10-10
Nshu;499;Nüshu;nüshu;;2010-12-21
Ogam;212;Ogham;ogam;Ogham;2004-05-01
Olck;261;Ol Chiki (Ol Cemet, Ol, Santali);ol tchiki;Ol_Chiki;2007-07-02
Orkh;175;Old Turkic, Orkhon Runic;orkhon;Old_Turkic;2009-06-01
Orya;327;Oriya;oriyâ;Oriya;2004-05-01
Osma;260;Osmanya;osmanais;Osmanya;2004-05-01
Palm;126;Palmyrene;palmyrénien;;2010-03-26
Pauc;263;Pau Cin Hau;paou chin haou;;2013-10-12
Perm;227;Old Permic;ancien permien;;2004-05-01
Phag;331;Phags-pa;phags pa;Phags_Pa;2006-10-10
Phli;131;Inscriptional Pahlavi;pehlevi des inscriptions;Inscriptional_Pahlavi;2009-06-01
Phlp;132;Psalter Pahlavi;pehlevi des psautiers;;2007-11-26
Phlv;133;Book Pahlavi;pehlevi des livres;;2007-07-15
Phnx;115;Phoenician;phénicien;Phoenician;2006-10-10
Plrd;282;Miao (Pollard);miao (Pollard);Miao;2012-02-06
Prti;130;Inscriptional Parthian;parthe des inscriptions;Inscriptional_Parthian;2009-06-01
Qaaa;900;Reserved for private use (start);réservé à l’usage privé (début);;2004-05-29
Qabx;949;Reserved for private use (end);réservé à l’usage privé (fin);;2004-05-29
Rjng;363;Rejang (Redjang, Kaganga);redjang (kaganga);Rejang;2009-02-23
Roro;620;Rongorongo;rongorongo;;2004-05-01
Runr;211;Runic;runique;Runic;2004-05-01
Samr;123;Samaritan;samaritain;Samaritan;2009-06-01
Sara;292;Sarati;sarati;;2004-05-29
Sarb;105;Old South Arabian;sud-arabique, himyarite;Old_South_Arabian;2009-06-01
Saur;344;Saurashtra;saurachtra;Saurashtra;2007-07-02
Sgnw;095;SignWriting;SignÉcriture, SignWriting;;2006-10-10
Shaw;281;Shavian (Shaw);shavien (Shaw);Shavian;2004-05-01
Shrd;319;Sharada, Śāradā;charada, shard;Sharada;2012-02-06
Sidd;302;Siddham, Siddhaṃ, Siddhamātṛkā;siddham;;2013-10-12
Sind;318;Khudawadi, Sindhi;khoudawadî, sindhî;;2010-12-21
Sinh;348;Sinhala;singhalais;Sinhala;2004-05-01
Sora;398;Sora Sompeng;sora sompeng;Sora_Sompeng;2012-02-06
Sund;362;Sundanese;sundanais;Sundanese;2007-07-02
Sylo;316;Syloti Nagri;sylotî nâgrî;Syloti_Nagri;2006-06-21
Syrc;135;Syriac;syriaque;Syriac;2004-05-01
Syre;138;Syriac (Estrangelo variant);syriaque (variante estranghélo);;2004-05-01
Syrj;137;Syriac (Western variant);syriaque (variante occidentale);;2004-05-01
Syrn;136;Syriac (Eastern variant);syriaque (variante orientale);;2004-05-01
Tagb;373;Tagbanwa;tagbanoua;Tagbanwa;2004-05-01
Takr;321;Takri, Ṭākrī, Ṭāṅkrī;tâkrî;Takri;2012-02-06
Tale;353;Tai Le;taï-le;Tai_Le;2004-10-25
Talu;354;New Tai Lue;nouveau taï-lue;New_Tai_Lue;2006-06-21
Taml;346;Tamil;tamoul;Tamil;2004-05-01
Tang;520;Tangut;tangoute;;2010-12-21
Tavt;359;Tai Viet;taï viêt;Tai_Viet;2009-06-01
Telu;340;Telugu;télougou;Telugu;2004-05-01
Teng;290;Tengwar;tengwar;;2004-05-01
Tfng;120;Tifinagh (Berber);tifinagh (berbère);Tifinagh;2006-06-21
Tglg;370;Tagalog (Baybayin, Alibata);tagal (baybayin, alibata);Tagalog;2009-02-23
Thaa;170;Thaana;thâna;Thaana;2004-05-01
Thai;352;Thai;thaï;Thai;2004-05-01
Tibt;330;Tibetan;tibétain;Tibetan;2004-05-01
Tirh;326;Tirhuta;tirhouta;;2011-12-09
Ugar;040;Ugaritic;ougaritique;Ugaritic;2004-05-01
Vaii;470;Vai;vaï;Vai;2007-07-02
Visp;280;Visible Speech;parole visible;;2004-05-01
Wara;262;Warang Citi (Varang Kshiti);warang citi;;2009-11-11
Wole;480;Woleai;woléaï;;2010-12-21
Xpeo;030;Old Persian;cunéiforme persépolitain;Old_Persian;2006-06-21
Xsux;020;Cuneiform, Sumero-Akkadian;cunéiforme suméro-akkadien;Cuneiform;2006-10-10
Yiii;460;Yi;yi;Yi;2004-05-01
Zinh;994;Code for inherited script;codet pour écriture héritée;Inherited;2009-02-23
Zmth;995;Mathematical notation;notation mathématique;;2007-11-26
Zsym;996;Symbols;symboles;;2007-11-26
Zxxx;997;Code for unwritten documents;codet pour les documents non écrits;;2011-06-21
Zyyy;998;Code for undetermined script;codet pour écriture indéterminée;Common;2004-05-29
Zzzz;999;Code for uncoded script;codet pour écriture non codée;Unknown;2006-10-10
@@ -0,0 +1,474 @@
IdSubLanguage ISO639 LanguageName UploadEnabled WebEnabled
aar aa Afar, afar 0 0
abk ab Abkhazian 0 0
ace Achinese 0 0
ach Acoli 0 0
ada Adangme 0 0
ady adyghé 0 0
afa Afro-Asiatic (Other) 0 0
afh Afrihili 0 0
afr af Afrikaans 1 0
ain Ainu 0 0
aka ak Akan 0 0
akk Akkadian 0 0
alb sq Albanian 1 1
ale Aleut 0 0
alg Algonquian languages 0 0
alt Southern Altai 0 0
amh am Amharic 0 0
ang English, Old (ca.450-1100) 0 0
apa Apache languages 0 0
ara ar Arabic 1 1
arc Aramaic 0 0
arg an Aragonese 0 0
arm hy Armenian 1 0
arn Araucanian 0 0
arp Arapaho 0 0
art Artificial (Other) 0 0
arw Arawak 0 0
asm as Assamese 0 0
ast Asturian, Bable 0 0
ath Athapascan languages 0 0
aus Australian languages 0 0
ava av Avaric 0 0
ave ae Avestan 0 0
awa Awadhi 0 0
aym ay Aymara 0 0
aze az Azerbaijani 0 0
bad Banda 0 0
bai Bamileke languages 0 0
bak ba Bashkir 0 0
bal Baluchi 0 0
bam bm Bambara 0 0
ban Balinese 0 0
baq eu Basque 1 1
bas Basa 0 0
bat Baltic (Other) 0 0
bej Beja 0 0
bel be Belarusian 0 0
bem Bemba 0 0
ben bn Bengali 1 0
ber Berber (Other) 0 0
bho Bhojpuri 0 0
bih bh Bihari 0 0
bik Bikol 0 0
bin Bini 0 0
bis bi Bislama 0 0
bla Siksika 0 0
bnt Bantu (Other) 0 0
bos bs Bosnian 1 0
bra Braj 0 0
bre br Breton 1 0
btk Batak (Indonesia) 0 0
bua Buriat 0 0
bug Buginese 0 0
bul bg Bulgarian 1 1
bur my Burmese 1 0
byn Blin 0 0
cad Caddo 0 0
cai Central American Indian (Other) 0 0
car Carib 0 0
cat ca Catalan 1 1
cau Caucasian (Other) 0 0
ceb Cebuano 0 0
cel Celtic (Other) 0 0
cha ch Chamorro 0 0
chb Chibcha 0 0
che ce Chechen 0 0
chg Chagatai 0 0
chi zh Chinese 1 1
chk Chuukese 0 0
chm Mari 0 0
chn Chinook jargon 0 0
cho Choctaw 0 0
chp Chipewyan 0 0
chr Cherokee 0 0
chu cu Church Slavic 0 0
chv cv Chuvash 0 0
chy Cheyenne 0 0
cmc Chamic languages 0 0
cop Coptic 0 0
cor kw Cornish 0 0
cos co Corsican 0 0
cpe Creoles and pidgins, English based (Other) 0 0
cpf Creoles and pidgins, French-based (Other) 0 0
cpp Creoles and pidgins, Portuguese-based (Other) 0 0
cre cr Cree 0 0
crh Crimean Tatar 0 0
crp Creoles and pidgins (Other) 0 0
csb Kashubian 0 0
cus Cushitic (Other)' couchitiques, autres langues 0 0
cze cs Czech 1 1
dak Dakota 0 0
dan da Danish 1 1
dar Dargwa 0 0
day Dayak 0 0
del Delaware 0 0
den Slave (Athapascan) 0 0
dgr Dogrib 0 0
din Dinka 0 0
div dv Divehi 0 0
doi Dogri 0 0
dra Dravidian (Other) 0 0
dua Duala 0 0
dum Dutch, Middle (ca.1050-1350) 0 0
dut nl Dutch 1 1
dyu Dyula 0 0
dzo dz Dzongkha 0 0
efi Efik 0 0
egy Egyptian (Ancient) 0 0
eka Ekajuk 0 0
elx Elamite 0 0
eng en English 1 1
enm English, Middle (1100-1500) 0 0
epo eo Esperanto 1 0
est et Estonian 1 1
ewe ee Ewe 0 0
ewo Ewondo 0 0
fan Fang 0 0
fao fo Faroese 0 0
fat Fanti 0 0
fij fj Fijian 0 0
fil Filipino 0 0
fin fi Finnish 1 1
fiu Finno-Ugrian (Other) 0 0
fon Fon 0 0
fre fr French 1 1
frm French, Middle (ca.1400-1600) 0 0
fro French, Old (842-ca.1400) 0 0
fry fy Frisian 0 0
ful ff Fulah 0 0
fur Friulian 0 0
gaa Ga 0 0
gay Gayo 0 0
gba Gbaya 0 0
gem Germanic (Other) 0 0
geo ka Georgian 1 1
ger de German 1 1
gez Geez 0 0
gil Gilbertese 0 0
gla gd Gaelic 0 0
gle ga Irish 0 0
glg gl Galician 1 1
glv gv Manx 0 0
gmh German, Middle High (ca.1050-1500) 0 0
goh German, Old High (ca.750-1050) 0 0
gon Gondi 0 0
gor Gorontalo 0 0
got Gothic 0 0
grb Grebo 0 0
grc Greek, Ancient (to 1453) 0 0
ell el Greek 1 1
grn gn Guarani 0 0
guj gu Gujarati 0 0
gwi Gwich´in 0 0
hai Haida 0 0
hat ht Haitian 0 0
hau ha Hausa 0 0
haw Hawaiian 0 0
heb he Hebrew 1 1
her hz Herero 0 0
hil Hiligaynon 0 0
him Himachali 0 0
hin hi Hindi 1 1
hit Hittite 0 0
hmn Hmong 0 0
hmo ho Hiri Motu 0 0
hrv hr Croatian 1 1
hun hu Hungarian 1 1
hup Hupa 0 0
iba Iban 0 0
ibo ig Igbo 0 0
ice is Icelandic 1 1
ido io Ido 0 0
iii ii Sichuan Yi 0 0
ijo Ijo 0 0
iku iu Inuktitut 0 0
ile ie Interlingue 0 0
ilo Iloko 0 0
ina ia Interlingua (International Auxiliary Language Asso 0 0
inc Indic (Other) 0 0
ind id Indonesian 1 1
ine Indo-European (Other) 0 0
inh Ingush 0 0
ipk ik Inupiaq 0 0
ira Iranian (Other) 0 0
iro Iroquoian languages 0 0
ita it Italian 1 1
jav jv Javanese 0 0
jpn ja Japanese 1 1
jpr Judeo-Persian 0 0
jrb Judeo-Arabic 0 0
kaa Kara-Kalpak 0 0
kab Kabyle 0 0
kac Kachin 0 0
kal kl Kalaallisut 0 0
kam Kamba 0 0
kan kn Kannada 0 0
kar Karen 0 0
kas ks Kashmiri 0 0
kau kr Kanuri 0 0
kaw Kawi 0 0
kaz kk Kazakh 1 0
kbd Kabardian 0 0
kha Khasi 0 0
khi Khoisan (Other) 0 0
khm km Khmer 1 1
kho Khotanese 0 0
kik ki Kikuyu 0 0
kin rw Kinyarwanda 0 0
kir ky Kirghiz 0 0
kmb Kimbundu 0 0
kok Konkani 0 0
kom kv Komi 0 0
kon kg Kongo 0 0
kor ko Korean 1 1
kos Kosraean 0 0
kpe Kpelle 0 0
krc Karachay-Balkar 0 0
kro Kru 0 0
kru Kurukh 0 0
kua kj Kuanyama 0 0
kum Kumyk 0 0
kur ku Kurdish 0 0
kut Kutenai 0 0
lad Ladino 0 0
lah Lahnda 0 0
lam Lamba 0 0
lao lo Lao 0 0
lat la Latin 0 0
lav lv Latvian 1 0
lez Lezghian 0 0
lim li Limburgan 0 0
lin ln Lingala 0 0
lit lt Lithuanian 1 0
lol Mongo 0 0
loz Lozi 0 0
ltz lb Luxembourgish 1 0
lua Luba-Lulua 0 0
lub lu Luba-Katanga 0 0
lug lg Ganda 0 0
lui Luiseno 0 0
lun Lunda 0 0
luo Luo (Kenya and Tanzania) 0 0
lus lushai 0 0
mac mk Macedonian 1 1
mad Madurese 0 0
mag Magahi 0 0
mah mh Marshallese 0 0
mai Maithili 0 0
mak Makasar 0 0
mal ml Malayalam 1 0
man Mandingo 0 0
mao mi Maori 0 0
map Austronesian (Other) 0 0
mar mr Marathi 0 0
mas Masai 0 0
may ms Malay 1 1
mdf Moksha 0 0
mdr Mandar 0 0
men Mende 0 0
mga Irish, Middle (900-1200) 0 0
mic Mi'kmaq 0 0
min Minangkabau 0 0
mis Miscellaneous languages 0 0
mkh Mon-Khmer (Other) 0 0
mlg mg Malagasy 0 0
mlt mt Maltese 0 0
mnc Manchu 0 0
mni Manipuri 0 0
mno Manobo languages 0 0
moh Mohawk 0 0
mol mo Moldavian 0 0
mon mn Mongolian 1 0
mos Mossi 0 0
mwl Mirandese 0 0
mul Multiple languages 0 0
mun Munda languages 0 0
mus Creek 0 0
mwr Marwari 0 0
myn Mayan languages 0 0
myv Erzya 0 0
nah Nahuatl 0 0
nai North American Indian 0 0
nap Neapolitan 0 0
nau na Nauru 0 0
nav nv Navajo 0 0
nbl nr Ndebele, South 0 0
nde nd Ndebele, North 0 0
ndo ng Ndonga 0 0
nds Low German 0 0
nep ne Nepali 0 0
new Nepal Bhasa 0 0
nia Nias 0 0
nic Niger-Kordofanian (Other) 0 0
niu Niuean 0 0
nno nn Norwegian Nynorsk 0 0
nob nb Norwegian Bokmal 0 0
nog Nogai 0 0
non Norse, Old 0 0
nor no Norwegian 1 1
nso Northern Sotho 0 0
nub Nubian languages 0 0
nwc Classical Newari 0 0
nya ny Chichewa 0 0
nym Nyamwezi 0 0
nyn Nyankole 0 0
nyo Nyoro 0 0
nzi Nzima 0 0
oci oc Occitan 1 1
oji oj Ojibwa 0 0
ori or Oriya 0 0
orm om Oromo 0 0
osa Osage 0 0
oss os Ossetian 0 0
ota Turkish, Ottoman (1500-1928) 0 0
oto Otomian languages 0 0
paa Papuan (Other) 0 0
pag Pangasinan 0 0
pal Pahlavi 0 0
pam Pampanga 0 0
pan pa Panjabi 0 0
pap Papiamento 0 0
pau Palauan 0 0
peo Persian, Old (ca.600-400 B.C.) 0 0
per fa Persian 1 1
phi Philippine (Other) 0 0
phn Phoenician 0 0
pli pi Pali 0 0
pol pl Polish 1 1
pon Pohnpeian 0 0
por pt Portuguese 1 1
pra Prakrit languages 0 0
pro Provençal, Old (to 1500) 0 0
pus ps Pushto 0 0
que qu Quechua 0 0
raj Rajasthani 0 0
rap Rapanui 0 0
rar Rarotongan 0 0
roa Romance (Other) 0 0
roh rm Raeto-Romance 0 0
rom Romany 0 0
run rn Rundi 0 0
rup Aromanian 0 0
rus ru Russian 1 1
sad Sandawe 0 0
sag sg Sango 0 0
sah Yakut 0 0
sai South American Indian (Other) 0 0
sal Salishan languages 0 0
sam Samaritan Aramaic 0 0
san sa Sanskrit 0 0
sas Sasak 0 0
sat Santali 0 0
scc sr Serbian 1 1
scn Sicilian 0 0
sco Scots 0 0
sel Selkup 0 0
sem Semitic (Other) 0 0
sga Irish, Old (to 900) 0 0
sgn Sign Languages 0 0
shn Shan 0 0
sid Sidamo 0 0
sin si Sinhalese 1 1
sio Siouan languages 0 0
sit Sino-Tibetan (Other) 0 0
sla Slavic (Other) 0 0
slo sk Slovak 1 1
slv sl Slovenian 1 1
sma Southern Sami 0 0
sme se Northern Sami 0 0
smi Sami languages (Other) 0 0
smj Lule Sami 0 0
smn Inari Sami 0 0
smo sm Samoan 0 0
sms Skolt Sami 0 0
sna sn Shona 0 0
snd sd Sindhi 0 0
snk Soninke 0 0
sog Sogdian 0 0
som so Somali 0 0
son Songhai 0 0
sot st Sotho, Southern 0 0
spa es Spanish 1 1
srd sc Sardinian 0 0
srr Serer 0 0
ssa Nilo-Saharan (Other) 0 0
ssw ss Swati 0 0
suk Sukuma 0 0
sun su Sundanese 0 0
sus Susu 0 0
sux Sumerian 0 0
swa sw Swahili 1 0
swe sv Swedish 1 1
syr Syriac 1 0
tah ty Tahitian 0 0
tai Tai (Other) 0 0
tam ta Tamil 1 0
tat tt Tatar 0 0
tel te Telugu 1 0
tem Timne 0 0
ter Tereno 0 0
tet Tetum 0 0
tgk tg Tajik 0 0
tgl tl Tagalog 1 1
tha th Thai 1 1
tib bo Tibetan 0 0
tig Tigre 0 0
tir ti Tigrinya 0 0
tiv Tiv 0 0
tkl Tokelau 0 0
tlh Klingon 0 0
tli Tlingit 0 0
tmh Tamashek 0 0
tog Tonga (Nyasa) 0 0
ton to Tonga (Tonga Islands) 0 0
tpi Tok Pisin 0 0
tsi Tsimshian 0 0
tsn tn Tswana 0 0
tso ts Tsonga 0 0
tuk tk Turkmen 0 0
tum Tumbuka 0 0
tup Tupi languages 0 0
tur tr Turkish 1 1
tut Altaic (Other) 0 0
tvl Tuvalu 0 0
twi tw Twi 0 0
tyv Tuvinian 0 0
udm Udmurt 0 0
uga Ugaritic 0 0
uig ug Uighur 0 0
ukr uk Ukrainian 1 1
umb Umbundu 0 0
und Undetermined 0 0
urd ur Urdu 1 0
uzb uz Uzbek 0 0
vai Vai 0 0
ven ve Venda 0 0
vie vi Vietnamese 1 1
vol vo Volapük 0 0
vot Votic 0 0
wak Wakashan languages 0 0
wal Walamo 0 0
war Waray 0 0
was Washo 0 0
wel cy Welsh 0 0
wen Sorbian languages 0 0
wln wa Walloon 0 0
wol wo Wolof 0 0
xal Kalmyk 0 0
xho xh Xhosa 0 0
yao Yao 0 0
yap Yapese 0 0
yid yi Yiddish 0 0
yor yo Yoruba 0 0
ypk Yupik languages 0 0
zap Zapotec 0 0
zen Zenaga 0 0
zha za Zhuang 0 0
znd Zande 0 0
zul zu Zulu 0 0
zun Zuni 0 0
rum ro Romanian 1 1
pob pb Brazilian 1 1
mne Montenegrin 1 0
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
class Error(Exception):
    """Root of the babelfish exception hierarchy"""
class LanguageError(Error, AttributeError):
    """Common parent of every language-related babelfish exception

    Also derives from :class:`AttributeError`.
    """
class LanguageConvertError(LanguageError):
    """Exception raised by converters when :meth:`~babelfish.converters.LanguageConverter.convert` fails

    :param string alpha3: alpha3 code that failed conversion
    :param country: country code that failed conversion, if any
    :type country: string or None
    :param script: script code that failed conversion, if any
    :type script: string or None

    """
    def __init__(self, alpha3, country=None, script=None):
        self.alpha3 = alpha3
        self.country = country
        self.script = script

    def __str__(self):
        # alpha3, then "-country" and "-script" for whichever are set
        text = self.alpha3
        for optional_part in (self.country, self.script):
            if optional_part is not None:
                text += '-' + optional_part
        return text
class LanguageReverseError(LanguageError):
    """Exception raised by converters when :meth:`~babelfish.converters.LanguageReverseConverter.reverse` fails

    :param string code: code that failed reverse conversion

    """
    def __init__(self, code):
        # Keep the offending code available to handlers.
        self.code = code

    def __str__(self):
        # The message is the repr of the code, so quotes are included.
        return repr(self.code)
class CountryError(Error, AttributeError):
    """Common parent of every country-related babelfish exception

    Also derives from :class:`AttributeError`.
    """
class CountryConvertError(CountryError):
    """Exception raised by converters when :meth:`~babelfish.converters.CountryConverter.convert` fails

    :param string alpha2: alpha2 code that failed conversion

    """
    def __init__(self, alpha2):
        # Keep the offending code available to handlers.
        self.alpha2 = alpha2

    def __str__(self):
        # The message is the bare alpha2 code.
        return self.alpha2
class CountryReverseError(CountryError):
    """Exception raised by converters when :meth:`~babelfish.converters.CountryReverseConverter.reverse` fails

    :param string code: code that failed reverse conversion

    """
    def __init__(self, code):
        # Keep the offending code available to handlers.
        self.code = code

    def __str__(self):
        # The message is the repr of the code, so quotes are included.
        return repr(self.code)
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport
from .converters import ConverterManager
from .country import Country
from .exceptions import LanguageConvertError
from .script import Script
from . import basestr
#: Set of the alpha3 codes found in the bundled ISO-639-3 table
LANGUAGES = set()

#: List of the table rows as :class:`IsoLanguage` namedtuples
LANGUAGE_MATRIX = []

#: The namedtuple used in the :data:`LANGUAGE_MATRIX`
IsoLanguage = namedtuple('IsoLanguage', ['alpha3', 'alpha3b', 'alpha3t', 'alpha2', 'scope', 'type', 'name', 'comment'])

f = resource_stream('babelfish', 'data/iso-639-3.tab')
try:
    # The first line is skipped (presumably the column-header row; the data
    # file itself is not visible here).
    f.readline()
    for l in f:
        # Remove only the line terminator before splitting: otherwise it ends
        # up inside the last field (``comment``). A plain strip() would also
        # eat trailing tabs and change the number of fields.
        iso_language = IsoLanguage(*l.decode('utf-8').rstrip('\r\n').split('\t'))
        LANGUAGES.add(iso_language.alpha3)
        LANGUAGE_MATRIX.append(iso_language)
finally:
    # Release the stream even when a row fails to parse.
    f.close()
class LanguageConverterManager(ConverterManager):
    """:class:`~babelfish.converters.ConverterManager` for language converters"""
    # Name under which language converters are registered
    # (presumably a setuptools entry-point group; confirm in ConverterManager).
    entry_point = 'babelfish.language_converters'
    # Converters shipped with babelfish, each written as
    # ``<converter name> = <module path>:<class name>``.
    internal_converters = ['alpha2 = babelfish.converters.alpha2:Alpha2Converter',
                           'alpha3b = babelfish.converters.alpha3b:Alpha3BConverter',
                           'alpha3t = babelfish.converters.alpha3t:Alpha3TConverter',
                           'name = babelfish.converters.name:NameConverter',
                           'scope = babelfish.converters.scope:ScopeConverter',
                           'type = babelfish.converters.type:LanguageTypeConverter',
                           'opensubtitles = babelfish.converters.opensubtitles:OpenSubtitlesConverter']

# Module-wide manager instance; Language.fromcode and Language.__getattr__
# look converters up in it by name.
language_converters = LanguageConverterManager()
class LanguageMeta(type):
    """Metaclass of :class:`Language`

    Class attributes named ``from<converter>`` that normal lookup cannot find
    are mapped to :meth:`Language.fromcode` with ``converter`` preset to the
    part of the name that follows ``from``.

    """
    def __getattr__(cls, name):
        if not name.startswith('from'):
            # Not a dynamic constructor: use the regular lookup
            return type.__getattribute__(cls, name)
        converter_name = name[len('from'):]
        return partial(cls.fromcode, converter=converter_name)
class Language(LanguageMeta(str('LanguageBase'), (object,), {})):
    """A human language

    A human language is composed of a language part following the ISO-639
    standard and can be country-specific when a :class:`~babelfish.country.Country`
    is specified.

    The :class:`Language` is extensible with custom converters (see :ref:`custom_converters`)

    :param string language: the language as a 3-letter ISO-639-3 code
    :param country: the country (if any) as a 2-letter ISO-3166 code or :class:`~babelfish.country.Country` instance
    :type country: string or :class:`~babelfish.country.Country` or None
    :param script: the script (if any) as a 4-letter ISO-15924 code or :class:`~babelfish.script.Script` instance
    :type script: string or :class:`~babelfish.script.Script` or None
    :param unknown: the unknown language as a three-letters ISO-639-3 code to use as fallback
    :type unknown: string or None
    :raise: ValueError if the language (after applying the `unknown` fallback, if given)
        is not in :data:`LANGUAGES`

    """
    def __init__(self, language, country=None, script=None, unknown=None):
        # Fall back to `unknown` only when the requested code is not known
        if unknown is not None and language not in LANGUAGES:
            language = unknown
        if language not in LANGUAGES:
            raise ValueError('%r is not a valid language' % language)
        self.alpha3 = language

        # None and ready-made instances are stored as-is, codes are wrapped
        if country is None or isinstance(country, Country):
            self.country = country
        else:
            self.country = Country(country)

        if script is None or isinstance(script, Script):
            self.script = script
        else:
            self.script = Script(script)

    @classmethod
    def fromcode(cls, code, converter):
        """Create a :class:`Language` by its `code` using `converter` to
        :meth:`~babelfish.converters.LanguageReverseConverter.reverse` it

        :param string code: the code to reverse
        :param string converter: name of the :class:`~babelfish.converters.LanguageReverseConverter` to use
        :return: the corresponding :class:`Language` instance
        :rtype: :class:`Language`

        """
        return cls(*language_converters[converter].reverse(code))

    @classmethod
    def fromietf(cls, ietf):
        """Create a :class:`Language` from an IETF language code

        The first subtag is the language: 2 letters are read as an alpha2
        code, anything else is passed to the constructor as-is. Each following
        2-letter subtag sets the country, any other subtag sets the script.
        Parsing stops once a script is set.

        :param string ietf: the ietf code
        :return: the corresponding :class:`Language` instance
        :rtype: :class:`Language`
        :raise: ValueError if subtags remain after the script

        """
        subtags = ietf.split('-')
        language_subtag = subtags.pop(0).lower()
        if len(language_subtag) == 2:
            language = cls.fromalpha2(language_subtag)
        else:
            language = cls(language_subtag)
        while subtags:
            subtag = subtags.pop(0)
            if len(subtag) == 2:
                language.country = Country(subtag.upper())
            else:
                language.script = Script(subtag.capitalize())
            if language.script is not None:
                # The script has to be the last subtag
                if subtags:
                    raise ValueError('Wrong IETF format. Unmatched subtags: %r' % subtags)
                break
        return language

    def __getstate__(self):
        return self.alpha3, self.country, self.script

    def __setstate__(self, state):
        self.alpha3, self.country, self.script = state

    def __getattr__(self, name):
        """Resolve a missing attribute through the language converter called `name`

        :raise: AttributeError if no converter is registered under `name`

        """
        # This hook only runs when normal lookup fails. If one of the
        # instance's own fields is what is missing (instance created without
        # __init__ and not yet restored), bail out here: reading self.alpha3
        # below would re-enter this method until the recursion limit is hit.
        if name in ('alpha3', 'country', 'script'):
            raise AttributeError(name)
        alpha3 = self.alpha3
        country = self.country.alpha2 if self.country is not None else None
        script = self.script.code if self.script is not None else None
        try:
            return language_converters[name].convert(alpha3, country, script)
        except KeyError:
            raise AttributeError(name)

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # Strings are compared with the string form of the language
        if isinstance(other, basestr):
            return str(self) == other
        if not isinstance(other, Language):
            return False
        return (self.alpha3 == other.alpha3 and
                self.country == other.country and
                self.script == other.script)

    def __ne__(self, other):
        return not self == other

    def __bool__(self):
        # Only the 'und' code makes a language falsy
        return self.alpha3 != 'und'
    __nonzero__ = __bool__

    def __repr__(self):
        return '<Language [%s]>' % self

    def __str__(self):
        # Prefer the alpha2 code, fall back to alpha3 when conversion fails
        try:
            s = self.alpha2
        except LanguageConvertError:
            s = self.alpha3
        if self.country is not None:
            s += '-' + str(self.country)
        if self.script is not None:
            s += '-' + str(self.script)
        return s
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from pkg_resources import resource_stream # @UnresolvedImport
from . import basestr
#: Script code to script name mapping
SCRIPTS = {}

#: List of scripts in the ISO-15924 as namedtuple of code, number, name, french_name, pva and date
SCRIPT_MATRIX = []

#: The namedtuple used in the :data:`SCRIPT_MATRIX`
IsoScript = namedtuple('IsoScript', ['code', 'number', 'name', 'french_name', 'pva', 'date'])

f = resource_stream('babelfish', 'data/iso15924-utf8-20131012.txt')
try:
    # The first line is always skipped; later blank and '#' lines are
    # filtered inside the loop.
    f.readline()
    for l in f:
        l = l.decode('utf-8').strip()
        if not l or l.startswith('#'):
            continue
        script = IsoScript._make(l.split(';'))
        SCRIPT_MATRIX.append(script)
        SCRIPTS[script.code] = script.name
finally:
    # Release the stream even when a row fails to parse.
    f.close()
class Script(object):
    """A human writing system

    A script is identified by its 4-letter code from the ISO-15924 standard

    :param string script: 4-letter ISO-15924 script code
    :raise: ValueError if `script` is not a key of :data:`SCRIPTS`

    """
    def __init__(self, script):
        if script in SCRIPTS:
            #: ISO-15924 4-letter script code
            self.code = script
        else:
            raise ValueError('%r is not a valid script' % script)

    @property
    def name(self):
        """English name of the script"""
        return SCRIPTS[self.code]

    def __getstate__(self):
        # The code alone is the whole state
        return self.code

    def __setstate__(self, state):
        self.code = state

    def __hash__(self):
        return hash(self.code)

    def __eq__(self, other):
        # Strings are compared with the code directly
        if isinstance(other, basestr):
            return self.code == other
        return isinstance(other, Script) and self.code == other.code

    def __ne__(self, other):
        return not self == other

    def __repr__(self):
        return '<Script [%s]>' % self

    def __str__(self):
        return self.code
@@ -0,0 +1,368 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
import re
import sys
import pickle
from unittest import TestCase, TestSuite, TestLoader, TextTestRunner
from pkg_resources import resource_stream # @UnresolvedImport
from babelfish import (LANGUAGES, Language, Country, Script, language_converters, country_converters,
LanguageReverseConverter, LanguageConvertError, LanguageReverseError, CountryReverseError)
# Python <= 2.6 lacks several TestCase assertion helpers used by the tests
# below.  On those interpreters a mixin supplies them; on newer ones the
# mixin is empty and TestCase's own methods are used.
if sys.version_info[:2] <= (2, 6):
    _MAX_LENGTH = 80

    def safe_repr(obj, short=False):
        """Return repr(obj), falling back to object.__repr__ if repr() raises.

        With ``short`` true the result is truncated to _MAX_LENGTH characters.
        """
        try:
            result = repr(obj)
        except Exception:
            result = object.__repr__(obj)
        if not short or len(result) < _MAX_LENGTH:
            return result
        return result[:_MAX_LENGTH] + ' [truncated]...'

    class _AssertRaisesContext(object):
        """A context manager used to implement TestCase.assertRaises* methods."""

        def __init__(self, expected, test_case, expected_regexp=None):
            self.expected = expected
            self.failureException = test_case.failureException
            self.expected_regexp = expected_regexp

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, tb):
            if exc_type is None:
                try:
                    exc_name = self.expected.__name__
                except AttributeError:
                    exc_name = str(self.expected)
                raise self.failureException(
                    "{0} not raised".format(exc_name))
            if not issubclass(exc_type, self.expected):
                # let unexpected exceptions pass through
                return False
            self.exception = exc_value  # store for later retrieval
            if self.expected_regexp is None:
                return True

            expected_regexp = self.expected_regexp
            if isinstance(expected_regexp, basestring):
                expected_regexp = re.compile(expected_regexp)
            if not expected_regexp.search(str(exc_value)):
                raise self.failureException('"%s" does not match "%s"' %
                                            (expected_regexp.pattern, str(exc_value)))
            return True

    class _Py26FixTestCase(object):
        """Mixin providing assertion helpers for old unittest versions."""

        def _formatMessage(self, msg, standardMsg):
            """Pick the failure message: the caller's ``msg`` or the default.

            The helpers below call this method; it is defined here so they
            do not depend on the TestCase base class providing it.
            """
            return msg or standardMsg

        def assertIsNone(self, obj, msg=None):
            """Same as self.assertTrue(obj is None), with a nicer default message."""
            if obj is not None:
                standardMsg = '%s is not None' % (safe_repr(obj),)
                self.fail(self._formatMessage(msg, standardMsg))

        def assertIsNotNone(self, obj, msg=None):
            """Included for symmetry with assertIsNone."""
            if obj is None:
                standardMsg = 'unexpectedly None'
                self.fail(self._formatMessage(msg, standardMsg))

        def assertIn(self, member, container, msg=None):
            """Just like self.assertTrue(a in b), but with a nicer default message."""
            if member not in container:
                standardMsg = '%s not found in %s' % (safe_repr(member),
                                                      safe_repr(container))
                self.fail(self._formatMessage(msg, standardMsg))

        def assertNotIn(self, member, container, msg=None):
            """Just like self.assertTrue(a not in b), but with a nicer default message."""
            if member in container:
                standardMsg = '%s unexpectedly found in %s' % (safe_repr(member),
                                                               safe_repr(container))
                self.fail(self._formatMessage(msg, standardMsg))

        def assertIs(self, expr1, expr2, msg=None):
            """Just like self.assertTrue(a is b), but with a nicer default message."""
            if expr1 is not expr2:
                standardMsg = '%s is not %s' % (safe_repr(expr1),
                                                safe_repr(expr2))
                self.fail(self._formatMessage(msg, standardMsg))

        def assertIsNot(self, expr1, expr2, msg=None):
            """Just like self.assertTrue(a is not b), but with a nicer default message."""
            if expr1 is expr2:
                standardMsg = 'unexpectedly identical: %s' % (safe_repr(expr1),)
                self.fail(self._formatMessage(msg, standardMsg))
else:
    class _Py26FixTestCase(object):
        """Empty mixin: nothing needs to be backported on this interpreter."""
        pass
class TestScript(TestCase, _Py26FixTestCase):
    """Tests for construction, comparison, hashing and pickling of Script."""

    def test_wrong_script(self):
        # 'Azer' must be rejected by the constructor.
        self.assertRaises(ValueError, Script, 'Azer')

    def test_eq(self):
        first = Script('Latn')
        second = Script('Latn')
        self.assertEqual(first, second)

    def test_ne(self):
        cyrillic = Script('Cyrl')
        latin = Script('Latn')
        self.assertNotEqual(cyrillic, latin)

    def test_hash(self):
        # A Script hashes like its code string.
        code = 'Hira'
        self.assertEqual(hash(Script(code)), hash(code))

    def test_pickle(self):
        restored = pickle.loads(pickle.dumps(Script('Latn')))
        self.assertEqual(restored, Script('Latn'))
class TestCountry(TestCase, _Py26FixTestCase):
    """Tests for Country: validation, comparison, hashing, pickling, name converter."""

    def test_wrong_country(self):
        # 'ZZ' must be rejected by the constructor.
        self.assertRaises(ValueError, Country, 'ZZ')

    def test_eq(self):
        first = Country('US')
        second = Country('US')
        self.assertEqual(first, second)

    def test_ne(self):
        self.assertNotEqual(Country('GB'), Country('US'))
        self.assertIsNotNone(Country('US'))

    def test_hash(self):
        # A Country hashes like its code string.
        code = 'US'
        self.assertEqual(hash(Country(code)), hash(code))

    def test_pickle(self):
        for code in ('GB', 'US'):
            country = Country(code)
            restored = pickle.loads(pickle.dumps(country))
            self.assertEqual(restored, country)

    def test_converter_name(self):
        united_states = Country('US')
        self.assertEqual(Country('US').name, 'UNITED STATES')
        self.assertEqual(Country.fromname('UNITED STATES'), united_states)
        self.assertEqual(Country.fromcode('UNITED STATES', 'name'), united_states)
        self.assertRaises(CountryReverseError, lambda: Country.fromname('ZZZZZ'))
        known_names = country_converters['name'].codes
        self.assertEqual(len(known_names), 249)
class TestLanguage(TestCase, _Py26FixTestCase):
    """Tests for Language: converters, IETF parsing, comparison, hashing,
    pickling, string form and dynamic converter registration."""

    def test_languages(self):
        self.assertEqual(len(LANGUAGES), 7874)

    def test_wrong_language(self):
        self.assertRaises(ValueError, lambda: Language('zzz'))

    def test_unknown_language(self):
        self.assertEqual(Language('zzzz', unknown='und'), Language('und'))

    def test_converter_alpha2(self):
        self.assertEqual(Language('eng').alpha2, 'en')
        self.assertEqual(Language.fromalpha2('en'), Language('eng'))
        self.assertEqual(Language.fromcode('en', 'alpha2'), Language('eng'))
        self.assertRaises(LanguageReverseError, lambda: Language.fromalpha2('zz'))
        self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha2)
        self.assertEqual(len(language_converters['alpha2'].codes), 184)

    def test_converter_alpha3b(self):
        self.assertEqual(Language('fra').alpha3b, 'fre')
        self.assertEqual(Language.fromalpha3b('fre'), Language('fra'))
        self.assertEqual(Language.fromcode('fre', 'alpha3b'), Language('fra'))
        self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3b('zzz'))
        self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3b)
        self.assertEqual(len(language_converters['alpha3b'].codes), 418)

    def test_converter_alpha3t(self):
        self.assertEqual(Language('fra').alpha3t, 'fra')
        self.assertEqual(Language.fromalpha3t('fra'), Language('fra'))
        self.assertEqual(Language.fromcode('fra', 'alpha3t'), Language('fra'))
        self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3t('zzz'))
        self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3t)
        self.assertEqual(len(language_converters['alpha3t'].codes), 418)

    def test_converter_name(self):
        self.assertEqual(Language('eng').name, 'English')
        self.assertEqual(Language.fromname('English'), Language('eng'))
        self.assertEqual(Language.fromcode('English', 'name'), Language('eng'))
        self.assertRaises(LanguageReverseError, lambda: Language.fromname('Zzzzzzzzz'))
        self.assertEqual(len(language_converters['name'].codes), 7874)

    def test_converter_scope(self):
        self.assertEqual(language_converters['scope'].codes, set(['I', 'S', 'M']))
        self.assertEqual(Language('eng').scope, 'individual')
        self.assertEqual(Language('und').scope, 'special')

    def test_converter_type(self):
        self.assertEqual(language_converters['type'].codes, set(['A', 'C', 'E', 'H', 'L', 'S']))
        self.assertEqual(Language('eng').type, 'living')
        self.assertEqual(Language('und').type, 'special')

    def test_converter_opensubtitles(self):
        self.assertEqual(Language('fra').opensubtitles, Language('fra').alpha3b)
        self.assertEqual(Language('por', 'BR').opensubtitles, 'pob')
        self.assertEqual(Language.fromopensubtitles('fre'), Language('fra'))
        self.assertEqual(Language.fromopensubtitles('pob'), Language('por', 'BR'))
        self.assertEqual(Language.fromopensubtitles('pb'), Language('por', 'BR'))
        # Montenegrin is not recognized as an ISO language (yet?) but for now it is
        # unofficially accepted as Serbian from Montenegro
        self.assertEqual(Language.fromopensubtitles('mne'), Language('srp', 'ME'))
        self.assertEqual(Language.fromcode('pob', 'opensubtitles'), Language('por', 'BR'))
        self.assertRaises(LanguageReverseError, lambda: Language.fromopensubtitles('zzz'))
        self.assertRaises(LanguageConvertError, lambda: Language('aaa').opensubtitles)
        self.assertEqual(len(language_converters['opensubtitles'].codes), 606)

        # test with all the LANGUAGES from the opensubtitles api
        # downloaded from: http://www.opensubtitles.org/addons/export_languages.php
        f = resource_stream('babelfish', 'data/opensubtitles_languages.txt')
        # Close the stream even when one of the assertions below fails.
        try:
            f.readline()
            for l in f:
                idlang, alpha2, _, upload_enabled, web_enabled = l.decode('utf-8').strip().split('\t')
                if not int(upload_enabled) and not int(web_enabled):
                    # do not test LANGUAGES that are too esoteric / not widely available
                    continue
                self.assertEqual(Language.fromopensubtitles(idlang).opensubtitles, idlang)
                if alpha2:
                    self.assertEqual(Language.fromopensubtitles(idlang), Language.fromopensubtitles(alpha2))
        finally:
            f.close()

    def test_fromietf_country_script(self):
        language = Language.fromietf('fra-FR-Latn')
        self.assertEqual(language.alpha3, 'fra')
        self.assertEqual(language.country, Country('FR'))
        self.assertEqual(language.script, Script('Latn'))

    def test_fromietf_country_no_script(self):
        language = Language.fromietf('fra-FR')
        self.assertEqual(language.alpha3, 'fra')
        self.assertEqual(language.country, Country('FR'))
        self.assertIsNone(language.script)

    def test_fromietf_no_country_no_script(self):
        # A bare language subtag: neither country nor script may be set.
        # (This test previously parsed 'fra-FR' and so duplicated
        # test_fromietf_country_no_script instead of covering this case.)
        language = Language.fromietf('fra')
        self.assertEqual(language.alpha3, 'fra')
        self.assertIsNone(language.country)
        self.assertIsNone(language.script)

    def test_fromietf_no_country_script(self):
        language = Language.fromietf('fra-Latn')
        self.assertEqual(language.alpha3, 'fra')
        self.assertIsNone(language.country)
        self.assertEqual(language.script, Script('Latn'))

    def test_fromietf_alpha2_language(self):
        language = Language.fromietf('fr-Latn')
        self.assertEqual(language.alpha3, 'fra')
        self.assertIsNone(language.country)
        self.assertEqual(language.script, Script('Latn'))

    def test_fromietf_wrong_language(self):
        self.assertRaises(ValueError, lambda: Language.fromietf('xyz-FR'))

    def test_fromietf_wrong_country(self):
        self.assertRaises(ValueError, lambda: Language.fromietf('fra-YZ'))

    def test_fromietf_wrong_script(self):
        self.assertRaises(ValueError, lambda: Language.fromietf('fra-FR-Wxyz'))

    def test_eq(self):
        self.assertEqual(Language('eng'), Language('eng'))

    def test_ne(self):
        self.assertNotEqual(Language('fra'), Language('eng'))
        self.assertIsNotNone(Language('fra'))

    def test_nonzero(self):
        self.assertFalse(bool(Language('und')))
        self.assertTrue(bool(Language('eng')))

    def test_language_hasattr(self):
        self.assertTrue(hasattr(Language('fra'), 'alpha3'))
        self.assertTrue(hasattr(Language('fra'), 'alpha2'))
        self.assertFalse(hasattr(Language('bej'), 'alpha2'))

    def test_country(self):
        self.assertEqual(Language('por', 'BR').country, Country('BR'))
        self.assertEqual(Language('eng', Country('US')).country, Country('US'))

    def test_eq_with_country(self):
        self.assertEqual(Language('eng', 'US'), Language('eng', Country('US')))

    def test_ne_with_country(self):
        self.assertNotEqual(Language('eng', 'US'), Language('eng', Country('GB')))

    def test_script(self):
        self.assertEqual(Language('srp', script='Latn').script, Script('Latn'))
        self.assertEqual(Language('srp', script=Script('Cyrl')).script, Script('Cyrl'))

    def test_eq_with_script(self):
        self.assertEqual(Language('srp', script='Latn'), Language('srp', script=Script('Latn')))

    def test_ne_with_script(self):
        self.assertNotEqual(Language('srp', script='Latn'), Language('srp', script=Script('Cyrl')))

    def test_eq_with_country_and_script(self):
        self.assertEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Latn')))

    def test_ne_with_country_and_script(self):
        self.assertNotEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Cyrl')))

    def test_hash(self):
        # A Language hashes like its string form.
        self.assertEqual(hash(Language('fra')), hash('fr'))
        self.assertEqual(hash(Language('ace')), hash('ace'))
        self.assertEqual(hash(Language('por', 'BR')), hash('pt-BR'))
        self.assertEqual(hash(Language('srp', script='Cyrl')), hash('sr-Cyrl'))
        self.assertEqual(hash(Language('eng', 'US', 'Latn')), hash('en-US-Latn'))

    def test_pickle(self):
        for lang in [Language('fra'),
                     Language('eng', 'US'),
                     Language('srp', script='Latn'),
                     Language('eng', 'US', 'Latn')]:
            self.assertEqual(pickle.loads(pickle.dumps(lang)), lang)

    def test_str(self):
        # str() output must round-trip through fromietf().
        self.assertEqual(Language.fromietf(str(Language('eng', 'US', 'Latn'))), Language('eng', 'US', 'Latn'))
        self.assertEqual(Language.fromietf(str(Language('fra', 'FR'))), Language('fra', 'FR'))
        self.assertEqual(Language.fromietf(str(Language('bel'))), Language('bel'))

    def test_register_converter(self):
        class TestConverter(LanguageReverseConverter):
            def __init__(self):
                self.to_test = {'fra': 'test1', 'eng': 'test2'}
                self.from_test = {'test1': 'fra', 'test2': 'eng'}

            def convert(self, alpha3, country=None, script=None):
                if alpha3 not in self.to_test:
                    raise LanguageConvertError(alpha3, country, script)
                return self.to_test[alpha3]

            def reverse(self, test):
                if test not in self.from_test:
                    raise LanguageReverseError(test)
                return (self.from_test[test], None)
        language = Language('fra')
        self.assertFalse(hasattr(language, 'test'))
        language_converters['test'] = TestConverter()
        self.assertTrue(hasattr(language, 'test'))
        self.assertIn('test', language_converters)
        self.assertEqual(Language('fra').test, 'test1')
        self.assertEqual(Language.fromtest('test2').alpha3, 'eng')
        del language_converters['test']
        self.assertNotIn('test', language_converters)
        self.assertRaises(KeyError, lambda: Language.fromtest('test1'))
        self.assertRaises(AttributeError, lambda: Language('fra').test)
def suite():
    """Collect the tests of TestScript, TestCountry and TestLanguage into one suite."""
    collected = TestSuite()
    for case_class in (TestScript, TestCountry, TestLanguage):
        loaded = TestLoader().loadTestsFromTestCase(case_class)
        collected.addTest(loaded)
    return collected
# Run the whole suite with the plain text runner when executed as a script.
if __name__ == '__main__':
    runner = TextTestRunner()
    runner.run(suite())
@@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people who have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen
@@ -0,0 +1,26 @@
Beautiful Soup is made available under the MIT license:
Copyright (c) 2004-2012 Leonard Richardson
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE, DAMMIT.
Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license.
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,63 @@
= Introduction =
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
<body>
<p>
Some
<b>
bad
<i>
HTML
</i>
</b>
</p>
</body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8"?>
<tag1>
Some
<tag2 />
bad
<tag3>
XML
</tag3>
</tag1>
= Full documentation =
The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.
= Running the unit tests =
Beautiful Soup supports unit test discovery from the project root directory:
$ nosetests
$ python -m unittest discover -s bs4 # Python 2.7 and up
If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.
= Links =
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
@@ -0,0 +1,406 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
import warnings
from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): presumably this relies on the u'' prefix being a syntax
# error on Python 3.0-3.2, so that the interpreter echoes this line (and
# with it the message) in its SyntaxError report -- confirm. The name
# itself is not read anywhere else in this module.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        Deprecated keyword arguments (convertEntities, markupMassage,
        smartQuotesTo, selfClosingTags, isHTML) are accepted with a
        warning and otherwise ignored.  parseOnlyThese and fromEncoding
        are accepted as old names for parse_only and from_encoding.

        Raises TypeError for any other unexpected keyword argument and
        FeatureNotFound when no registered builder offers the requested
        features.
        """

        if 'convertEntities' in kwargs:
            # Remove the argument like every other deprecated one below;
            # otherwise the "unexpected keyword argument" check further
            # down turns this warning into a TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed keyword argument, warning about the rename.
            # Returns its value, or None when it was not passed.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and b' ' not in markup)
                        or (isinstance(markup, unicode) and u' ' not in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        # Try each candidate produced by the builder until one of them
        # parses without the parser rejecting it.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the builder over self.markup and close whatever is left open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)

        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as an empty root tag and clear parse state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Nothing else in this class initializes this attribute.  Clearing
        # it here keeps a retry after ParserRejectedMarkup from linking the
        # first new node to an element of the discarded attempt.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag and return the new current tag."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append tag to the current tag's contents and make it the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into one string node.

        Does nothing when no text has been collected.  The node is
        dropped when a parse_only strainer is set, we are at the top
        level, and the strainer does not accept the text.
        """
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = True
                for i in current_data:
                    if i not in self.ASCII_SPACES:
                        strippable = False
                        break
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Index 0 is the root object, which is never considered.
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending text, then pop the stack through the named tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a chunk of text; endData() joins the chunks later."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
# Both names refer to the BeautifulSoup class itself. Neither is listed
# in __all__, so 'from bs4 import *' does not export them.
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        # Any 'features' value supplied by the caller is overridden.
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs['features'] = 'xml'
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
    """Plain Exception subclass; it is not raised anywhere in this module.

    NOTE(review): presumably meant for tree builders to abort a parse --
    confirm against its users.
    """
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder provides the requested features."""
# Run as a script: read a document from standard input and print it
# back pretty-printed.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print (soup.prettify())
@@ -0,0 +1,321 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
# These are plain strings. TreeBuilderRegistry.register() indexes builders
# by the entries of their ``features`` list, and lookup() matches the
# requested feature names against that index.
# NOTE(review): presumably builder classes put these constants in their
# ``features`` lists -- confirm against the concrete builders.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
    """Index of tree builder classes, searchable by advertised feature.

    More recently registered builders take precedence over older ones.
    """

    def __init__(self):
        # feature name -> builder classes advertising it, newest first
        self.builders_for_feature = defaultdict(list)
        # every registered builder class, newest first
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for advertised in treebuilder_class.features:
            self.builders_for_feature[advertised].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the preferred builder class offering the given features.

        Returns None when nothing is registered, when none of the
        requested features is known, or when no builder offers all of
        the known ones.  Feature names that no builder advertises are
        ignored.  With no features at all, the most recently registered
        builder is returned.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # The builders for the first recognized feature fix the order of
        # preference; every later recognized feature narrows the set.
        ordered = None
        surviving = None
        for wanted in features:
            matching = self.builders_for_feature.get(wanted, [])
            if not matching:
                continue
            if ordered is None:
                ordered = matching
                surviving = set(matching)
            else:
                surviving = surviving.intersection(set(matching))

        if surviving is None:
            return None

        # Pick the first builder, in preference order, that survived.
        for builder in ordered:
            if builder in surviving:
                return builder
        return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree.

    This base class supplies neutral defaults; ``feed`` must be provided
    by a subclass.
    """

    features = []

    is_xml = False
    preserve_whitespace_tags = set()

    # None means "no opinion": a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Hook with no behavior in the base class."""
        pass

    def can_be_empty_element(self, tag_name):
        """Say whether a tag with this name may be an empty-element tag.

        When ``empty_element_tags`` is None the builder has no opinion and
        every tag qualifies (so "<foo></foo>" may be rendered "<foo />").
        Otherwise only names listed in ``empty_element_tags`` qualify.
        The final markup may or may not actually present the tag as
        self-closing.
        """
        known_empty = self.empty_element_tags
        return True if known_empty is None else tag_name in known_empty

    def feed(self, markup):
        """Parse ``markup``; not implemented in the base class."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return ``(markup, None, None, False)`` without touching markup."""
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers add different scaffolding around a fragment;
        this hook lets tests compare parser output against plain
        fragments.  The base implementation returns the fragment as is.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Base implementation: substitute nothing and return False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and also returns it.
        """
        if not attrs:
            return attrs
        list_valued = self.cdata_list_attributes
        if list_valued:
            for_any_tag = list_valued.get('*', [])
            for_this_tag = list_valued.get(tag_name.lower(), None)
            for attr_name in attrs.keys():
                applies = attr_name in for_any_tag or (
                    for_this_tag and attr_name in for_this_tag)
                if not applies:
                    continue
                # A "class"-type attribute: a string value is a
                # whitespace-separated list and gets split.  A non-string
                # value is stored back untouched -- html5lib sometimes
                # calls setAttributes twice for the same tag, and on the
                # second call the value is already a list.
                current = attrs[attr_name]
                if isinstance(current, basestring):
                    current = whitespace_re.split(current)
                attrs[attr_name] = current
        return attrs
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Element and character events are forwarded to ``self.soup``;
    namespace prefixes and document start/end events are ignored.
    """

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # Each attribute is re-keyed by element [1] of its original key.
        # NOTE(review): this presumes (namespace, localname) tuple keys.
        plain_attrs = {}
        for key, value in list(attrs.items()):
            plain_attrs[key[1]] = value
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # The (ns, nodeName) tuple is thrown away for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # The (ns, nodeName) tuple is thrown away for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Prefix mappings are ignored for now.
        pass

    def endPrefixMapping(self, prefix):
        # Prefix mappings are ignored for now.
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # (The "td" entry used to be listed twice; the duplicate key was
    # redundant and has been dropped.)
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Swap a <meta> tag's encoding attribute for a stand-in object.

        Only <meta> tags are considered.  An HTML 5-style ``charset``
        attribute is replaced by a ``CharsetMetaAttributeValue``; an
        HTML 4-style ``content`` attribute (when ``http-equiv`` is
        "content-type", case-insensitively) is replaced by a
        ``ContentMetaAttributeValue``.

        Returns True only when a ``charset`` attribute was found; the
        HTML 4 branch performs its substitution but leaves the return
        value False, as in the original implementation.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # In both cases below we replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a ``TreeBuilder``
    subclass is set as an attribute of the ``bs4.builder`` module,
    appended to that module's ``__all__`` and registered with its
    ``builder_registry``.
    """
    # I'm fairly sure this is not the best way to do this.
    target = sys.modules['bs4.builder']
    for exported_name in module.__all__:
        candidate = getattr(module, exported_name)
        if not issubclass(candidate, TreeBuilder):
            continue
        setattr(target, exported_name, candidate)
        target.__all__.append(exported_name)
        # Register the builder while we're at it.
        target.builder_registry.register(candidate)
class ParserRejectedMarkup(Exception):
    """Raised by a tree builder when its parser cannot handle the markup.

    The lxml builders in this file raise it, carrying the stringified
    original error, when parsing fails with UnicodeDecodeError,
    LookupError or etree.ParserError.
    """
    pass
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
@@ -0,0 +1,285 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield the single parsing strategy ``(markup, None, None, False)``.

        The signature now matches ``TreeBuilder.prepare_markup`` and the
        other builders, so a caller may pass ``document_declared_encoding``
        uniformly; that argument is accepted and ignored.  The
        user-specified encoding is remembered for ``feed``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse ``markup`` with html5lib into ``self.soup``.

        Warns when ``self.soup.parse_only`` is set, because this builder
        always parses the whole document.  Afterwards
        ``original_encoding`` is set on the parsed document: None for
        Unicode input, otherwise the encoding the tokenizer reports.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory handed to html5lib; keeps a reference to the builder."""
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder that writes into a Beautiful Soup object."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and wrap it as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from the token and record it as parsed."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag through the soup and wrap it in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap a Comment in a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace ``self.soup`` with a fresh, empty fragment container.

        ``BeautifulSoup`` is imported here, at call time: this module has
        no module-level import of it, so the bare name used previously
        raised NameError.  Importing inside the method avoids a circular
        import at module load.
        """
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        """Return the underlying soup element of html5lib's fragment."""
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
    """Dict-like view over an element's attributes.

    Reads are served from a snapshot of ``element.attrs`` taken at
    construction time; writes are forwarded to the element itself
    (``element[name] = value``) and do not update the snapshot.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        # A leftover no-op debug expression ("set attr", name, value)
        # used to sit here; it has been removed.
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        # Direct dict membership instead of building a list of keys.
        return name in self.attrs
class Element(html5lib.treebuilders._base.Node):
    """html5lib node wrapper around a Beautiful Soup element.

    ``element`` is the wrapped Beautiful Soup object, ``soup`` the soup
    it belongs to and ``namespace`` the namespace recorded for it.
    """

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append ``node`` (a string, a Tag or a wrapper) to this element.

        A string appended directly after an existing NavigableString is
        merged into it rather than added as a separate child.
        """
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, basestring) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an AttrList snapshot of the element's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy ``attributes`` onto the element.

        Tuple keys are converted to NamespacedAttribute objects (the
        passed mapping is modified in place), list-valued attributes are
        split by the soup's builder, and the builder's
        ``set_up_substitutions`` is re-run on the element.  An empty or
        None mapping is a no-op.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy: the mapping is mutated in the loop.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert text at the end, or before the ``insertBefore`` node."""
        if insertBefore:
            # insertBefore() dereferences ``node.element``, so it must be
            # given the TextNode wrapper.  Passing the raw string (as was
            # done before) failed on that attribute access.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert ``node`` before ``refNode``, merging adjacent strings."""
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name and attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the element's contents list (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
class TextNode(Element):
    """Element wrapper for a non-tag node such as a string or comment.

    The underlying html5lib node is initialised with a name of None, and
    no namespace is recorded.
    """

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
@@ -0,0 +1,258 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to ``self.soup``.

    ``soup`` is not set here; the tree builder assigns it right after
    constructing the parser.
    """

    def handle_starttag(self, name, attrs):
        """Forward a start tag, with attributes collected into a dict."""
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into its character.

        ``name`` is the reference without "&#" and ";".  A leading "x" or
        "X" marks a hexadecimal value.  Code points that ``unichr``
        rejects become U+FFFD REPLACEMENT CHARACTER.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Emit the character for a named entity.

        Unknown entities are emitted verbatim as "&name;".
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        # Flush pending text, then emit ``data`` as a Comment.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Emit a declaration as a Doctype, minus a leading "DOCTYPE "."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Emit a CDATA section as CData, anything else as Declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Emit a processing instruction."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's HTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Remember the arguments used to construct each parser.

        When the HTMLParser constructor accepts ``strict``, it is forced
        to False.
        """
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield one strategy for parsing ``markup``.

        :yield: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).

        Unicode input is passed through unchanged; anything else is
        converted with UnicodeDammit, trying the user-specified and
        document-declared encodings first.
        """
        if isinstance(markup, unicode):
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        On HTMLParseError a RuntimeWarning is emitted and the error is
        re-raised.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback; ``raise e`` would
            # restart it at this line on Python 2.
            raise
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
#
# The guard below is true only for Python 3.2.0-3.2.2 (CONSTRUCTOR_TAKES_STRICT
# is False for those releases).  Inside it, replacement regexes and two
# replacement methods are installed on BeautifulSoupHTMLParser, and
# CONSTRUCTOR_TAKES_STRICT is finally flipped to True.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this attribute is set on the tree builder class, whereas
    # parse_starttag() below reads the module-level ``attrfind_tolerant``
    # name directly -- confirm whether the assignment is needed.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # Replacement for HTMLParser.parse_starttag.  Parses the start tag
        # beginning at offset ``i`` of ``self.rawdata`` and returns the end
        # offset, or the negative value from check_for_whole_start_tag()
        # when the tag is not complete yet.
        # NOTE(review): this function is not defined inside a class body, so
        # ``self.__starttag_text`` is not name-mangled here -- confirm this
        # matches the attribute the rest of HTMLParser reads.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without a value, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk follows the attributes.  lineno/offset are computed but
            # not used afterwards in this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # The whole malformed tag is handed over as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        # Replacement for HTMLParser.set_cdata_mode: remember the lowercased
        # element name and set ``interesting`` to a case-insensitive regex
        # matching that element's closing tag.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patched methods installed, builders created from now on
    # pass strict=False to the parser constructor.
    CONSTRUCTOR_TAKES_STRICT = True
@@ -0,0 +1,233 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
ParserRejectedMarkup,
TreeBuilder,
XML)
from bs4.dammit import EncodingDetector
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that uses lxml's XML parser.

    The builder acts as the lxml parser *target*: lxml calls ``start``,
    ``end``, ``data``, ``comment``, ``doctype``, ``pi`` and ``close`` on
    it while parsing.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    def default_parser(self, encoding):
        """Return the parser to use: an object, or a class to instantiate.

        The parser passed to the constructor wins; otherwise a recovering
        ``etree.XMLParser`` targeting this builder is created.
        """
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a ready-to-use parser object for ``encoding``."""
        # Use the default parser.
        parser = self.default_parser(encoding)

        # ``callable`` replaces the ``collections.Callable`` check, an
        # alias that newer Python versions no longer provide.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Store an optional custom parser and empty-element tag set."""
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split "{namespace}name" into ``(namespace, name)``.

        A tag without a namespace yields ``(None, tag)``.
        """
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, unicode):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(markup, try_encodings, is_html)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed ``markup`` to the parser in CHUNK_SIZE pieces.

        Raises ParserRejectedMarkup when parsing fails with
        UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, unicode):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e))

    def close(self):
        """Reset the namespace stack to its default state."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap=None):
        """Handle a start tag reported by lxml.

        ``nsmap`` holds the namespace mappings newly declared on this
        tag (prefix -> namespace URL).  It defaults to None instead of a
        shared mutable ``{}`` and is treated as empty in that case.
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            #
            # The ``len(nsmap) == 0`` test matters: without it a tag that
            # declares a new namespace while another is already active
            # fell into this branch and its mapping was lost.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle an end tag reported by lxml."""
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        # Same innermost-first lookup that start() uses.
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        # Processing instructions are ignored.
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Record a doctype declaration."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML flavour of the lxml tree builder.

    Uses ``etree.HTMLParser`` and feeds the whole document in one call.
    """

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        # The class itself is returned; parser_for() instantiates it.
        return etree.HTMLParser

    def feed(self, markup):
        """Parse ``markup`` in a single call to the parser's feed().

        Raises ParserRejectedMarkup when parsing fails with
        UnicodeDecodeError, LookupError or etree.ParserError.
        """
        source_encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(source_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as failure:
            raise ParserRejectedMarkup(str(failure))

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
@@ -0,0 +1,829 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import logging
import string
# Import a library to autodetect character encodings.
#
# Exactly one definition of chardet_dammit(s) survives this block.  It
# returns the 'encoding' entry of the detector's result for ``s``, or None
# when no detection library can be imported.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            return None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        # Build the character <-> entity-name tables, plus a regex that
        # matches every character that has a named HTML entity.
        char_to_name = {}
        name_to_char = {}
        escapable = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            # But we do want to turn &quot; into the quotation mark.
            name_to_char[name] = character
            if codepoint == 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value,
                # which is handled elsewhere.
                continue
            escapable.append(character)
            char_to_name[character] = name
        pattern = "[%s]" % "".join(escapable)
        return char_to_name, name_to_char, re.compile(pattern)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not start a numeric,
    # hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: replace a matched character with its named
        HTML entity."""
        matched = matchobj.group(0)
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matched)

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: replace a matched XML special character with
        the appropriate XML entity."""
        matched = matchobj.group(0)
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matched]

    @classmethod
    def quoted_attribute_value(self, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Double quotes only: single quotes can delimit the value.
            return "'" + value + "'"
        # Both kinds of quote are present. Escape the double quotes,
        # because "&quot;" is the entity name in both HTML and XML,
        # whereas the single quote has no name common to both.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        replacer = cls._substitute_html_entity
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(replacer, s)
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """
        :param markup: The document to examine.
        :param override_encodings: Encodings to suggest before any others.
        :param is_html: If True, also look for a <meta> charset
         declaration when searching for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` is not None and has not been tried.

        The lowercased name is recorded in the `tried` set as a side
        effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :return: A 2-tuple (data without the BOM, encoding name or None).
        """
        encoding = None
        # The UTF-16 BOMs are prefixes of byte sequences that could also
        # begin a UTF-32 document, so the two bytes after the BOM are
        # checked too. They are compared against a *bytes* literal: a
        # text literal never equals a byte slice on Python 3, which made
        # every UTF-32LE document look like UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param search_entire_document: If False (the default), only the
         first 1024 bytes are searched for an XML declaration, and only
         the first 2048 bytes or 5% of the document (whichever is
         larger) for a <meta> tag.
        :return: The lowercased encoding name, or None if none was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii')
        if declared_encoding:
            return declared_encoding.lower()
        return None
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Try to convert `markup` to Unicode.

        The result is stored in `unicode_markup` (None if every
        candidate encoding failed) and the encoding that worked in
        `original_encoding`.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encodings to try before any others.
        :param smart_quotes_to: If not None, bytes 0x80-0x9f are replaced
         before decoding with an encoding listed in
         ENCODINGS_WITH_SMART_QUOTES: 'ascii' gives ASCII approximations,
         'xml' gives numeric character references, any other value gives
         named HTML entities.
        :param is_html: Passed to the EncodingDetector.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        # EncodingDetector treats None the same as an empty list.
        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character."""
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hex code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        On success self.markup becomes the decoded string and
        self.original_encoding the codec name.

        :return: The decoded markup, or None if the codec is unknown,
         this (codec, errors) pair was already tried, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this candidate
            # encoding is wrong and the caller should try the next one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document, or
        None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lowercased codec name.

        Tries the CHARSET_ALIASES entry, then the name with hyphens
        removed, then with hyphens turned into underscores; if Python
        recognises none of them, falls back to the lowercased name.
        Returns None for an empty or None charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A false-y charset is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hex code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code here produced the invalid "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á  (was b'\xa1', which is not valid UTF-8)
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: for any other combination of
         encodings.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
@@ -0,0 +1,204 @@
"""Diagnostic functions, mainly for use when doing tech support."""
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
import os
import pstats
import random
import tempfile
import time
import traceback
import sys
import cProfile
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints version information, then parses `data` with every available
    parser and prints either the prettified result or the traceback.

    :param data: Markup to parse, an object with a read() method, or
     the name of an existing file. A string starting with "http:" or
     "https:" only gets an explanatory message.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy: removing an entry from the list being looped
    # over would make the loop skip the entry that follows it.
    for name in list(basic_parsers):
        installed = any(
            name in builder.features
            for builder in builder_registry.builders)
        if not installed:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file as soon as it has been read.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            # Best effort: report the failure and move on to the next
            # parser.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    One line is printed per event: the event name, the element's tag
    and the element's text. `html` and any extra keyword arguments are
    passed through to etree.iterparse().
    """
    from lxml import etree
    stream = StringIO(data)
    for event, element in etree.iterparse(stream, html=html, **kwargs):
        fields = (event, element.tag, element.text)
        print("%s, %4s, %s" % fields)
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single output point for every announcement.
        print(s)

    def _announce(self, template, value):
        # Fill the event template with the event's payload and emit it.
        self._p(template % value)

    def handle_starttag(self, name, attrs):
        self._announce("%s START", name)

    def handle_endtag(self, name):
        self._announce("%s END", name)

    def handle_data(self, data):
        self._announce("%s DATA", data)

    def handle_charref(self, name):
        self._announce("%s CHARREF", name)

    def handle_entityref(self, name):
        self._announce("%s ENTITYREF", name)

    def handle_comment(self, data):
        self._announce("%s COMMENT", data)

    def handle_decl(self, data):
        self._announce("%s DECL", data)

    def unknown_decl(self, data):
        self._announce("%s UNKNOWN-DECL", data)

    def handle_pi(self, data):
        self._announce("%s PI", data)
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    `data` is fed to a fresh AnnouncingParser, which prints each event.
    """
    AnnouncingParser().feed(data)
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    "Generate a random word-like string."
    # Even positions get a consonant, odd positions a vowel.
    letters = []
    for position in range(length):
        pool = _vowels if position % 2 else _consonants
        letters.append(random.choice(pool))
    return ''.join(letters)
def rsentence(length=4):
    "Generate a random sentence-like string."
    # `length` random words of 4 to 9 letters, separated by spaces.
    words = []
    for _ in range(length):
        word_length = random.randint(4,9)
        words.append(rword(word_length))
    return " ".join(words)
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0,3)
        if roll == 1:
            # A run of text.
            pieces.append(rsentence(random.randint(1,4)))
        elif roll in (0, 2):
            # 0 opens a randomly chosen tag, 2 closes one.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
        # A roll of 3 adds nothing.
    return "<html>" + "\n".join(pieces) + "</html>"
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    # Time Beautiful Soup on top of each parser.
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    # Time lxml on its own.
    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished-started))

    # Time html5lib on its own.
    import html5lib
    parser = html5lib.HTMLParser()
    started = time.time()
    parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished-started))
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Prints up to 50 profile entries matching '_html5lib|bs4', sorted by
    cumulative time.

    :param num_elements: Size argument passed to rdoc().
    :param parser: Parser name passed to the BeautifulSoup constructor.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # The context manager closes the temporary stats file when done;
    # the handle used to be left open.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
# When run as a script, diagnose whatever markup arrives on standard input.
if __name__ == '__main__':
    diagnose(sys.stdin.read())
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,592 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
    """Base test case with helpers for building and comparing soups."""

    @property
    def default_builder(self):
        # A fresh instance of the module-level default builder class.
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        # A caller-supplied builder wins over the default one.
        chosen_builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=chosen_builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        builder = self.default_builder
        return builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing `to_parse` gives the document for
        `compare_parsed_to` (or for `to_parse` itself if that is None)."""
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        expected_fragment = to_parse
        if compare_parsed_to is not None:
            expected_fragment = compare_parsed_to
        self.assertEqual(parsed.decode(), self.document_for(expected_fragment))
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_can_parse_unicode_document(self):
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
    """Parsers should be able to work with SoupStrainers.

    Parsing with a strainer for <b> must keep only the <b> tag.
    """
    only_bold = SoupStrainer("b")
    markup = "A <b>bold</b> <meta/> <i>statement</i>"
    soup = self.soup(markup, parse_only=only_bold)
    self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-9 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
    """The charset in a Content-type <meta> tag follows the output encoding.

    For the rest of the story, see TestSubstitutions in test_tree.py.
    """
    # A <meta> tag declaring the document to be Shift-JIS ...
    meta_tag = ('<meta content="text/html; charset=x-sjis" '
                'http-equiv="Content-type"/>')
    # ... placed in the head of a small document.
    shift_jis_html = (
        '<html><head>\n%s\n'
        '<meta http-equiv="Content-language" content="ja"/>'
        '</head><body>Shift-JIS markup goes here.') % meta_tag
    soup = self.soup(shift_jis_html)

    # After parsing, the attribute still reads as it was written.
    content_type_meta = soup.find('meta', {'http-equiv': 'Content-type'})
    content = content_type_meta['content']
    self.assertEqual('text/html; charset=x-sjis', content)

    # It is not a plain string, though, but a
    # ContentMetaAttributeValue.
    self.assertTrue(isinstance(content, ContentMetaAttributeValue))

    # Encoding it substitutes the target encoding for the charset.
    self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
def test_html5_style_meta_tag_reflects_current_encoding(self):
    """The charset of an HTML5-style <meta> tag follows the output encoding."""
    # A <meta charset> tag declaring the document to be Shift-JIS ...
    meta_tag = ('<meta id="encoding" charset="x-sjis" />')
    # ... placed in the head of a small document.
    shift_jis_html = (
        '<html><head>\n%s\n'
        '<meta http-equiv="Content-language" content="ja"/>'
        '</head><body>Shift-JIS markup goes here.') % meta_tag
    soup = self.soup(shift_jis_html)

    # After parsing, the attribute still reads as it was written.
    encoding_meta = soup.find('meta', id="encoding")
    charset = encoding_meta['charset']
    self.assertEqual('x-sjis', charset)

    # It is not a plain string, though, but a
    # CharsetMetaAttributeValue.
    self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

    # Encoding it yields the name of the target encoding.
    self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for a tree builder that parses XML.

    The methods call ``self.soup`` and the ``assert*`` helpers, none
    of which this class defines itself.
    """

    def test_docstring_generated(self):
        """Encoding a document prepends a utf-8 XML declaration."""
        encoded = self.soup("<root/>").encode()
        self.assertEqual(
            encoded, b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """Text assigned to a <script> tag is entity-escaped on output."""
        doc = """
<script type="text/javascript">
</script>
"""
        soup = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in soup.encode())

    def test_can_parse_unicode_document(self):
        """A Unicode document declaring euc-jp is still parsed correctly."""
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        root_text = self.soup(markup).root.string
        self.assertEqual(u'Sacr\xe9 bleu!', root_text)

    def test_popping_namespaced_tag(self):
        """Namespaced tags followed by a plain tag round-trip unchanged."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        rss = self.soup(markup).rss
        self.assertEqual(unicode(rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The XML declaration names the encoding passed to encode()."""
        encoded = self.soup("<root/>").encode("latin1")
        self.assertEqual(
            encoded,
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        filler = b'0' * (2 ** 12)
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + filler
                  + b'</root>')
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """An empty tag is output self-closed; one with content is not."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """Namespace declarations stay readable as root attributes."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root = self.soup(markup).root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced child tag is closed correctly on output."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        paragraph = self.soup(markup).p
        self.assertEqual(unicode(paragraph), markup)

    def test_namespaced_attributes(self):
        """An attribute with a declared namespace prefix round-trips."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        foo = self.soup(markup).foo
        self.assertEqual(unicode(foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute using the ``xml:`` prefix round-trips."""
        markup = '<foo xml:lang="fr">bar</foo>'
        foo = self.soup(markup).foo
        self.assertEqual(unicode(foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to
        # handle XHTML documents in any particular way, so the
        # inherited test is replaced with a no-op.
        pass

    def test_html_tags_have_namespace(self):
        """A plain HTML tag is placed in the XHTML namespace."""
        anchor = self.soup("<a>").a
        self.assertEqual("http://www.w3.org/1999/xhtml", anchor.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and the tags inside it are placed in the SVG namespace."""
        soup = self.soup('<svg><circle/></svg>')
        svg_namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(svg_namespace, soup.svg.namespace)
        self.assertEqual(svg_namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and the tags inside it are placed in the MathML namespace."""
        soup = self.soup('<math><msqrt>5</msqrt></math>')
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(mathml_namespace, soup.math.namespace)
        self.assertEqual(mathml_namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """A leading XML declaration is kept as a Comment before <html>."""
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        first_node = self.soup(markup).contents[0]
        self.assertTrue(isinstance(first_node, Comment))
        self.assertEqual(first_node, '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", first_node.next_element.name)
def skipIf(condition, reason):
    """Return a decorator that disables an item when ``condition`` is true.

    With a true condition the decorated item is replaced by a function
    that accepts any arguments and returns None. Otherwise the item is
    returned untouched. ``reason`` is accepted but not used.
    """
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator
@@ -0,0 +1 @@
"The beautifulsoup tests."
@@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
    """Test the built-in registry with the default builders registered."""

    def test_combination(self):
        """Looking up a pair of features finds the matching builder."""
        # The lxml-backed lookups only apply when lxml is importable.
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('fast', 'html'),
                             LXMLTreeBuilder)
            self.assertEqual(registry.lookup('permissive', 'xml'),
                             LXMLTreeBuilderForXML)

        self.assertEqual(registry.lookup('strict', 'html'),
                         HTMLParserTreeBuilder)

        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib', 'html'),
                             HTML5TreeBuilder)

    def test_lookup_by_markup_type(self):
        """Looking up a markup type depends on which parsers are installed."""
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
            return

        # Without lxml no XML builder is found, and the HTML builder
        # is html5lib's if present, else the html.parser one.
        self.assertEqual(registry.lookup('xml'), None)
        if HTML5LIB_PRESENT:
            expected_html_builder = HTML5TreeBuilder
        else:
            expected_html_builder = HTMLParserTreeBuilder
        self.assertEqual(registry.lookup('html'), expected_html_builder)

    def test_named_library(self):
        """A builder can be looked up by the name of its library."""
        if LXML_PRESENT:
            self.assertEqual(registry.lookup('lxml', 'xml'),
                             LXMLTreeBuilderForXML)
            self.assertEqual(registry.lookup('lxml', 'html'),
                             LXMLTreeBuilder)

        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html5lib'),
                             HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
        """The BeautifulSoup constructor resolves ``features`` itself."""
        # A single feature string is accepted ...
        BeautifulSoup("", features="html")
        # ... and so is a list of them.
        BeautifulSoup("", features=["html", "fast"])

        # A feature that no builder provides raises ValueError.
        self.assertRaises(ValueError, BeautifulSoup,
                          "", features="no-such-feature")
class RegistryTest(unittest.TestCase):
    """Test the TreeBuilderRegistry class in general."""

    def setUp(self):
        # Every test works on its own fresh registry.
        self.registry = TreeBuilderRegistry()

    def builder_for_features(self, *feature_list):
        """Create, register and return a class advertising ``feature_list``."""
        class_name = 'Builder_' + '_'.join(feature_list)
        cls = type(class_name, (object,), {'features': feature_list})
        self.registry.register(cls)
        return cls

    def test_register_with_no_features(self):
        """A builder without features is only found by a bare lookup."""
        featureless = self.builder_for_features()

        # It advertises no features, so a lookup by feature misses it.
        self.assertEqual(self.registry.lookup('foo'), None)

        # A lookup with no features finds it, as it happens to be the
        # only registered builder.
        self.assertEqual(self.registry.lookup(), featureless)

    def test_register_with_features_makes_lookup_succeed(self):
        """A builder is found through each feature it advertises."""
        builder = self.builder_for_features('foo', 'bar')
        for feature in ('foo', 'bar'):
            self.assertEqual(self.registry.lookup(feature), builder)

    def test_lookup_fails_when_no_builder_implements_feature(self):
        """Looking up a feature nobody advertises returns None."""
        self.builder_for_features('foo', 'bar')
        self.assertEqual(self.registry.lookup('baz'), None)

    def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
        """A bare lookup returns the builder registered last."""
        self.builder_for_features('foo')
        newest = self.builder_for_features('bar')
        self.assertEqual(self.registry.lookup(), newest)

    def test_lookup_fails_when_no_tree_builders_registered(self):
        """A bare lookup on a registry with no builders returns None."""
        self.assertEqual(self.registry.lookup(), None)

    def test_lookup_gets_most_recent_builder_supporting_all_features(self):
        """Among builders with every requested feature, the latest wins."""
        # Registration order matters, so it is kept exactly:
        # foo; bar; foo+bar+baz; foo+bar+quux; bar; foo.
        for features in (('foo',), ('bar',)):
            self.builder_for_features(*features)
        has_both_early = self.builder_for_features('foo', 'bar', 'baz')
        has_both_late = self.builder_for_features('foo', 'bar', 'quux')
        for features in (('bar',), ('foo',)):
            self.builder_for_features(*features)

        # There are two builders featuring 'foo' and 'bar', but
        # the one that also features 'quux' was registered later.
        self.assertEqual(self.registry.lookup('foo', 'bar'),
                         has_both_late)

        # There is only one builder featuring 'foo', 'bar', and 'baz'.
        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
                         has_both_early)

    def test_lookup_fails_when_cannot_reconcile_requested_features(self):
        """Features split across two builders cannot be combined."""
        self.builder_for_features('foo', 'bar')
        self.builder_for_features('foo', 'baz')
        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
@@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
# Nothing is exported: additional_tests() exists only as commented-out
# code below. Listing a name that is not defined makes
# "from <module> import *" raise AttributeError. Add 'additional_tests'
# back here if that function is ever restored.
__all__ = []
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
# doctest option flags combined into one value: "..." wildcards in
# expected output, whitespace-insensitive comparison, and ndiff-style
# failure reports.
DOCTEST_FLAGS = (
    doctest.REPORT_NDIFF
    | doctest.NORMALIZE_WHITESPACE
    | doctest.ELLIPSIS
)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))
@@ -0,0 +1,85 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
# HTML5TreeBuilder can only be imported when html5lib is installed.
# Record whether it is available so the tests can be skipped otherwise.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # The exception object is not needed, so it is not bound. This
    # form is valid on both Python 2 and Python 3.
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """A new HTML5TreeBuilder instance."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        """parse_only is ignored with a warning and the whole tree is built."""
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            # Make sure the warning is recorded even if it has
            # already been issued once from the same location.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # With every warning recorded, the expected one need not be
        # first, so search all of them instead of indexing w[0].
        expected = "the html5lib tree builder doesn't support parse_only"
        self.assertTrue(
            any(expected in str(warning.message) for warning in w))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        # A table that already has explicit sections is left unchanged.
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype leaves the tree connected."""
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        """An <em> left open across two <p> tags is reparented into each."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as test_reparented_markup, with a trailing newline kept."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))
@@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the HTML smoke tests against the html.parser tree builder."""

    @property
    def default_builder(self):
        """A new HTMLParserTreeBuilder instance."""
        return HTMLParserTreeBuilder()

    # html.parser can't handle namespaced doctypes, so the two tests
    # below are defined here as no-ops.

    def test_namespaced_system_doctype(self):
        pass

    def test_namespaced_public_doctype(self):
        pass

Some files were not shown because too many files have changed in this diff Show More