reordering structure, removed windows scripts

This commit is contained in:
s0i37
2024-02-29 17:41:08 +05:00
parent 2451ba27b7
commit 39a6739754
114 changed files with 961 additions and 13222 deletions
+37
View File
@@ -0,0 +1,37 @@
# Image bundling every extractor crawl.sh shells out to, plus the
# OpenSearch + Node.js stack used by the search GUI.
FROM debian
WORKDIR /opt/crawl

# Extraction toolchain: base utilities, document/mail/image/audio converters,
# vosk for speech-to-text and a static radare2 build (only rabin2 is linked
# into PATH).
RUN apt update && \
    apt install -y --no-install-recommends sudo tmux iproute2 nano less iputils-ping locales && \
    apt install -y --no-install-recommends wget curl file sqlite3 cifs-utils python3 python3-pip xz-utils && \
    apt install -y --no-install-recommends lynx uchardet catdoc unzip python3-pdfminer p7zip-full && \
    apt install -y --no-install-recommends maildir-utils mpack libemail-outlook-message-perl libemail-sender-perl binwalk && \
    apt install -y --no-install-recommends graphicsmagick-imagemagick-compat tesseract-ocr tesseract-ocr-eng tesseract-ocr-rus ffmpeg && \
    pip3 install --break-system-packages vosk && \
    wget https://github.com/radareorg/radare2/releases/download/5.8.8/radare2-5.8.8-static.tar.xz -O /tmp/radare2.tar.xz && \
    tar xvf /tmp/radare2.tar.xz -C /opt/ && \
    rm /tmp/radare2.tar.xz && \
    ln -s /opt/r2-static/usr/bin/rabin2 /usr/local/bin/rabin2

# Project files.
COPY bin bin
COPY cron cron
COPY www www
COPY spider.sh .
COPY crawl.sh .
COPY import.sh .
COPY search.sh .
COPY opensearch.py .

# Search GUI dependencies. The package index is refreshed again here because
# this layer is cached independently of the first RUN: when the COPY layers
# above change and this one is rebuilt on top of an old cached first layer,
# a stale index makes `apt install` fail with 404s.
# NOTE(review): the README refers to /opt/opensearch/..., while this tarball
# presumably unpacks into a versioned directory - confirm the path.
RUN apt update && \
    apt install -y --no-install-recommends nodejs npm openjdk-17-jre && \
    pip3 install --break-system-packages opensearch-py colorama && \
    cd www/ && npm install && cd - && \
    wget https://artifacts.opensearch.org/releases/bundle/opensearch/2.11.0/opensearch-2.11.0-linux-x64.tar.gz -O /tmp/opensearch.tar.gz && \
    tar xvf /tmp/opensearch.tar.gz -C /opt/ && \
    rm /tmp/opensearch.tar.gz

# Russian UTF-8 locale, an unprivileged "user" account owning /opt, and
# passwordless sudo for it (cron/smb.sh calls `sudo mount.cifs`).
# NOTE(review): `echo 241` answers the dpkg-reconfigure menu by index; the
# index presumably selects ru_RU.UTF-8 and may differ between Debian
# releases - confirm.
# chown uses "user:users": the "user.users" form is deprecated.
RUN echo 'LANG="ru_RU.UTF-8"' > /etc/default/locale && \
    localedef -i ru_RU -f UTF-8 ru_RU.UTF-8 && \
    locale-gen && \
    echo 241 | dpkg-reconfigure locales && \
    echo "LANG=ru_RU.UTF-8" > /etc/default/locale && \
    useradd -s /bin/bash -g users -N -M -d /opt/crawl user && \
    chown -R user:users /opt/ && \
    chmod +w /etc/sudoers && echo 'user ALL=(root) NOPASSWD: ALL' >> /etc/sudoers && chmod -w /etc/sudoers

# Port the README publishes with `docker run -p 8080:8080`.
EXPOSE 8080
+127
View File
@@ -0,0 +1,127 @@
## Crawling
Each crawler walks through a source and extracts only the useful data: text. File types are detected from the content (MIME type), not from the file extension. Easily customizable.
Supported file types: `text`, `html`, `doc`/`docx`, `xls`/`xlsx`, `pdf`, `archives`, `exe`/`bin`, `eml`/`msg`, `images`, `sounds`.
You can easily add your own file types (GNU power)
![crawl.sh](img/crawl.png)
![search.sh](img/search.png)
## Installation
### System
Depends:
* lynx, uchardet - html
* catdoc - doc
* xls2csv - xls
* unzip - docx,xlsx
* pdf2txt - pdf
* rabin2 - exe,dll
* 7z - archives
* identify, tesseract - images
* vosk-transcriber - audio
* msgconvert, munpack, mu - emails
* binwalk - disk images
```
sudo apt install sqlite3 cifs-utils
sudo apt install file uchardet cifs-utils lynx catdoc unzip python3-pdfminer radare2 p7zip-full
sudo apt install maildir-utils mpack libemail-outlook-message-perl libemail-sender-perl binwalk
sudo apt install graphicsmagick-imagemagick-compat tesseract-ocr tesseract-ocr-eng tesseract-ocr-rus ffmpeg
sudo pip3 install vosk
```
### Docker
```
sudo docker build -t crawl .
sudo docker run --privileged --cap-add SYS_ADMIN --cap-add DAC_READ_SEARCH --cap-add NET_BIND_SERVICE --cap-add CAP_SYSLOG -u 1000 -p 8080:8080 --name crawl -it crawl /bin/bash
```
### SMB crawling
Mount the network share locally and crawl it:
```
mount.cifs "//10.10.10.10/Docs" /mnt/Docs -o ro,dom=corp.net,user=username,pass=password
./crawl.sh /mnt/Docs -size -10M
```
It will create `Docs.csv` index file.
### Web crawling
Depends:
* wget with controllable download limit (https://yurichev.com/wget.html)
Mirror the site content locally and crawl it:
```
./spider.sh --limit-size=500k http://target.com/
./crawl.sh target.com/
```
It will create `target.com.csv` index file.
### FTP crawling
Mirror the FTP content locally and crawl it:
```
./spider.sh --limit-size=500k ftp://target.com/
./crawl.sh target.com/
```
It will create `target.com.csv` index file.
## Searching
After crawling, the extracted text is stored in `csv` files.
Data can be searched using simple `grep`:
`grep -ia -o -P ".{0,100}password..{0,100}" *.csv | grep -ai --color=auto "password"`
Or use fuzzy search, which also finds words written with errors (misspellings, typos):
`tre-agrep -i -E 2 passw *.csv`
### Searching CLI (pentesters)
Data can be converted into a `sqlite3` database with full-text search support:
`./import.sh INBOX.csv`
Searching for data in the database is now more convenient:
```
./search.sh INBOX.db 's3cr3t'
./search.sh INBOX.db 'password' -c 10 -o 20
./search.sh INBOX.db 'password' -m 'admin'
```
### Searching GUI (enterprise)
Depends:
```
sudo apt install nodejs npm openjdk-17-jre
cd www && npm install
wget https://artifacts.opensearch.org/releases/bundle/opensearch/2.11.0/opensearch-2.11.0-linux-x64.tar.gz -O /tmp/opensearch.tar.gz && tar xvf /tmp/opensearch.tar.gz -C /opt/
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/opensearch-tar-install.sh
```
Searching for data using opensearch:
```
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/bin/opensearch
./opensearch.py localhost:9200 -i test -init
./opensearch.py localhost:9200 -i test -import INBOX.csv
cd www && node index.js
chrome http://localhost:8080/test/
```
Continuous crawling (your own Google for the local network): just use a few simple cron scripts, see [cron/README.md](cron/README.md).
Executable
BIN
View File
Binary file not shown.
+74 -45
View File
@@ -7,7 +7,9 @@ RESET=$'\x1b[39m'
[[ $# -lt 1 ]] && {
echo "$0 where/ [/usr/bin/find options]"
echo "example: $0 /mnt/share/ -size -10M ! -iname '*.wav' ! -iname '*.mp3'"
echo "example: $0 /mnt/share/ -size -10M -not -iname '*.wav' -not -iname '*.mp3'"
echo "example: $0 /mnt/share/ -not -ipath '*/Program Files*/*' -not -ipath '*/Windows/*'"
echo "example: $0 /mnt/share/ -newermt '2012-12-21 00:00'"
exit
}
@@ -64,10 +66,10 @@ find "$where" "${opts[@]}" -type f -print 2> /dev/null |
while read path
do
[[ $is_resume = 1 && $(session_is_file_done $path) = 1 ]] && {
echo "(skip $path)"
echo $GREY"$path"$RESET
continue
}
printf "\n" >> "$index"
[[ -s "$index" ]] && printf "\n" >> "$index"
echo -n "$(date +%s)," >> "$index"
echo -n "$path"
echo -n "$path" | escape >> "$index"
@@ -78,115 +80,142 @@ do
[[ $filename = $ext ]] && ext=''
echo -n "$ext" | escape >> "$index"
echo -n "," >> "$index"
mime=$(file -bi "$path")
mime=${mime%' '*}
mime=$(file -b --mime-type "$path")
case $mime in
*/xml\;)
echo -n "xml," >> "$index"
cat "$path" | escape >> "$index"
echo $GREEN " [xml]" $RESET
;;
*/*html*)
*/*html*|application/javascript)
echo -n "html," >> "$index"
codepage=$(uchardet "$path")
cat "$path" | iconv -f $codepage | lynx -nolist -dump -stdin | escape >> "$index"
echo $GREEN " [html]" $RESET
;;
text/*|*/*script\;)
text/*|*/*script|*/xml|*/json|*-ini)
echo -n "text," >> "$index"
cat "$path" | escape >> "$index"
codepage=$(uchardet "$path")
cat "$path" | iconv -f $codepage | escape >> "$index"
echo $GREEN " [text]" $RESET
;;
application/msword\;)
application/msword)
echo -n "doc," >> "$index"
catdoc "$path" | escape >> "$index"
echo $GREEN " [doc]" $RESET
;;
application/vnd.openxmlformats-officedocument.wordprocessingml.document\;)
application/vnd.openxmlformats-officedocument.wordprocessingml.document)
echo -n "doc," >> "$index"
unzip -p "$path" | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G | escape >> "$index"
unzip -p "$path" 2> /dev/null | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G | escape >> "$index"
echo $GREEN " [docx]" $RESET
if unzip -l "$path" | grep -q 'word/media/'; then
temp=$(tempfile 2>/dev/null)
rm $temp && mkdir -p "$temp/$path"
unzip "$path" 'word/media/*' -d "$temp/$path" > /dev/null
fork "$temp"
rm -r "$temp"
#session_file_done $path
fi
;;
application/vnd.ms-excel\;)
application/vnd.ms-excel)
echo -n "xls," >> "$index"
xls2csv -x "$path" | escape >> "$index"
echo $GREEN " [xls]" $RESET
;;
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\;)
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet)
echo -n "xlsx," >> "$index"
unzip -p "$path" | grep -a -e '<si><t>' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | escape >> "$index"
#libreoffice --convert-to csv "$path" out.csv
unzip -p "$path" 2> /dev/null | grep -a -e '<si><t' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | escape >> "$index"
echo $GREEN " [xlsx]" $RESET
;;
application/pdf\;)
application/pdf)
echo -n "pdf," >> "$index"
pdf2txt -t text "$path" 2> /dev/null | escape >> "$index"
echo $GREEN " [pdf]" $RESET
;;
application/x-executable\;|application/x*dos*)
application/x-executable|application/*microsoft*-executable|application/x*dos*)
echo -n "exe," >> "$index"
rabin2 -z "$path" 2> /dev/null | escape >> "$index"
echo $GREEN " [exe]" $RESET
;;
application/x-object\;|application/x-sharedlib|application/x-executable\;)
application/x-object|application/x-sharedlib|application/x-executable)
echo -n "elf," >> "$index"
rabin2 -z "$path" 2> /dev/null | escape >> "$index"
echo $GREEN " [elf]" $RESET
;;
application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*)
application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*|application/*-msi|*/java-archive)
echo -n "zip," >> "$index"
7z l "$path" | tail -n +13 | escape >> "$index"
7z l -p '' "$path" 2> /dev/null | tail -n +13 | escape >> "$index"
echo $GREEN " [archive]" $RESET
temp=$(tempfile)
temp=$(tempfile 2>/dev/null)
rm $temp && mkdir -p "$temp/$path"
7z x "$path" -o"$temp/$path" 1> /dev/null 2> /dev/null
7z x -p '' "$path" -o"$temp/$path" 1> /dev/null 2> /dev/null
fork "$temp"
rm -r "$temp"
session_file_done $path
#break
#session_file_done $path
;;
image/*)
echo -n "image," >> "$index"
identify -verbose "$path" 2> /dev/null | escape >> "$index"
#tesseract "$path" stdout -l eng >> "$index"
#tesseract "$path" stdout -l rus >> "$index"
#identify -verbose "$path" 2> /dev/null | escape >> "$index"
tesseract "$path" stdout -l eng 2> /dev/null | escape >> "$index"
tesseract "$path" stdout -l rus 2> /dev/null | escape >> "$index"
#curl -X POST --form "photo=@$path" http://10.250.153.11/ | escape >> "$index"
echo $GREEN " [img]" $RESET
;;
audio/*)
echo -n "audio," >> "$index"
vosk-transcriber --lang en-us --input "$path" 2> /dev/null | escape >> "$index"
echo $GREEN " [snd]" $RESET
;;
application/vnd.ms-outlook)
echo -n "message," >> "$index"
temp=$(tempfile 2>/dev/null)
rm $temp && mkdir -p "$temp/$path"
msgconvert --outfile "$temp/$path/out.eml" "$path" 2> /dev/null
mu view "$temp/$path/out.eml" 2> /dev/null | escape >> "$index"
echo $GREEN " [message]" $RESET
munpack -t -f -C "$(realpath $temp/$path)" 'out.eml' > /dev/null
rm "$temp/$path/out.eml"
fork "$temp"
rm -r "$temp"
#session_file_done $path
;;
message/*)
echo -n "message," >> "$index"
mu view "$path" | escape >> "$index"
mu view "$path" 2> /dev/null | escape >> "$index"
echo $GREEN " [message]" $RESET
temp=$(tempfile)
temp=$(tempfile 2>/dev/null)
rm $temp && mkdir -p "$temp/$path"
cp "$path" "$temp/$path/"
munpack -t -f -C "$(realpath $temp/$path)" "$(basename $path)" > /dev/null
rm "$temp/$path/$(basename $path)"
fork "$temp"
rm -r "$temp"
session_file_done $path
#break
#session_file_done $path
;;
application/octet-stream\;)
echo -n "raw," >> "$index"
#strings "$path" | escape >> "$index"
echo -n "," >> "$index"
echo $GREEN " [raw]" $RESET
*.tcpdump.pcap)
echo -n "pcap," >> "$index"
tcpdump -r "$path" -nn -A | escape >> "$index"
echo $GREEN " [pcap]" $RESET
;;
application/x-raw-disk-image\;)
application/x-raw-disk-image)
echo -n "disk," >> "$index"
binwalk "$path" | escape >> "$index"
echo $GREEN " [disk]" $RESET
;;
application/octet-stream)
echo -n "raw," >> "$index"
#strings "$path" | escape >> "$index"
echo -n "" >> "$index"
echo $GREEN " [raw]" $RESET
;;
*)
echo -n "unknown," >> "$index"
file "$path" | grep text > /dev/null &&
{
echo -n "text," >> "$index"
cat "$path" | escape >> "$index"
echo $GREY " [unknown]" $RESET
echo $GREEN " [text]" $RESET
} || {
echo -n "unknown," >> "$index"
#strings "$path" >> "$index"
echo -n "," >> "$index"
echo -n "" >> "$index"
echo $RED " [unknown]" $RESET
echo "$path $mime" >> unknown_mime.log
echo $RED " [error]" $RESET
}
;;
esac
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 150 KiB

BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
+15
View File
@@ -0,0 +1,15 @@
## Continuous crawling
```
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/bin/opensearch
cd /opt/crawl/www && node index.js
```
`/opt/crawl/opensearch.py localhost:9200 -i $INDEX -init`
```
crontab -e
30 11 * * * tmux new-session -d "/opt/crawl/cron/targets.sh ; timeout 3600 /opt/crawl/cron/scan.sh ; tmux new-window -d 'timeout $((3600*8)) /opt/crawl/cron/www.sh' & tmux new-window -d 'timeout $((3600*8)) /opt/crawl/cron/ftp.sh' & tmux new-window -d 'timeout $((3600*8)) /opt/crawl/cron/smb.sh'"
0 23 * * * tmux new-session -d '/opt/crawl/cron/import.sh'
0 0 * * 1 /opt/crawl/cron/clean.sh
```
Executable
+9
View File
@@ -0,0 +1,9 @@
#!/bin/bash
# Cleanup step of the crawling cycle: removes the host lists, the CSV
# indexes, the crawl journal and the hidden *.sess files from the current
# directory so the next cycle starts from scratch.
#
# -f is deliberate: several of these files are routinely absent (for example
# www-hosts.txt and ftp-hosts.txt are only produced when the matching scan
# lines in scan.sh are enabled), and an unmatched glob is passed to rm as a
# literal name. Without -f every such case prints an error on each run.
# "--" keeps file names that start with a dash from being parsed as options.
rm -f -- smb-hosts.txt www-hosts.txt ftp-hosts.txt
rm -f -- *.csv
rm -f -- crawl.log
rm -f -- .*.sess
Executable
+7
View File
@@ -0,0 +1,7 @@
#!/bin/bash
# Mirror and index every FTP server listed in ftp-hosts.txt
# (one address per line).

# crawl_ftp_host ADDRESS
# Prints the address, downloads the server content with spider.sh, then
# indexes the downloaded tree with crawl.sh. Each step is capped at 300 s.
crawl_ftp_host() {
  local address=$1
  echo "$address"
  timeout 300 /opt/crawl/spider.sh "ftp://$address/"
  timeout 300 /opt/crawl/crawl.sh "$address"
}

cat ftp-hosts.txt | while read address
do
  crawl_ftp_host "$address"
done
Executable
+8
View File
@@ -0,0 +1,8 @@
#!/bin/bash
# Import every CSV index found in the current directory into the OpenSearch
# index named by $INDEX (server: localhost:9200).
INDEX="company"

# Without nullglob a directory with no CSV files makes the loop run once with
# the literal string "*.csv", which would then be handed to opensearch.py.
shopt -s nullglob

for csv in *.csv
do
  # Quoted so names containing spaces or glob characters are printed and
  # imported exactly as they are.
  echo "$csv"
  /opt/crawl/opensearch.py localhost:9200 -i "$INDEX" -import "$csv"
done
Executable
+13
View File
@@ -0,0 +1,13 @@
#!/bin/bash
# Port-scan every network listed in nets.txt and append the hosts that have
# an open SMB port to smb-hosts.txt. The WWW and FTP scans are kept below as
# commented-out lines.
#PORTS_WWW="80,443,8080,8443,8000,8088,8880,8808,8888,6443,7443,9443,10443,8081"
PORTS_WWW="80,8080"
PORTS_FTP='21'
PORTS_SMB='445'

# open_hosts
# Reads nmap grepable output (-oG -) on stdin and prints the address of every
# host line that reports an open port.
# The filter is '/open/' (the port-state field, e.g. "445/open/tcp") rather
# than the bare word 'open': nmap's banner line echoes its own command line,
# which contains "--open", so a plain 'open' match let the banner through and
# wrote the bogus host "Nmap" into the host list.
open_hosts() {
  grep '/open/' | tr '/' ' ' | awk '{print $2}'
}

for net in $(cat nets.txt)
do
  echo "$net"
  #nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_WWW" --open -oG - | grep '/open/' | tr '/' ' ' | awk '{print $2 " " $5}' >> www-hosts.txt
  #nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_FTP" --open -oG - | open_hosts >> ftp-hosts.txt
  nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_SMB" --open -oG - | open_hosts >> smb-hosts.txt
done
Executable
+23
View File
@@ -0,0 +1,23 @@
#!/bin/bash
# Crawl every readable SMB share of the hosts in smb-hosts.txt, one directory
# level per pass (depth 1..10).
# Progress is journaled in crawl.log:
#   "+ <depth> //<ip>/<share>"  share crawled at that depth
#   "- <depth> //<ip>/<share>"  mount failed
# A share/depth pair that already has a "+" entry is skipped on later runs.
#
# NOTE(review): the credentials are hard-coded and passed on command lines
# (visible in `ps`); consider a mount.cifs credentials file.
DOMAIN='company.org'
USER='iivanov'
PASS='password'

# parse_readable_shares
# Reduces `cme smb --shares` output on stdin to "ip<TAB>host<TAB>share" lines
# for shares whose permission column is READ.
# [^[:space:]] replaces the former [^\s]: inside a bracket expression "\s" is
# not a character class but the two literal characters '\' and 's', so any
# host whose name contains an 's' never matched and was silently dropped.
parse_readable_shares() {
  grep ' READ ' | sed -rn 's/SMB\s+([^[:space:]]+)\s+445\s+([^[:space:]]+)\s+(.*)\s+READ.+/\1\t\2\t\3/p'
}

#cme -t 1 smb --shares smb-hosts.txt | parse_readable_shares > shares-anon.txt
cme -t 1 smb -d "$DOMAIN" -u "$USER" -p "$PASS" --shares smb-hosts.txt | parse_readable_shares > shares-user.txt

# Fields in shares-user.txt are tab-separated; share names may contain spaces.
IFS=$'\t'
for depth in {1..10}
do
  cat shares-user.txt | grep -v 'IPC$' | while read -r ip name share
  do
    echo "$ip" "$share"
    # Skip shares already crawled at this depth.
    grep -F -q -- "+ $depth //$ip/$share" crawl.log 2> /dev/null && continue
    # -p: the mountpoint may be left over from an earlier failed attempt.
    mkdir -p "/mnt/$ip-$share"
    sudo timeout 5 mount.cifs "//$ip/$share" "/mnt/$ip-$share" -o ro,dom="$DOMAIN",user="$USER",pass="$PASS" || {
      echo "- $depth //$ip/$share" >> crawl.log
      rmdir "/mnt/$ip-$share" 2> /dev/null
      continue
    }
    timeout 300 /opt/crawl/crawl.sh "/mnt/$ip-$share" -mindepth "$depth" -maxdepth "$depth" -size -100k
    sudo umount "/mnt/$ip-$share"
    # rmdir instead of rm -r: if the unmount failed, the share content must
    # never be touched by a recursive delete.
    rmdir "/mnt/$ip-$share"
    # The journal entry must use the loop variable "$depth": the resume check
    # above looks for exactly this string. The unset "$DEPTH" used before
    # wrote "+  //ip/share", so finished shares were never recognised.
    echo "+ $depth //$ip/$share" >> crawl.log
  done
done
+14
View File
@@ -0,0 +1,14 @@
#!/bin/bash
# Build nets.txt: the /24 networks that contain domain computers, ordered
# from the most to the least populated.
#   1. read the first naming context from the domain controller over LDAP;
#   2. dump the dNSHostName of every computer object into hosts.txt;
#   3. resolve each name against $DNS and collapse the addresses to /24s.
#
# NOTE(review): the credentials are hard-coded and passed on the ldapsearch
# command line.
USER='iivanov'
PASS='password'
DOMAIN='company.org'
DC='192.168.12.6'
DNS=$DC

# The first "namingContexts:" value returned by the server is the search base.
namespace=$(curl -s "ldap://$DC" | grep 'namingContexts:' | head -n 1 | awk '{print $2}')
ldapsearch -o ldif-wrap=no -E pr=10000/noprompt -D "$USER@$DOMAIN" -w "$PASS" -x -H ldap://"$DC" -b "$namespace" '(objectClass=computer)' dnshostname | grep dNSHostName | awk '{print $2}' > hosts.txt

# The address pattern is anchored and ends right after the last octet. The
# previous pattern ended in an unescaped '.', which required one more
# character after the last octet's first digit and therefore dropped every
# address whose last octet is a single digit (e.g. 10.0.0.5).
cat hosts.txt | while read -r host
do host "$host" "$DNS" | grep 'has address' | awk '{print $4}'
done | sed -rn 's/^([0-9]+\.[0-9]+\.[0-9]+)\.[0-9]+$/\1.0\/24/p' | sort | uniq -c | sort -n -r | awk '{print $2}' > nets.txt
Executable
+7
View File
@@ -0,0 +1,7 @@
#!/bin/bash
# Mirror and index every web server listed in www-hosts.txt
# (one "address port" pair per line).

# crawl_web_host ADDRESS PORT
# Prints the pair, downloads the site with spider.sh, then indexes the
# downloaded tree with crawl.sh. Each step is capped at 300 s.
crawl_web_host() {
  local address=$1
  local tcp_port=$2
  echo "$address $tcp_port"
  timeout 300 /opt/crawl/spider.sh "http://$address:$tcp_port/"
  timeout 300 /opt/crawl/crawl.sh "$address:$tcp_port"
}

cat www-hosts.txt | while read address tcp_port
do
  crawl_web_host "$address" "$tcp_port"
done
View File
BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

View File
-25
View File
@@ -1,25 +0,0 @@
cd path/to/crawl/linux
### Local crawling
PATH=$PATH:bin ./crawl.sh /home/ -size -10M
PATH=$PATH:bin ./grep.sh 'pass' / -size -10M
./import.sh results.csv
./search.sh results.db 's3cr3t'
### Web crawling
./spider.sh http://target.com/
cd /tmp/spider/
./crawl.sh target.com/ -size -10M
### Mails crawling
./imap.sh imap://server.com user:pass
./crawl.sh INBOX
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
Binary file not shown.
-205
View File
@@ -1,205 +0,0 @@
#!/usr/bin/python3
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags."""
import argparse
import logging
import sys
sys.path = ['.'] + sys.path
import pdfminer.high_level
import pdfminer.layout
logging.basicConfig()
OUTPUT_TYPES = ((".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag"))
def float_or_disabled(x):
    """argparse ``type=`` converter used by the ``--boxes-flow`` option.

    :param x: raw command-line string.
    :return: ``x`` unchanged when it spells "disabled" (case and surrounding
        whitespace are ignored), otherwise ``x`` converted to ``float``.
    :raises argparse.ArgumentTypeError: if ``x`` is neither "disabled" nor a
        valid float literal.
    """
    if x.lower().strip() == "disabled":
        return x
    try:
        # The converted value has to be returned: previously it was only
        # assigned to a local, so the function fell through and argparse
        # stored None for every numeric value.
        return float(x)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
def extract_text(files=[], outfile='-',
no_laparams=False, all_texts=None, detect_vertical=None,
word_margin=None, char_margin=None, line_margin=None,
boxes_flow=None, output_type='text', codec='utf-8',
strip_control=False, maxpages=0, page_numbers=None,
password="", scale=1.0, rotation=0, layoutmode='normal',
output_dir=None, debug=False, disable_caching=False,
**kwargs):
if not files:
raise ValueError("Must provide files to work upon!")
# If any LAParams group arguments were passed,
# create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin",
"char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None
if output_type == "text" and outfile != "-":
for override, alttype in OUTPUT_TYPES:
if outfile.endswith(override):
output_type = alttype
if outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
codec = 'utf-8'
else:
outfp = open(outfile, "wb")
for fname in files:
with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp
def maketheparser():
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
parser.add_argument(
"files", type=str, default=None, nargs="+",
help="One or more paths to PDF files.")
parser.add_argument(
"--version", "-v", action="version",
version="pdfminer.six v{}".format(pdfminer.__version__))
parser.add_argument(
"--debug", "-d", default=False, action="store_true",
help="Use debug logging level.")
parser.add_argument(
"--disable-caching", "-C", default=False, action="store_true",
help="If caching or resources, such as fonts, should be disabled.")
parse_params = parser.add_argument_group(
'Parser', description='Used during PDF parsing')
parse_params.add_argument(
"--page-numbers", type=int, default=None, nargs="+",
help="A space-seperated list of page numbers to parse.")
parse_params.add_argument(
"--pagenos", "-p", type=str,
help="A comma-separated list of page numbers to parse. "
"Included for legacy applications, use --page-numbers "
"for more idiomatic argument entry.")
parse_params.add_argument(
"--maxpages", "-m", type=int, default=0,
help="The maximum number of pages to parse.")
parse_params.add_argument(
"--password", "-P", type=str, default="",
help="The password to use for decrypting PDF file.")
parse_params.add_argument(
"--rotation", "-R", default=0, type=int,
help="The number of degrees to rotate the PDF "
"before other types of processing.")
la_params = parser.add_argument_group(
'Layout analysis', description='Used during layout analysis.')
la_params.add_argument(
"--no-laparams", "-n", default=False, action="store_true",
help="If layout analysis parameters should be ignored.")
la_params.add_argument(
"--detect-vertical", "-V", default=False, action="store_true",
help="If vertical text should be considered during layout analysis")
la_params.add_argument(
"--char-margin", "-M", type=float, default=2.0,
help="If two characters are closer together than this margin they "
"are considered to be part of the same line. The margin is "
"specified relative to the width of the character.")
la_params.add_argument(
"--word-margin", "-W", type=float, default=0.1,
help="If two characters on the same line are further apart than this "
"margin then they are considered to be two separate words, and "
"an intermediate space will be added for readability. The margin "
"is specified relative to the width of the character.")
la_params.add_argument(
"--line-margin", "-L", type=float, default=0.5,
help="If two lines are are close together they are considered to "
"be part of the same paragraph. The margin is specified "
"relative to the height of a line.")
la_params.add_argument(
"--boxes-flow", "-F", type=float_or_disabled, default=0.5,
help="Specifies how much a horizontal and vertical position of a "
"text matters when determining the order of lines. The value "
"should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters). You can also "
"pass `disabled` to disable advanced layout analysis, and "
"instead return text based on the position of the bottom left "
"corner of the text box.")
la_params.add_argument(
"--all-texts", "-A", default=False, action="store_true",
help="If layout analysis should be performed on text in figures.")
output_params = parser.add_argument_group(
'Output', description='Used during output generation.')
output_params.add_argument(
"--outfile", "-o", type=str, default="-",
help="Path to file where output is written. "
"Or \"-\" (default) to write to stdout.")
output_params.add_argument(
"--output_type", "-t", type=str, default="text",
help="Type of output to generate {text,html,xml,tag}.")
output_params.add_argument(
"--codec", "-c", type=str, default="utf-8",
help="Text encoding to use in output file.")
output_params.add_argument(
"--output-dir", "-O", default=None,
help="The output directory to put extracted images in. If not given, "
"images are not extracted.")
output_params.add_argument(
"--layoutmode", "-Y", default="normal",
type=str, help="Type of layout to use when generating html "
"{normal,exact,loose}. If normal,each line is"
" positioned separately in the html. If exact"
", each character is positioned separately in"
" the html. If loose, same result as normal "
"but with an additional newline after each "
"text line. Only used when output_type is html.")
output_params.add_argument(
"--scale", "-s", type=float, default=1.0,
help="The amount of zoom to use when generating html file. "
"Only used when output_type is html.")
output_params.add_argument(
"--strip-control", "-S", default=False, action="store_true",
help="Remove control statement from text. "
"Only used when output_type is xml.")
return parser
# main
def main(args=None):
P = maketheparser()
A = P.parse_args(args=args)
if A.page_numbers:
A.page_numbers = {x-1 for x in A.page_numbers}
if A.pagenos:
A.page_numbers = {int(x)-1 for x in A.pagenos.split(",")}
if A.output_type == "text" and A.outfile != "-":
for override, alttype in OUTPUT_TYPES:
if A.outfile.endswith(override):
A.output_type = alttype
outfp = extract_text(**vars(A))
outfp.close()
return 0
if __name__ == '__main__':
sys.exit(main())
-12
View File
@@ -1,12 +0,0 @@
import sys
import warnings
__version__ = '20201018'
if sys.version_info < (3, 6):
warnings.warn('Python 3.4 and 3.5 are deprecated. '
'Please upgrade to Python 3.6 or newer.')
if __name__ == '__main__':
print(__version__)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
-35
View File
@@ -1,35 +0,0 @@
""" Python implementation of Arcfour encryption algorithm.
See https://en.wikipedia.org/wiki/RC4
This code is in the public domain.
"""
class Arcfour:
    """Arcfour (RC4) stream cipher.

    The permutation and both indices are kept on the instance, so successive
    ``process`` calls continue the same keystream.
    """

    def __init__(self, key):
        """Run the key schedule.

        :param key: non-empty bytes-like object (indexing must yield ints).
        """
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """XOR ``data`` with the next ``len(data)`` keystream bytes.

        :param data: bytes-like object to encrypt or decrypt.
        :return: the transformed data as ``bytes``.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Accumulate in a bytearray: appending to an immutable bytes object
        # copies the whole result on every byte (quadratic in len(data)).
        r = bytearray()
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            r.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(r)

    # RC4 is symmetric: encryption and decryption are the same operation.
    encrypt = decrypt = process
-71
View File
@@ -1,71 +0,0 @@
""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
This code is in the public domain.
"""
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
    """Decode ASCII85 data (Adobe variant).

    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    :param data: bytes-like object holding the encoded text. Characters
        outside ``!``..``u``, ``z`` and ``~`` are ignored; decoding stops at
        the first ``~``.
    :return: the decoded ``bytes``.
    """
    n = b = 0
    # A bytearray avoids re-copying the whole output on every append, and the
    # loop compares the integer byte values directly instead of building a
    # one-byte bytes object per input character.
    out = bytearray()
    for c in data:
        if ord('!') <= c <= ord('u'):
            n += 1
            b = b*85+(c-33)
            if n == 5:
                out += struct.pack('>L', b)
                n = b = 0
        elif c == ord('z'):
            # 'z' abbreviates four zero bytes and is only valid between groups.
            assert n == 0, str(n)
            out += b'\0\0\0\0'
        elif c == ord('~'):
            if n:
                # Pad the partial group with the highest digit, keep n-1 bytes.
                for _ in range(5-n):
                    b = b*85+84
                out += struct.pack('>L', b)[:n-1]
            break
    return bytes(out)
# asciihexdecode(data)
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
ASCIIHexDecode filter produces one byte of binary data. All white-space
characters are ignored. A right angle bracket character (>) indicates
EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
"""
def decode(x):
i = int(x, 16)
return bytes((i,))
out = b''
for x in hex_re.findall(data):
out += decode(x)
m = trail_re.search(data)
if m:
out += decode(m.group(1)+b'0')
return out
-593
View File
@@ -1,593 +0,0 @@
# CCITT Fax decoder
#
# Bugs: uncompressed mode untested.
#
# cf.
# ITU-T Recommendation T.4
# "Standardization of Group 3 facsimile terminals
# for document transmission"
# ITU-T Recommendation T.6
# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
# FOR GROUP 4 FACSIMILE APPARATUS"
import sys
import array
def get_bytes(data):
yield from data
class BitParser:
def __init__(self):
self._pos = 0
return
@classmethod
def add(cls, root, v, bits):
p = root
b = None
for i in range(len(bits)):
if 0 < i:
if p[b] is None:
p[b] = [None, None]
p = p[b]
if bits[i] == '1':
b = 1
else:
b = 0
p[b] = v
return
def feedbytes(self, data):
for byte in get_bytes(data):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(byte & m)
return
def _parse_bit(self, x):
if x:
v = self._state[1]
else:
v = self._state[0]
self._pos += 1
if isinstance(v, list):
self._state = v
else:
self._state = self._accept(v)
return
class CCITTG4Parser(BitParser):
# MODE: code tree for the mode prefix, consumed by _parse_mode().
# Leaf values as handled there:
#   int        -> offset passed to _do_vertical()
#   'h'        -> horizontal mode (two run lengths follow)
#   'p'        -> pass mode (_do_pass)
#   'u'        -> switch to the UNCOMPRESSED table
#   'e'        -> raises EOFB
#   'x1'..'x7' -> not handled by _parse_mode(), which raises InvalidData
MODE = [None, None]
BitParser.add(MODE, 0, '1')
BitParser.add(MODE, +1, '011')
BitParser.add(MODE, -1, '010')
BitParser.add(MODE, 'h', '001')
BitParser.add(MODE, 'p', '0001')
BitParser.add(MODE, +2, '000011')
BitParser.add(MODE, -2, '000010')
BitParser.add(MODE, +3, '0000011')
BitParser.add(MODE, -3, '0000010')
BitParser.add(MODE, 'u', '0000001111')
BitParser.add(MODE, 'x1', '0000001000')
BitParser.add(MODE, 'x2', '0000001001')
BitParser.add(MODE, 'x3', '0000001010')
BitParser.add(MODE, 'x4', '0000001011')
BitParser.add(MODE, 'x5', '0000001100')
BitParser.add(MODE, 'x6', '0000001101')
BitParser.add(MODE, 'x7', '0000001110')
BitParser.add(MODE, 'e', '000000000001000000000001')
# WHITE: run-length code tree selected while self._color is truthy
# (see _parse_mode / _parse_horiz1 / _parse_horiz2).  Each leaf is a run
# length.  The horizontal-mode handlers keep reading codes from the table
# while the decoded value is >= 64 and finish the run on a value < 64.
WHITE = [None, None]
BitParser.add(WHITE, 0, '00110101')
BitParser.add(WHITE, 1, '000111')
BitParser.add(WHITE, 2, '0111')
BitParser.add(WHITE, 3, '1000')
BitParser.add(WHITE, 4, '1011')
BitParser.add(WHITE, 5, '1100')
BitParser.add(WHITE, 6, '1110')
BitParser.add(WHITE, 7, '1111')
BitParser.add(WHITE, 8, '10011')
BitParser.add(WHITE, 9, '10100')
BitParser.add(WHITE, 10, '00111')
BitParser.add(WHITE, 11, '01000')
BitParser.add(WHITE, 12, '001000')
BitParser.add(WHITE, 13, '000011')
BitParser.add(WHITE, 14, '110100')
BitParser.add(WHITE, 15, '110101')
BitParser.add(WHITE, 16, '101010')
BitParser.add(WHITE, 17, '101011')
BitParser.add(WHITE, 18, '0100111')
BitParser.add(WHITE, 19, '0001100')
BitParser.add(WHITE, 20, '0001000')
BitParser.add(WHITE, 21, '0010111')
BitParser.add(WHITE, 22, '0000011')
BitParser.add(WHITE, 23, '0000100')
BitParser.add(WHITE, 24, '0101000')
BitParser.add(WHITE, 25, '0101011')
BitParser.add(WHITE, 26, '0010011')
BitParser.add(WHITE, 27, '0100100')
BitParser.add(WHITE, 28, '0011000')
BitParser.add(WHITE, 29, '00000010')
BitParser.add(WHITE, 30, '00000011')
BitParser.add(WHITE, 31, '00011010')
BitParser.add(WHITE, 32, '00011011')
BitParser.add(WHITE, 33, '00010010')
BitParser.add(WHITE, 34, '00010011')
BitParser.add(WHITE, 35, '00010100')
BitParser.add(WHITE, 36, '00010101')
BitParser.add(WHITE, 37, '00010110')
BitParser.add(WHITE, 38, '00010111')
BitParser.add(WHITE, 39, '00101000')
BitParser.add(WHITE, 40, '00101001')
BitParser.add(WHITE, 41, '00101010')
BitParser.add(WHITE, 42, '00101011')
BitParser.add(WHITE, 43, '00101100')
BitParser.add(WHITE, 44, '00101101')
BitParser.add(WHITE, 45, '00000100')
BitParser.add(WHITE, 46, '00000101')
BitParser.add(WHITE, 47, '00001010')
BitParser.add(WHITE, 48, '00001011')
BitParser.add(WHITE, 49, '01010010')
BitParser.add(WHITE, 50, '01010011')
BitParser.add(WHITE, 51, '01010100')
BitParser.add(WHITE, 52, '01010101')
BitParser.add(WHITE, 53, '00100100')
BitParser.add(WHITE, 54, '00100101')
BitParser.add(WHITE, 55, '01011000')
BitParser.add(WHITE, 56, '01011001')
BitParser.add(WHITE, 57, '01011010')
BitParser.add(WHITE, 58, '01011011')
BitParser.add(WHITE, 59, '01001010')
BitParser.add(WHITE, 60, '01001011')
BitParser.add(WHITE, 61, '00110010')
BitParser.add(WHITE, 62, '00110011')
BitParser.add(WHITE, 63, '00110100')
BitParser.add(WHITE, 64, '11011')
BitParser.add(WHITE, 128, '10010')
BitParser.add(WHITE, 192, '010111')
BitParser.add(WHITE, 256, '0110111')
BitParser.add(WHITE, 320, '00110110')
BitParser.add(WHITE, 384, '00110111')
BitParser.add(WHITE, 448, '01100100')
BitParser.add(WHITE, 512, '01100101')
BitParser.add(WHITE, 576, '01101000')
BitParser.add(WHITE, 640, '01100111')
BitParser.add(WHITE, 704, '011001100')
BitParser.add(WHITE, 768, '011001101')
BitParser.add(WHITE, 832, '011010010')
BitParser.add(WHITE, 896, '011010011')
BitParser.add(WHITE, 960, '011010100')
BitParser.add(WHITE, 1024, '011010101')
BitParser.add(WHITE, 1088, '011010110')
BitParser.add(WHITE, 1152, '011010111')
BitParser.add(WHITE, 1216, '011011000')
BitParser.add(WHITE, 1280, '011011001')
BitParser.add(WHITE, 1344, '011011010')
BitParser.add(WHITE, 1408, '011011011')
BitParser.add(WHITE, 1472, '010011000')
BitParser.add(WHITE, 1536, '010011001')
BitParser.add(WHITE, 1600, '010011010')
BitParser.add(WHITE, 1664, '011000')
BitParser.add(WHITE, 1728, '010011011')
BitParser.add(WHITE, 1792, '00000001000')
BitParser.add(WHITE, 1856, '00000001100')
BitParser.add(WHITE, 1920, '00000001101')
BitParser.add(WHITE, 1984, '000000010010')
BitParser.add(WHITE, 2048, '000000010011')
BitParser.add(WHITE, 2112, '000000010100')
BitParser.add(WHITE, 2176, '000000010101')
BitParser.add(WHITE, 2240, '000000010110')
BitParser.add(WHITE, 2304, '000000010111')
BitParser.add(WHITE, 2368, '000000011100')
BitParser.add(WHITE, 2432, '000000011101')
BitParser.add(WHITE, 2496, '000000011110')
BitParser.add(WHITE, 2560, '000000011111')
# BLACK: run-length code tree selected while self._color is falsy
# (see _parse_mode / _parse_horiz1 / _parse_horiz2).  Each leaf is a run
# length.  The horizontal-mode handlers keep reading codes from the table
# while the decoded value is >= 64 and finish the run on a value < 64.
# The entries from 1792 upwards use the same bit strings as in WHITE.
BLACK = [None, None]
BitParser.add(BLACK, 0, '0000110111')
BitParser.add(BLACK, 1, '010')
BitParser.add(BLACK, 2, '11')
BitParser.add(BLACK, 3, '10')
BitParser.add(BLACK, 4, '011')
BitParser.add(BLACK, 5, '0011')
BitParser.add(BLACK, 6, '0010')
BitParser.add(BLACK, 7, '00011')
BitParser.add(BLACK, 8, '000101')
BitParser.add(BLACK, 9, '000100')
BitParser.add(BLACK, 10, '0000100')
BitParser.add(BLACK, 11, '0000101')
BitParser.add(BLACK, 12, '0000111')
BitParser.add(BLACK, 13, '00000100')
BitParser.add(BLACK, 14, '00000111')
BitParser.add(BLACK, 15, '000011000')
BitParser.add(BLACK, 16, '0000010111')
BitParser.add(BLACK, 17, '0000011000')
BitParser.add(BLACK, 18, '0000001000')
BitParser.add(BLACK, 19, '00001100111')
BitParser.add(BLACK, 20, '00001101000')
BitParser.add(BLACK, 21, '00001101100')
BitParser.add(BLACK, 22, '00000110111')
BitParser.add(BLACK, 23, '00000101000')
BitParser.add(BLACK, 24, '00000010111')
BitParser.add(BLACK, 25, '00000011000')
BitParser.add(BLACK, 26, '000011001010')
BitParser.add(BLACK, 27, '000011001011')
BitParser.add(BLACK, 28, '000011001100')
BitParser.add(BLACK, 29, '000011001101')
BitParser.add(BLACK, 30, '000001101000')
BitParser.add(BLACK, 31, '000001101001')
BitParser.add(BLACK, 32, '000001101010')
BitParser.add(BLACK, 33, '000001101011')
BitParser.add(BLACK, 34, '000011010010')
BitParser.add(BLACK, 35, '000011010011')
BitParser.add(BLACK, 36, '000011010100')
BitParser.add(BLACK, 37, '000011010101')
BitParser.add(BLACK, 38, '000011010110')
BitParser.add(BLACK, 39, '000011010111')
BitParser.add(BLACK, 40, '000001101100')
BitParser.add(BLACK, 41, '000001101101')
BitParser.add(BLACK, 42, '000011011010')
BitParser.add(BLACK, 43, '000011011011')
BitParser.add(BLACK, 44, '000001010100')
BitParser.add(BLACK, 45, '000001010101')
BitParser.add(BLACK, 46, '000001010110')
BitParser.add(BLACK, 47, '000001010111')
BitParser.add(BLACK, 48, '000001100100')
BitParser.add(BLACK, 49, '000001100101')
BitParser.add(BLACK, 50, '000001010010')
BitParser.add(BLACK, 51, '000001010011')
BitParser.add(BLACK, 52, '000000100100')
BitParser.add(BLACK, 53, '000000110111')
BitParser.add(BLACK, 54, '000000111000')
BitParser.add(BLACK, 55, '000000100111')
BitParser.add(BLACK, 56, '000000101000')
BitParser.add(BLACK, 57, '000001011000')
BitParser.add(BLACK, 58, '000001011001')
BitParser.add(BLACK, 59, '000000101011')
BitParser.add(BLACK, 60, '000000101100')
BitParser.add(BLACK, 61, '000001011010')
BitParser.add(BLACK, 62, '000001100110')
BitParser.add(BLACK, 63, '000001100111')
BitParser.add(BLACK, 64, '0000001111')
BitParser.add(BLACK, 128, '000011001000')
BitParser.add(BLACK, 192, '000011001001')
BitParser.add(BLACK, 256, '000001011011')
BitParser.add(BLACK, 320, '000000110011')
BitParser.add(BLACK, 384, '000000110100')
BitParser.add(BLACK, 448, '000000110101')
BitParser.add(BLACK, 512, '0000001101100')
BitParser.add(BLACK, 576, '0000001101101')
BitParser.add(BLACK, 640, '0000001001010')
BitParser.add(BLACK, 704, '0000001001011')
BitParser.add(BLACK, 768, '0000001001100')
BitParser.add(BLACK, 832, '0000001001101')
BitParser.add(BLACK, 896, '0000001110010')
BitParser.add(BLACK, 960, '0000001110011')
BitParser.add(BLACK, 1024, '0000001110100')
BitParser.add(BLACK, 1088, '0000001110101')
BitParser.add(BLACK, 1152, '0000001110110')
BitParser.add(BLACK, 1216, '0000001110111')
BitParser.add(BLACK, 1280, '0000001010010')
BitParser.add(BLACK, 1344, '0000001010011')
BitParser.add(BLACK, 1408, '0000001010100')
BitParser.add(BLACK, 1472, '0000001010101')
BitParser.add(BLACK, 1536, '0000001011010')
BitParser.add(BLACK, 1600, '0000001011011')
BitParser.add(BLACK, 1664, '0000001100100')
BitParser.add(BLACK, 1728, '0000001100101')
BitParser.add(BLACK, 1792, '00000001000')
BitParser.add(BLACK, 1856, '00000001100')
BitParser.add(BLACK, 1920, '00000001101')
BitParser.add(BLACK, 1984, '000000010010')
BitParser.add(BLACK, 2048, '000000010011')
BitParser.add(BLACK, 2112, '000000010100')
BitParser.add(BLACK, 2176, '000000010101')
BitParser.add(BLACK, 2240, '000000010110')
BitParser.add(BLACK, 2304, '000000010111')
BitParser.add(BLACK, 2368, '000000011100')
BitParser.add(BLACK, 2432, '000000011101')
BitParser.add(BLACK, 2496, '000000011110')
BitParser.add(BLACK, 2560, '000000011111')
# UNCOMPRESSED: code tree used after the 'u' mode code; leaves are handed to
# _parse_uncompressed().  A plain leaf is a string of literal pixel bits that
# is written via _do_uncompressed().  A leaf starting with 'T' leaves
# uncompressed mode: the character after 'T' becomes the new self._color and
# the remaining characters are written as pixel bits.
# Note that the six-bit code '000001' maps to the five-character leaf '00000'.
UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1', '1')
BitParser.add(UNCOMPRESSED, '01', '01')
BitParser.add(UNCOMPRESSED, '001', '001')
BitParser.add(UNCOMPRESSED, '0001', '0001')
BitParser.add(UNCOMPRESSED, '00001', '00001')
BitParser.add(UNCOMPRESSED, '00000', '000001')
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
class EOFB(Exception):
    """Raised by ``_parse_mode`` for the 'e' mode code; stops ``feedbytes``."""


class InvalidData(Exception):
    """Raised when a decoded code is missing or not valid in the current mode."""


class ByteSkip(Exception):
    """Raised by ``_flush_line`` when ``bytealign`` is set; ``feedbytes``
    then drops the rest of the current byte."""
def __init__(self, width, bytealign=False):
    """Create a parser for lines that are ``width`` pixels wide.

    :param width: number of pixels per line.
    :param bytealign: when truthy, ``_flush_line`` raises ``ByteSkip`` after
        each completed line.
    """
    BitParser.__init__(self)
    self.width, self.bytealign = width, bytealign
    self.reset()
def feedbytes(self, data):
    """Decode ``data`` bit by bit, most significant bit first.

    ``ByteSkip`` abandons the remaining bits of the current byte and restarts
    in mode-parsing state; ``EOFB`` stops processing altogether.
    """
    masks = (128, 64, 32, 16, 8, 4, 2, 1)
    for octet in get_bytes(data):
        try:
            for mask in masks:
                self._parse_bit(octet & mask)
        except self.ByteSkip:
            self._accept = self._parse_mode
            self._state = self.MODE
        except self.EOFB:
            break
def _parse_mode(self, mode):
if mode == 'p':
self._do_pass()
self._flush_line()
return self.MODE
elif mode == 'h':
self._n1 = 0
self._accept = self._parse_horiz1
if self._color:
return self.WHITE
else:
return self.BLACK
elif mode == 'u':
self._accept = self._parse_uncompressed
return self.UNCOMPRESSED
elif mode == 'e':
raise self.EOFB
elif isinstance(mode, int):
self._do_vertical(mode)
self._flush_line()
return self.MODE
else:
raise self.InvalidData(mode)
def _parse_horiz1(self, n):
if n is None:
raise self.InvalidData
self._n1 += n
if n < 64:
self._n2 = 0
self._color = 1-self._color
self._accept = self._parse_horiz2
if self._color:
return self.WHITE
else:
return self.BLACK
def _parse_horiz2(self, n):
if n is None:
raise self.InvalidData
self._n2 += n
if n < 64:
self._color = 1-self._color
self._accept = self._parse_mode
self._do_horizontal(self._n1, self._n2)
self._flush_line()
return self.MODE
elif self._color:
return self.WHITE
else:
return self.BLACK
def _parse_uncompressed(self, bits):
if not bits:
raise self.InvalidData
if bits.startswith('T'):
self._accept = self._parse_mode
self._color = int(bits[1])
self._do_uncompressed(bits[2:])
return self.MODE
else:
self._do_uncompressed(bits)
return self.UNCOMPRESSED
def _get_bits(self):
return ''.join(str(b) for b in self._curline[:self._curpos])
def _get_refline(self, i):
if i < 0:
return '[]'+''.join(str(b) for b in self._refline)
elif len(self._refline) <= i:
return ''.join(str(b) for b in self._refline)+'[]'
else:
return (''.join(str(b) for b in self._refline[:i]) +
'['+str(self._refline[i])+']' +
''.join(str(b) for b in self._refline[i+1:]))
def reset(self):
    """Return the parser to its initial state: line 0, an all-ones current
    line, and mode parsing."""
    self._y = 0
    blank = [1] * self.width
    self._curline = array.array('b', blank)
    # _reset_line() turns the line just created into the reference line.
    self._reset_line()
    self._accept, self._state = self._parse_mode, self.MODE
def output_line(self, y, bits):
    """Print line number ``y`` followed by its pixels as a digit string."""
    rendered = ''.join(map(str, bits))
    print(y, rendered)
def _reset_line(self):
self._refline = self._curline
self._curline = array.array('b', [1]*self.width)
self._curpos = -1
self._color = 1
return
def _flush_line(self):
if self.width <= self._curpos:
self.output_line(self._y, self._curline)
self._y += 1
self._reset_line()
if self.bytealign:
raise self.ByteSkip
return
def _do_vertical(self, dx):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color):
break
x1 += 1
x1 += dx
x0 = max(0, self._curpos)
x1 = max(0, min(self.width, x1))
if x1 < x0:
for x in range(x1, x0):
self._curline[x] = self._color
elif x0 < x1:
for x in range(x0, x1):
self._curline[x] = self._color
self._curpos = x1
self._color = 1-self._color
return
def _do_pass(self):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color):
break
x1 += 1
while 1:
if x1 == 0:
if (self._color == 0 and self._refline[x1] == self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color):
break
x1 += 1
for x in range(self._curpos, x1):
self._curline[x] = self._color
self._curpos = x1
return
def _do_horizontal(self, n1, n2):
if self._curpos < 0:
self._curpos = 0
x = self._curpos
for _ in range(n1):
if len(self._curline) <= x:
break
self._curline[x] = self._color
x += 1
for _ in range(n2):
if len(self._curline) <= x:
break
self._curline[x] = 1-self._color
x += 1
self._curpos = x
return
def _do_uncompressed(self, bits):
for c in bits:
self._curline[self._curpos] = int(c)
self._curpos += 1
self._flush_line()
return
class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width, bytealign=False, reversed=False):
    """Create a decoder that collects the decoded lines in a byte buffer.

    :param width: pixels per line, forwarded to ``CCITTG4Parser``.
    :param bytealign: forwarded to ``CCITTG4Parser``.
    :param reversed: when truthy, ``output_line`` inverts every pixel.
    """
    CCITTG4Parser.__init__(self, width, bytealign=bytealign)
    self._buf = b''
    self.reversed = reversed
def close(self):
    """Return all bytes produced so far."""
    result = self._buf
    return result
def output_line(self, y, bits):
    """Pack one decoded line into bytes and append it to the buffer.

    :param y: line number (unused here).
    :param bits: sequence of 0/1 pixels; the first pixel becomes the most
        significant bit of the first byte.  A partial last byte is padded
        with zero bits.
    """
    packed = array.array('B', [0]*((len(bits)+7)//8))
    if self.reversed:
        bits = [1-b for b in bits]
    for (i, b) in enumerate(bits):
        if b:
            packed[i//8] |= 0x80 >> (i % 8)
    # array.tostring() was removed in Python 3.9; tobytes() is the
    # replacement and returns the same data.
    self._buf += packed.tobytes()
def ccittfaxdecode(data, params):
    """Decode CCITT fax ``data`` using the filter parameters in ``params``.

    Reads the keys 'K', 'Columns', 'EncodedByteAlign' and 'BlackIs1' from
    ``params``.  Only ``K == -1`` is supported.

    :returns: the decoded bytes.
    :raises ValueError: for any other value of 'K'.
    """
    k_value = params.get('K')
    columns = params.get('Columns')
    align = params.get('EncodedByteAlign')
    invert = params.get('BlackIs1')
    if k_value != -1:
        raise ValueError(k_value)
    decoder = CCITTFaxDecoder(columns, bytealign=align, reversed=invert)
    decoder.feedbytes(data)
    return decoder.close()
# test
def main(argv):
    """Command-line test driver.

    Without arguments this runs ``unittest.main()``.  Otherwise every
    argument is a path whose name splits into six dot-separated fields, the
    fourth being the line width; the file is decoded and the result is saved
    as 'out.bmp' (requires pygame).
    """
    if not argv[1:]:
        import unittest
        return unittest.main()

    class Parser(CCITTG4Parser):
        """Parser that paints the decoded lines onto a pygame surface."""

        def __init__(self, width, bytealign=False):
            import pygame
            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            self.img = pygame.Surface((self.width, 1000))
            return

        def output_line(self, y, bits):
            for (x, b) in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))
            return

        def close(self):
            import pygame
            pygame.image.save(self.img, 'out.bmp')
            return

    for path in argv[1:]:
        # The context manager closes the file even when the name does not
        # have the expected shape or decoding fails.
        with open(path, 'rb') as fp:
            (_, _, k, w, h, _) = path.split('.')
            parser = Parser(int(w))
            parser.feedbytes(fp.read())
            parser.close()
    return
if __name__ == '__main__':
sys.exit(main(sys.argv))
-425
View File
@@ -1,425 +0,0 @@
""" Adobe character mapping (CMap) support.
CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).
More information is available on the Adobe website:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
"""
import sys
import os
import os.path
import gzip
import pickle as pickle
import struct
import logging
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import PSLiteral
from .psparser import literal_name
from .psparser import KWD
from .encodingdb import name2unicode
from .utils import choplist
from .utils import nunpack
log = logging.getLogger(__name__)
class CMapError(Exception):
    """Base class for CMap-related errors."""
class CMapBase:
    """Common base of all CMap classes: an attribute dictionary plus no-op
    hooks that subclasses override."""

    debug = 0

    def __init__(self, **kwargs):
        # Attributes given as keyword arguments (e.g. WMode, CMapName).
        self.attrs = dict(kwargs)

    def is_vertical(self):
        """Return True when the 'WMode' attribute is set and non-zero."""
        return self.attrs.get('WMode', 0) != 0

    def set_attr(self, k, v):
        """Store attribute ``k`` with value ``v``."""
        self.attrs[k] = v

    def add_code2cid(self, code, cid):
        """Hook; does nothing in the base class."""

    def add_cid2unichr(self, cid, code):
        """Hook; does nothing in the base class."""

    def use_cmap(self, cmap):
        """Hook; does nothing in the base class."""
class CMap(CMapBase):
    """CMap backed by ``code2cid``, a nested dictionary keyed by successive
    code items whose integer leaves are CIDs."""

    def __init__(self, **kwargs):
        CMapBase.__init__(self, **kwargs)
        self.code2cid = {}

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def use_cmap(self, cmap):
        """Merge a deep copy of ``cmap.code2cid`` into this map."""
        assert isinstance(cmap, CMap), str(type(cmap))

        def merge(target, source):
            for (key, value) in source.items():
                if isinstance(value, dict):
                    # A nested table is replaced by a fresh copy.
                    child = {}
                    target[key] = child
                    merge(child, value)
                else:
                    target[key] = value

        merge(self.code2cid, cmap.code2cid)

    def decode(self, code):
        """Yield the CIDs found while walking ``code`` through ``code2cid``.

        An item with no entry at the current level resets the walk to the
        top of the table.
        """
        log.debug('decode: %r, %r', self, code)
        level = self.code2cid
        for item in code:
            if item not in level:
                level = self.code2cid
                continue
            level = level[item]
            if isinstance(level, int):
                yield level
                level = self.code2cid

    def dump(self, out=sys.stdout, code2cid=None, code=None):
        """Write every ``code -> cid`` mapping to ``out``, sorted by code."""
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for (key, value) in sorted(code2cid.items()):
            path = code + (key,)
            if isinstance(value, int):
                out.write('code %r = cid %d\n' % (path, value))
            else:
                self.dump(out=out, code2cid=value, code=path)
class IdentityCMap(CMapBase):
    """CMap in which every two bytes of a code are the CID itself."""

    def decode(self, code):
        """Return the big-endian 16-bit values in ``code`` as a tuple.

        A trailing odd byte is ignored; previously it made ``struct.unpack``
        raise ``struct.error`` because the buffer size did not match the
        format.
        """
        n = len(code)//2
        if n:
            return struct.unpack('>%dH' % n, code[:2*n])
        return ()
class IdentityCMapByte(IdentityCMap):
    """CMap in which every single byte of a code is the CID itself."""

    def decode(self, code):
        """Return the byte values in ``code`` as a tuple."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack('>%dB' % count, code)
class UnicodeMap(CMapBase):
    """Mapping from CIDs to Unicode strings, stored in ``cid2unichr``."""

    def __init__(self, **kwargs):
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr = {}

    def __repr__(self):
        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')

    def get_unichr(self, cid):
        """Return the string for ``cid``; raises ``KeyError`` when unmapped."""
        log.debug('get_unichr: %r, %r', self, cid)
        return self.cid2unichr[cid]

    def dump(self, out=sys.stdout):
        """Write every ``cid -> unicode`` mapping to ``out``, sorted by CID."""
        for (cid, text) in sorted(self.cid2unichr.items()):
            out.write('cid %d = unicode %r\n' % (cid, text))
class FileCMap(CMap):
    """CMap that is filled in entry by entry while a CMap file is parsed."""

    def add_code2cid(self, code, cid):
        """Register ``cid`` for the character sequence ``code``.

        Every character but the last selects (creating it when necessary) a
        nested table; the last character maps to ``cid``.
        """
        # NOTE(review): this requires ``str`` while other code in this
        # module tests parser strings against ``bytes`` -- confirm which
        # type callers actually pass.
        assert isinstance(code, str) and isinstance(cid, int),\
            str((type(code), type(cid)))
        table = self.code2cid
        for ch in code[:-1]:
            table = table.setdefault(ord(ch), {})
        table[ord(code[-1])] = cid
class FileUnicodeMap(UnicodeMap):
    """UnicodeMap that is filled in entry by entry while a CMap is parsed."""

    def add_cid2unichr(self, cid, code):
        """Register the text for ``cid``.

        :param code: a ``PSLiteral`` (looked up with ``name2unicode``),
            ``bytes`` (decoded as UTF-16BE, ignoring errors) or an ``int``
            (converted with ``chr``).
        :raises TypeError: for any other type of ``code``.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode('UTF-16BE', 'ignore')
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise TypeError(code)
        self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap whose table comes from a loaded data object ``module``."""

    def __init__(self, name, module):
        CMap.__init__(self, CMapName=name)
        # The table is shared with ``module``, not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs['WMode'] = 1
class PyUnicodeMap(UnicodeMap):
    """UnicodeMap whose table comes from a loaded data object ``module``."""

    def __init__(self, name, module, vertical):
        UnicodeMap.__init__(self, CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs['WMode'] = 1
class CMapDB:
    """Loader and cache for the predefined CMaps stored as pickled files."""

    _cmap_cache = {}
    _umap_cache = {}

    class CMapNotFound(CMapError):
        pass

    @classmethod
    def _load_data(cls, name):
        """Load ``<name>.pickle.gz`` from the CMap directories.

        The directories are ``$CMAP_PATH`` (default '/usr/share/pdfminer/')
        and the 'cmap' directory next to this module.

        :returns: a new class named ``name`` whose attributes are the items
            of the unpickled dictionary.
        :raises CMapNotFound: when no directory contains the file.
        """
        name = name.replace("\0", "")
        filename = '%s.pickle.gz' % name
        log.info('loading: %r', name)
        cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
                      os.path.join(os.path.dirname(__file__), 'cmap'),)
        for directory in cmap_paths:
            root = os.path.abspath(directory)
            path = os.path.abspath(os.path.join(root, filename))
            # ``name`` can originate from the document being parsed.  Refuse
            # anything that resolves outside the CMap directory (path
            # separators, '..'), otherwise a crafted name could make us
            # unpickle an arbitrary file.
            if os.path.dirname(path) != root:
                continue
            if os.path.exists(path):
                with gzip.open(path) as gzfile:
                    # SECURITY: pickle executes code from its input; the
                    # CMap directories must only hold trusted files.
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name):
        """Return the CMap called ``name``, loading and caching it if needed.

        The identity names ('Identity-H', 'Identity-V', 'OneByteIdentityH',
        'OneByteIdentityV') are built directly and are not cached.

        :raises CMapNotFound: when the CMap data cannot be found.
        """
        if name == 'Identity-H':
            return IdentityCMap(WMode=0)
        elif name == 'Identity-V':
            return IdentityCMap(WMode=1)
        elif name == 'OneByteIdentityH':
            return IdentityCMapByte(WMode=0)
        elif name == 'OneByteIdentityV':
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name, vertical=False):
        """Return the Unicode map ``to-unicode-<name>``.

        Both the horizontal and the vertical variant are built and cached on
        first use; ``vertical`` selects which one is returned.

        :raises CMapNotFound: when the map data cannot be found.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data('to-unicode-%s' % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
                                 for v in (False, True)]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser):
    """Parser for CMap files/streams that fills a CMap object with the
    mappings it finds."""

    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        return

    def run(self):
        """Parse the input; reaching the end of the data is not an error."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    KEYWORD_BEGINCMAP = KWD(b'begincmap')
    KEYWORD_ENDCMAP = KWD(b'endcmap')
    KEYWORD_USECMAP = KWD(b'usecmap')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange')
    KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange')
    KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange')
    KEYWORD_ENDCIDRANGE = KWD(b'endcidrange')
    KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar')
    KEYWORD_ENDCIDCHAR = KWD(b'endcidchar')
    KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange')
    KEYWORD_ENDBFRANGE = KWD(b'endbfrange')
    KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar')
    KEYWORD_ENDBFCHAR = KWD(b'endbfchar')
    KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
    KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')

    def do_keyword(self, pos, token):
        """Process one keyword token.

        Section keywords consume the operands collected on the stack and
        feed them to ``self.cmap``; malformed entries are skipped.  Any
        keyword not handled here is pushed onto the stack.  Everything
        between 'endcmap' and the next 'begincmap' is ignored.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return
        #
        if token is self.KEYWORD_DEF:
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, cid) in choplist(3, objs):
                # NOTE(review): the range ends are tested against ``str``
                # here but against ``bytes`` in the bfrange/bfchar sections
                # below -- confirm which type the parser yields.
                if (not isinstance(s, str) or not isinstance(e, str) or
                        not isinstance(cid, int) or len(s) != len(e)):
                    continue
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix:
                    continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                for i in range(e1-s1+1):
                    x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                    self.cmap.add_code2cid(x, cid+i)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return
        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                if isinstance(code, str) and isinstance(cid, str):
                    self.cmap.add_code2cid(code, nunpack(cid))
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDBFRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, code) in choplist(3, objs):
                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                        len(s) != len(e)):
                    continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                if isinstance(code, list):
                    # zip() stops at the shorter input, so an array with
                    # fewer entries than the range no longer raises
                    # IndexError.
                    for (i, item) in zip(range(e1-s1+1), code):
                        self.cmap.add_cid2unichr(s1+i, item)
                elif isinstance(code, bytes):
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(e1-s1+1):
                        x = prefix+struct.pack('>L', base+i)[-vlen:]
                        self.cmap.add_cid2unichr(s1+i, x)
                # Any other destination type cannot be sliced or
                # incremented; skip the entry like other malformed ones.
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return
        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        self.push((pos, token))
        return
def main(argv):
    """Parse every file named in ``argv[1:]`` as a CMap and dump the
    resulting Unicode map to standard output."""
    args = argv[1:]
    for fname in args:
        # The context manager closes the file even when parsing raises.
        with open(fname, 'rb') as fp:
            cmap = FileUnicodeMap()
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
if __name__ == '__main__':
sys.exit(main(sys.argv))
-587
View File
@@ -1,587 +0,0 @@
import logging
import re
import sys
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .layout import LTContainer
from .layout import LTPage
from .layout import LTText
from .layout import LTLine
from .layout import LTRect
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTChar
from .layout import LTTextLine
from .layout import LTTextBox
from .layout import LTTextBoxVertical
from .layout import LTTextGroup
from .utils import apply_matrix_pt
from .utils import mult_matrix
from .utils import enc
from .utils import bbox2str
from . import utils
log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that turns rendering callbacks into a tree of layout objects
    (``LTPage`` containing figures, images, lines, rectangles, curves and
    characters) and hands each finished page to ``receive_layout``."""

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno, self.laparams = pageno, laparams
        # Containers that enclose the one currently being filled.
        self._stack = []

    def begin_page(self, page, ctm):
        """Start a new ``LTPage`` sized after the transformed media box."""
        (left, bottom, right, top) = page.mediabox
        (left, bottom) = apply_matrix_pt(ctm, (left, bottom))
        (right, top) = apply_matrix_pt(ctm, (right, top))
        extent = (0, 0, abs(left-right), abs(bottom-top))
        self.cur_item = LTPage(self.pageno, extent)

    def end_page(self, page):
        """Finish the page, run layout analysis if ``laparams`` is set and
        deliver the page to ``receive_layout``."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix):
        """Open a nested ``LTFigure``; the current container is stacked."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _):
        """Close the current figure and add it to its parent container."""
        finished = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(finished)

    def render_image(self, name, stream):
        """Add an ``LTImage`` covering the whole enclosing figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        bounds = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, bounds))

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Paint paths described in section 4.4 of the PDF reference manual"""
        shape = ''.join(x[0] for x in path)
        if shape.count('m') > 1:
            # recurse if there are multiple m's in this shape
            for m in re.finditer(r'm[^m]+', shape):
                self.paint_path(gstate, stroke, fill, evenodd,
                                path[m.start(0):m.end(0)])
            return
        if shape == 'ml':
            # single line segment; only axis-aligned ones are recorded
            (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
            (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
            if x0 == x1 or y0 == y1:
                self.cur_item.add(
                    LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
                           fill, evenodd, gstate.scolor, gstate.ncolor))
            return
        if shape == 'mlllh':
            (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
            (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
            (x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
            (x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
            axis_aligned = (
                (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0))
            if axis_aligned:
                self.cur_item.add(
                    LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
                           fill, evenodd, gstate.scolor, gstate.ncolor))
                return
        # Every other shape is kept as a generic curve.
        self.cur_item.add(
            self._create_curve(gstate, stroke, fill, evenodd, path))

    def _create_curve(self, gstate, stroke, fill, evenodd, path):
        """Create a `LTCurve` object for the paint path operator"""
        pts = []
        for segment in path:
            # Operands come as x, y pairs after the operator letter.
            for point in zip(segment[1::2], segment[2::2]):
                pts.append(apply_matrix_pt(self.ctm, point))
        return LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
                       gstate.scolor, gstate.ncolor)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Add an ``LTChar`` for ``cid`` and return its advance."""
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        glyph = LTChar(matrix, font, fontsize, scaling, rise, text,
                       font.char_width(cid), font.char_disp(cid), ncs,
                       graphicstate)
        self.cur_item.add(glyph)
        return glyph.adv

    def handle_undefined_char(self, font, cid):
        """Return the placeholder text for a CID without a Unicode mapping."""
        log.info('undefined: %r, %r', font, cid)
        return '(cid:%d)' % cid

    def receive_layout(self, ltpage):
        """Hook called with each finished page; no-op in this class."""
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
                                   laparams=laparams)
        # Last page passed to receive_layout(), or None before the first.
        self.result = None

    def receive_layout(self, ltpage):
        """Remember ``ltpage`` as the latest result."""
        self.result = ltpage

    def get_result(self):
        """Return the page stored by the last ``receive_layout`` call."""
        return self.result
class PDFConverter(PDFLayoutAnalyzer):
    """Base class of converters that write to an output file object.

    Sets ``outfp_binary`` to tell whether ``outfp`` expects bytes.
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                 laparams=None):
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
                                   laparams=laparams)
        self.outfp = outfp
        self.codec = codec
        if hasattr(self.outfp, 'mode'):
            # Regular file objects advertise binary mode with a 'b'.
            self.outfp_binary = 'b' in self.outfp.mode
            return
        import io
        if isinstance(self.outfp, io.BytesIO):
            self.outfp_binary = True
        elif isinstance(self.outfp, io.StringIO):
            self.outfp_binary = False
        else:
            # Unknown object: probe by writing text.  Note that on success
            # the probe character ends up in the output.
            try:
                self.outfp.write("é")
            except TypeError:
                self.outfp_binary = True
            else:
                self.outfp_binary = False
class TextConverter(PDFConverter):
    """Converter that writes the plain text of each page to ``outfp``."""

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False, imagewriter=None):
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.showpageno, self.imagewriter = showpageno, imagewriter

    def write_text(self, text):
        """Write ``text`` to the output, as bytes for binary outputs."""
        chunk = utils.compatible_encode_method(text, self.codec, 'ignore')
        if self.outfp_binary:
            chunk = chunk.encode()
        self.outfp.write(chunk)

    def receive_layout(self, ltpage):
        """Write the text of ``ltpage`` followed by a form feed; images are
        exported when an image writer is configured."""

        def emit(node):
            if isinstance(node, LTContainer):
                for child in node:
                    emit(child)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())
            # Checked separately: a text box is also a container.
            if isinstance(node, LTTextBox):
                self.write_text('\n')
            elif isinstance(node, LTImage):
                if self.imagewriter is not None:
                    self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text('Page %s\n' % ltpage.pageid)
        emit(ltpage)
        self.write_text('\f')

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text.  This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name, stream):
        """Record the image only when an image writer is configured."""
        if self.imagewriter is not None:
            PDFConverter.render_image(self, name, stream)

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Ignore path painting entirely."""
class HTMLConverter(PDFConverter):
# Border colours per layout-item kind; merged into rect_colors in debug mode.
RECT_COLORS = dict(
    figure='yellow',
    textline='magenta',
    textbox='cyan',
    textgroup='red',
    curve='black',
    page='gray',
)
# Text colours per item kind; merged into text_colors in debug mode.
TEXT_COLORS = dict(
    textbox='blue',
    char='black',
)
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
             pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
             text_colors=None):
    """Set up the converter and write the HTML header.

    ``rect_colors`` / ``text_colors`` map item kinds to CSS colours; kinds
    without an entry are not drawn.  With a truthy ``debug`` the class-level
    ``RECT_COLORS`` / ``TEXT_COLORS`` are merged into them (dictionaries
    supplied by the caller are updated in place).
    """
    PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                          laparams=laparams)
    if text_colors is None:
        text_colors = {'char': 'black'}
    if rect_colors is None:
        rect_colors = {'curve': 'black', 'page': 'gray'}

    self.scale, self.fontscale = scale, fontscale
    self.layoutmode, self.showpageno = layoutmode, showpageno
    self.pagemargin, self.imagewriter = pagemargin, imagewriter
    self.rect_colors, self.text_colors = rect_colors, text_colors
    if debug:
        self.rect_colors.update(self.RECT_COLORS)
        self.text_colors.update(self.TEXT_COLORS)

    # Vertical offset used when positioning items; starts at the margin.
    self._yoffset = self.pagemargin
    self._font = None
    self._fontstack = []
    self.write_header()
def write(self, text):
    """Write ``text`` to the output, encoded with ``codec`` when one is set."""
    payload = text.encode(self.codec) if self.codec else text
    if sys.version_info < (3, 0):
        payload = str(payload)
    self.outfp.write(payload)
def write_header(self):
    """Write the opening HTML tags; the meta tag names the charset when a
    codec is set."""
    self.write('<html><head>\n')
    if not self.codec:
        meta = '<meta http-equiv="Content-Type" content="text/html">\n'
    else:
        meta = ('<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec)
    self.write(meta)
    self.write('</head><body>\n')
def write_footer(self):
page_links = ['<a href="#{}">{}</a>'.format(i, i)
for i in range(1, self.pageno)]
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
', '.join(page_links)
self.write(s)
self.write('</body></html>\n')
return
def write_text(self, text):
self.write(enc(text))
return
def place_rect(self, color, borderwidth, x, y, w, h):
color = self.rect_colors.get(color)
if color is not None:
s = '<span style="position:absolute; border: %s %dpx solid; ' \
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
(color, borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(
s)
return
def place_border(self, color, borderwidth, item):
self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
item.height)
return
def place_image(self, item, borderwidth, x, y, w, h):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
s = '<img src="%s" border="%d" style="position:absolute; ' \
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
(enc(name), borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(s)
return
def place_text(self, color, text, x, y, size):
color = self.text_colors.get(color)
if color is not None:
s = '<span style="position:absolute; color:%s; left:%dpx; ' \
'top:%dpx; font-size:%dpx;">' % \
(color, x * self.scale, (self._yoffset - y) * self.scale,
size * self.scale * self.fontscale)
self.write(s)
self.write_text(text)
self.write('</span>\n')
return
def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
self._fontstack.append(self._font)
self._font = None
s = '<div style="position:absolute; border: %s %dpx solid; ' \
'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \
'height:%dpx;">' % \
(color, borderwidth, writing_mode, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale, h * self.scale)
self.write(s)
return
def end_div(self, color):
if self._font is not None:
self.write('</span>')
self._font = self._fontstack.pop()
self.write('</div>')
return
def put_text(self, text, fontname, fontsize):
font = (fontname, fontsize)
if font != self._font:
if self._font is not None:
self.write('</span>')
# Remove subset tag from fontname, see PDF Reference 5.5.3
fontname_without_subset_tag = fontname.split('+')[-1]
self.write('<span style="font-family: %s; font-size:%dpx">' %
(fontname_without_subset_tag,
fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
return
def put_newline(self):
self.write('<br>')
return
def receive_layout(self, ltpage):
def show_group(item):
if isinstance(item, LTTextGroup):
self.place_border('textgroup', 1, item)
for child in item:
show_group(child)
return
def render(item):
if isinstance(item, LTPage):
self._yoffset += item.y1
self.place_border('page', 1, item)
if self.showpageno:
self.write('<div style="position:absolute; top:%dpx;">' %
((self._yoffset-item.y1)*self.scale))
self.write('<a name="{}">Page {}</a></div>\n'
.format(item.pageid, item.pageid))
for child in item:
render(child)
if item.groups is not None:
for group in item.groups:
show_group(group)
elif isinstance(item, LTCurve):
self.place_border('curve', 1, item)
elif isinstance(item, LTFigure):
self.begin_div('figure', 1, item.x0, item.y1, item.width,
item.height)
for child in item:
render(child)
self.end_div('figure')
elif isinstance(item, LTImage):
self.place_image(item, 1, item.x0, item.y1, item.width,
item.height)
else:
if self.layoutmode == 'exact':
if isinstance(item, LTTextLine):
self.place_border('textline', 1, item)
for child in item:
render(child)
elif isinstance(item, LTTextBox):
self.place_border('textbox', 1, item)
self.place_text('textbox', str(item.index+1), item.x0,
item.y1, 20)
for child in item:
render(child)
elif isinstance(item, LTChar):
self.place_border('char', 1, item)
self.place_text('char', item.get_text(), item.x0,
item.y1, item.size)
else:
if isinstance(item, LTTextLine):
for child in item:
render(child)
if self.layoutmode != 'loose':
self.put_newline()
elif isinstance(item, LTTextBox):
self.begin_div('textbox', 1, item.x0, item.y1,
item.width, item.height,
item.get_writing_mode())
for child in item:
render(child)
self.end_div('textbox')
elif isinstance(item, LTChar):
self.put_text(item.get_text(), item.fontname,
item.size)
elif isinstance(item, LTText):
self.write_text(item.get_text())
return
render(ltpage)
self._yoffset += self.pagemargin
return
def close(self):
self.write_footer()
return
class XMLConverter(PDFConverter):
    """Writes analysed page layouts as an XML document rooted at ``<pages>``.

    The constructor emits the XML header, ``receive_layout`` appends one
    ``<page>`` element at a time and ``close`` emits the closing tag.
    """

    # Control characters removed from text when ``stripcontrol`` is set;
    # tab (0x09), line feed (0x0a) and carriage return (0x0d) are kept.
    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """Set up the converter and immediately write the XML header.

        :param rsrcmgr: forwarded to ``PDFConverter.__init__``
        :param outfp: forwarded to ``PDFConverter.__init__``; ``write``
            sends all output to ``self.outfp``
        :param codec: forwarded to ``PDFConverter.__init__``; when
            ``self.codec`` is truthy, output is encoded with it
        :param pageno: forwarded to ``PDFConverter.__init__``
        :param laparams: forwarded to ``PDFConverter.__init__``
        :param imagewriter: object whose ``export_image(item)`` result is
            used as the ``<image src>``; when ``None`` images are written
            without a ``src`` attribute
        :param stripcontrol: when truthy, characters matching ``CONTROL``
            are removed from text content
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text):
        """Write ``text`` to ``self.outfp``, encoded when a codec is set."""
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)

    def write_header(self):
        """Write the XML declaration and the opening ``<pages>`` tag."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write('<pages>\n')

    def write_footer(self):
        """Write the closing ``</pages>`` tag."""
        self.write('</pages>\n')

    def write_text(self, text):
        """Write text content, optionally control-stripped, through ``enc``."""
        if self.stripcontrol:
            text = self.CONTROL.sub('', text)
        self.write(enc(text))

    def receive_layout(self, ltpage):
        """Render one analysed page as a ``<page>`` element."""
        def show_group(item):
            # Text boxes are referenced by id; text groups nest recursively.
            if isinstance(item, LTTextBox):
                self.write('<textbox id="%d" bbox="%s" />\n' %
                           (item.index, bbox2str(item.bbox)))
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write('</textgroup>\n')

        def render(item):
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
                    (item.pageid, bbox2str(item.bbox), item.rotate)
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write('<layout>\n')
                    for group in item.groups:
                        show_group(group)
                    self.write('</layout>\n')
                self.write('</page>\n')
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % \
                    (item.linewidth, bbox2str(item.bbox))
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % \
                    (item.linewidth, bbox2str(item.bbox))
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % \
                    (item.linewidth, bbox2str(item.bbox), item.get_pts())
                self.write(s)
            elif isinstance(item, LTFigure):
                # The figure name comes from the document, so escape it
                # before placing it inside the attribute (as is done for
                # the font name of <text>).
                s = '<figure name="%s" bbox="%s">\n' % \
                    (enc(item.name), bbox2str(item.bbox))
                self.write(s)
                for child in item:
                    render(child)
                self.write('</figure>\n')
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write('</textline>\n')
            elif isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' %\
                    (item.index, bbox2str(item.bbox), wmode)
                self.write(s)
                for child in item:
                    render(child)
                self.write('</textbox>\n')
            elif isinstance(item, LTChar):
                s = '<text font="%s" bbox="%s" colourspace="%s" ' \
                    'ncolour="%s" size="%.3f">' % \
                    (enc(item.fontname), bbox2str(item.bbox),
                     item.ncs.name, item.graphicstate.ncolor, item.size)
                self.write(s)
                self.write_text(item.get_text())
                self.write('</text>\n')
            elif isinstance(item, LTText):
                # Route the content through write_text so it is escaped and
                # control-stripped exactly like character text.
                self.write('<text>')
                self.write_text(item.get_text())
                self.write('</text>\n')
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write('<image src="%s" width="%d" height="%d" />\n' %
                               (enc(name), item.width, item.height))
                else:
                    self.write('<image width="%d" height="%d" />\n' %
                               (item.width, item.height))
            else:
                assert False, str(('Unhandled', item))

        render(ltpage)

    def close(self):
        """Finish the document by writing the footer."""
        self.write_footer()
-112
View File
@@ -1,112 +0,0 @@
import logging
import re
from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
from .psparser import PSLiteral
# Matches a run of one or more hexadecimal digits (upper or lower case).
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of
    returning an empty string when the key is unknown.
    This way the caller must explicitly define what to do
    when there is not a match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    Everything from the first ``.`` onwards is ignored. Names containing
    ``_`` are split and each component is converted separately. A component
    is resolved, in order, as a glyph-list entry, as ``uni`` followed by
    groups of four hex digits, or as ``u`` followed by four to six hex
    digits.

    :param name: glyph name to convert
    :returns: the unicode string the name maps to
    :raises KeyError: if the name does not match any of the forms above or
        denotes an invalid code point
    """
    name = name.split('.')[0]
    components = name.split('_')
    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Remove the prefix only; str.strip('uni') would also eat any
        # trailing or repeated 'u'/'n'/'i' characters.
        hex_digits = name[len('uni'):]
        # fullmatch: the whole remainder must be hexadecimal, otherwise
        # int() below could raise ValueError on a later group.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [int(hex_digits[i:i + 4], base=16)
                              for i in range(0, len(hex_digits), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return ''.join(map(chr, unicode_digits))
    elif name.startswith('u'):
        hex_digits = name[len('u'):]
        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            # Six hex digits can exceed the largest code point, which
            # chr() would reject with a ValueError.
            if unicode_digit > 0x10FFFF:
                raise KeyError('Unicode digit %d is invalid because it is '
                               'beyond the maximum code point 10FFFF'
                               % unicode_digit)
            return chr(unicode_digit)

    raise KeyError('Could not convert unicode name "%s" to character because '
                   'it does not match specification' % name)
def raise_key_error_for_invalid_unicode(unicode_digit):
    """Reject values that fall inside the UTF-16 surrogate range.

    Values strictly between D7FF and E000 (that is, D800 through DFFF) are
    used for surrogate pairs in UTF-16 and are not accepted.

    :param unicode_digit: numeric value to check
    :raises KeyError: if the value lies in the range D800 through DFFF
    """
    outside_surrogates = not (0xD7FF < unicode_digit < 0xE000)
    if outside_surrogates:
        return None
    message = 'Unicode digit %d is invalid because ' \
              'it is in the range D800 through DFFF' % unicode_digit
    raise KeyError(message)
class EncodingDB:
    """Code-to-unicode tables for the four predefined encodings.

    The tables are built once, while the class body executes, from the rows
    of ``ENCODING``.
    """

    # One table per encoding column of ENCODING.
    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}

    # Each row carries a glyph name plus its code in every encoding; a
    # falsy code leaves that encoding's table untouched for the row.
    for (name, std, mac, win, pdf) in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, as looked up by get_encoding.
    encodings = dict(
        StandardEncoding=std2unicode,
        MacRomanEncoding=mac2unicode,
        WinAnsiEncoding=win2unicode,
        PDFDocEncoding=pdf2unicode,
    )

    @classmethod
    def get_encoding(cls, name, diff=None):
        """Return the code-to-unicode table for an encoding name.

        Unknown names fall back to the standard encoding table. Without
        ``diff`` the shared class-level table itself is returned; with a
        non-empty ``diff`` a copy is returned with the differences applied.

        :param name: encoding name, a key of ``encodings``
        :param diff: optional sequence in which an ``int`` sets the current
            code and each ``PSLiteral`` assigns its glyph to the current
            code and then advances it by one
        :returns: dict mapping codes to unicode strings
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return base_table

        table = base_table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                table[code] = name2unicode(entry.name)
            except (KeyError, ValueError) as exc:
                # Unconvertible glyph names are logged and skipped, but the
                # code position still advances.
                log.debug(str(exc))
            code += 1
        return table
-46
View File
@@ -1,46 +0,0 @@
""" Font metrics for the Adobe core 14 fonts.
Font metrics are used to compute the boundary of each character
written with a proportional font.
The following data were extracted from the AFM files:
http://www.ctan.org/tex-archive/fonts/adobe/afm/
"""
### BEGIN Verbatim copy of the license part
#
# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
#
# This file and the 35 PostScript(R) AFM files it accompanies may be
# used, copied, and distributed for any purpose and without charge,
# with or without modification, provided that all copyright notices
# are retained; that the AFM files are not distributed without this
# file; that all modifications to this file or any of the AFM files
# are prominently noted in the modified file(s); and that this
# paragraph is not modified. Adobe Systems has no responsibility or
# obligation to support the use of the AFM files.
#
### END Verbatim copy of the license part
# flake8: noqa
FONT_METRICS = {
'Courier': ({'FontName': 'Courier', 'Descent': -194.0, 'FontBBox': (-6.0, -249.0, 639.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, '\xdf': 600, 
'\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 600, 
'\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
'Courier-Bold': ({'FontName': 'Courier-Bold', 'Descent': -194.0, 'FontBBox': (-88.0, -249.0, 697.0, 811.0), 'FontWeight': 'Bold', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, '\xdf': 
600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 600, 
'\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
'Courier-BoldOblique': ({'FontName': 'Courier-BoldOblique', 'Descent': -194.0, 'FontBBox': (-49.0, -249.0, 758.0, 811.0), 'FontWeight': 'Bold', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': -11.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 
600, '\xdf': 600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, 
'\u2020': 600, '\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
'Courier-Oblique': ({'FontName': 'Courier-Oblique', 'Descent': -194.0, 'FontBBox': (-49.0, -249.0, 749.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': -11.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, 
'\xdf': 600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 
600, '\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
'Helvetica': ({'FontName': 'Helvetica', 'Descent': -207.0, 'FontBBox': (-166.0, -225.0, 1000.0, 931.0), 'FontWeight': 'Medium', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 523.0, 'ItalicAngle': 0.0, 'Ascent': 718.0}, {' ': 278, '!': 278, '"': 355, '#': 556, '$': 556, '%': 889, '&': 667, "'": 191, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 278, ';': 278, '<': 584, '=': 584, '>': 584, '?': 556, '@': 1015, 'A': 667, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 500, 'K': 667, 'L': 556, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 278, '\\': 278, ']': 278, '^': 469, '_': 556, '`': 333, 'a': 556, 'b': 556, 'c': 500, 'd': 556, 'e': 556, 'f': 278, 'g': 556, 'h': 556, 'i': 222, 'j': 222, 'k': 500, 'l': 222, 'm': 833, 'n': 556, 'o': 556, 'p': 556, 'q': 556, 'r': 333, 's': 500, 't': 278, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 500, '{': 334, '|': 260, '}': 334, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 260, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 556, '\xb6': 537, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, '\xde': 667, '\xdf': 
611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 500, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 556, '\xf1': 556, '\xf2': 556, '\xf3': 556, '\xf4': 556, '\xf5': 556, '\xf6': 556, '\xf7': 584, '\xf8': 611, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 667, '\u0101': 556, '\u0102': 667, '\u0103': 556, '\u0104': 667, '\u0105': 556, '\u0106': 722, '\u0107': 500, '\u010c': 722, '\u010d': 500, '\u010e': 722, '\u010f': 643, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 556, '\u0122': 778, '\u0123': 556, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 222, '\u0130': 278, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 556, '\u013a': 222, '\u013b': 556, '\u013c': 222, '\u013d': 556, '\u013e': 299, '\u0141': 556, '\u0142': 222, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 556, '\u0150': 778, '\u0151': 556, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 333, '\u0156': 722, '\u0157': 333, '\u0158': 722, '\u0159': 333, '\u015a': 667, '\u015b': 500, '\u015e': 667, '\u015f': 500, '\u0160': 667, '\u0161': 500, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 317, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 500, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 222, '\u2019': 222, '\u201a': 222, '\u201c': 333, '\u201d': 333, '\u201e': 333, '\u2020': 556, 
'\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
'Helvetica-Bold': ({'FontName': 'Helvetica-Bold', 'Descent': -207.0, 'FontBBox': (-170.0, -228.0, 1003.0, 962.0), 'FontWeight': 'Bold', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 532.0, 'ItalicAngle': 0.0, 'Ascent': 718.0}, {' ': 278, '!': 333, '"': 474, '#': 556, '$': 556, '%': 889, '&': 722, "'": 238, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 333, ';': 333, '<': 584, '=': 584, '>': 584, '?': 611, '@': 975, 'A': 722, 'B': 722, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 556, 'K': 722, 'L': 611, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 584, '_': 556, '`': 333, 'a': 556, 'b': 611, 'c': 556, 'd': 611, 'e': 556, 'f': 333, 'g': 611, 'h': 611, 'i': 278, 'j': 278, 'k': 556, 'l': 278, 'm': 889, 'n': 611, 'o': 611, 'p': 611, 'q': 611, 'r': 389, 's': 556, 't': 333, 'u': 611, 'v': 556, 'w': 778, 'x': 556, 'y': 556, 'z': 500, '{': 389, '|': 280, '}': 389, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 280, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 611, '\xb6': 556, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, '\xde': 667, 
'\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 556, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 611, '\xf1': 611, '\xf2': 611, '\xf3': 611, '\xf4': 611, '\xf5': 611, '\xf6': 611, '\xf7': 584, '\xf8': 611, '\xf9': 611, '\xfa': 611, '\xfb': 611, '\xfc': 611, '\xfd': 556, '\xfe': 611, '\xff': 556, '\u0100': 722, '\u0101': 556, '\u0102': 722, '\u0103': 556, '\u0104': 722, '\u0105': 556, '\u0106': 722, '\u0107': 556, '\u010c': 722, '\u010d': 556, '\u010e': 722, '\u010f': 743, '\u0110': 722, '\u0111': 611, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 611, '\u0122': 778, '\u0123': 611, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 278, '\u0130': 278, '\u0131': 278, '\u0136': 722, '\u0137': 556, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 400, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 611, '\u0145': 722, '\u0146': 611, '\u0147': 722, '\u0148': 611, '\u014c': 778, '\u014d': 611, '\u0150': 778, '\u0151': 611, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 389, '\u0156': 722, '\u0157': 389, '\u0158': 722, '\u0159': 389, '\u015a': 667, '\u015b': 556, '\u015e': 667, '\u015f': 556, '\u0160': 667, '\u0161': 556, '\u0162': 611, '\u0163': 333, '\u0164': 611, '\u0165': 389, '\u016a': 722, '\u016b': 611, '\u016e': 722, '\u016f': 611, '\u0170': 722, '\u0171': 611, '\u0172': 722, '\u0173': 611, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 556, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 278, '\u2019': 278, '\u201a': 278, '\u201c': 500, '\u201d': 500, '\u201e': 500, 
'\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 611, '\ufb02': 611}),
'Helvetica-BoldOblique': ({'FontName': 'Helvetica-BoldOblique', 'Descent': -207.0, 'FontBBox': (-175.0, -228.0, 1114.0, 962.0), 'FontWeight': 'Bold', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 532.0, 'ItalicAngle': -12.0, 'Ascent': 718.0}, {' ': 278, '!': 333, '"': 474, '#': 556, '$': 556, '%': 889, '&': 722, "'": 238, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 333, ';': 333, '<': 584, '=': 584, '>': 584, '?': 611, '@': 975, 'A': 722, 'B': 722, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 556, 'K': 722, 'L': 611, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 584, '_': 556, '`': 333, 'a': 556, 'b': 611, 'c': 556, 'd': 611, 'e': 556, 'f': 333, 'g': 611, 'h': 611, 'i': 278, 'j': 278, 'k': 556, 'l': 278, 'm': 889, 'n': 611, 'o': 611, 'p': 611, 'q': 611, 'r': 389, 's': 556, 't': 333, 'u': 611, 'v': 556, 'w': 778, 'x': 556, 'y': 556, 'z': 500, '{': 389, '|': 280, '}': 389, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 280, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 611, '\xb6': 556, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, 
'\xde': 667, '\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 556, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 611, '\xf1': 611, '\xf2': 611, '\xf3': 611, '\xf4': 611, '\xf5': 611, '\xf6': 611, '\xf7': 584, '\xf8': 611, '\xf9': 611, '\xfa': 611, '\xfb': 611, '\xfc': 611, '\xfd': 556, '\xfe': 611, '\xff': 556, '\u0100': 722, '\u0101': 556, '\u0102': 722, '\u0103': 556, '\u0104': 722, '\u0105': 556, '\u0106': 722, '\u0107': 556, '\u010c': 722, '\u010d': 556, '\u010e': 722, '\u010f': 743, '\u0110': 722, '\u0111': 611, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 611, '\u0122': 778, '\u0123': 611, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 278, '\u0130': 278, '\u0131': 278, '\u0136': 722, '\u0137': 556, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 400, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 611, '\u0145': 722, '\u0146': 611, '\u0147': 722, '\u0148': 611, '\u014c': 778, '\u014d': 611, '\u0150': 778, '\u0151': 611, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 389, '\u0156': 722, '\u0157': 389, '\u0158': 722, '\u0159': 389, '\u015a': 667, '\u015b': 556, '\u015e': 667, '\u015f': 556, '\u0160': 667, '\u0161': 556, '\u0162': 611, '\u0163': 333, '\u0164': 611, '\u0165': 389, '\u016a': 722, '\u016b': 611, '\u016e': 722, '\u016f': 611, '\u0170': 722, '\u0171': 611, '\u0172': 722, '\u0173': 611, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 556, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 278, '\u2019': 278, '\u201a': 278, '\u201c': 500, '\u201d': 500, '\u201e': 
500, '\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 611, '\ufb02': 611}),
'Helvetica-Oblique': ({'FontName': 'Helvetica-Oblique', 'Descent': -207.0, 'FontBBox': (-171.0, -225.0, 1116.0, 931.0), 'FontWeight': 'Medium', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 523.0, 'ItalicAngle': -12.0, 'Ascent': 718.0}, {' ': 278, '!': 278, '"': 355, '#': 556, '$': 556, '%': 889, '&': 667, "'": 191, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 278, ';': 278, '<': 584, '=': 584, '>': 584, '?': 556, '@': 1015, 'A': 667, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 500, 'K': 667, 'L': 556, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 278, '\\': 278, ']': 278, '^': 469, '_': 556, '`': 333, 'a': 556, 'b': 556, 'c': 500, 'd': 556, 'e': 556, 'f': 278, 'g': 556, 'h': 556, 'i': 222, 'j': 222, 'k': 500, 'l': 222, 'm': 833, 'n': 556, 'o': 556, 'p': 556, 'q': 556, 'r': 333, 's': 500, 't': 278, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 500, '{': 334, '|': 260, '}': 334, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 260, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 556, '\xb6': 537, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, 
'\xde': 667, '\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 500, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 556, '\xf1': 556, '\xf2': 556, '\xf3': 556, '\xf4': 556, '\xf5': 556, '\xf6': 556, '\xf7': 584, '\xf8': 611, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 667, '\u0101': 556, '\u0102': 667, '\u0103': 556, '\u0104': 667, '\u0105': 556, '\u0106': 722, '\u0107': 500, '\u010c': 722, '\u010d': 500, '\u010e': 722, '\u010f': 643, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 556, '\u0122': 778, '\u0123': 556, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 222, '\u0130': 278, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 556, '\u013a': 222, '\u013b': 556, '\u013c': 222, '\u013d': 556, '\u013e': 299, '\u0141': 556, '\u0142': 222, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 556, '\u0150': 778, '\u0151': 556, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 333, '\u0156': 722, '\u0157': 333, '\u0158': 722, '\u0159': 333, '\u015a': 667, '\u015b': 500, '\u015e': 667, '\u015f': 500, '\u0160': 667, '\u0161': 500, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 317, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 500, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 222, '\u2019': 222, '\u201a': 222, '\u201c': 333, '\u201d': 333, '\u201e': 
333, '\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
'Symbol': ({'FontName': 'Symbol', 'FontBBox': (-180.0, -293.0, 1090.0, 1010.0), 'FontWeight': 'Medium', 'FontFamily': 'Symbol', 'Flags': 0, 'ItalicAngle': 0.0}, {' ': 250, '!': 333, '#': 500, '%': 833, '&': 778, '(': 333, ')': 333, '+': 549, ',': 250, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 278, ';': 278, '<': 549, '=': 549, '>': 549, '?': 444, '[': 333, ']': 333, '_': 500, '{': 480, '|': 200, '}': 480, '\xac': 713, '\xb0': 400, '\xb1': 549, '\xb5': 576, '\xd7': 549, '\xf7': 549, '\u0192': 500, '\u0391': 722, '\u0392': 667, '\u0393': 603, '\u0395': 611, '\u0396': 611, '\u0397': 722, '\u0398': 741, '\u0399': 333, '\u039a': 722, '\u039b': 686, '\u039c': 889, '\u039d': 722, '\u039e': 645, '\u039f': 722, '\u03a0': 768, '\u03a1': 556, '\u03a3': 592, '\u03a4': 611, '\u03a5': 690, '\u03a6': 763, '\u03a7': 722, '\u03a8': 795, '\u03b1': 631, '\u03b2': 549, '\u03b3': 411, '\u03b4': 494, '\u03b5': 439, '\u03b6': 494, '\u03b7': 603, '\u03b8': 521, '\u03b9': 329, '\u03ba': 549, '\u03bb': 549, '\u03bd': 521, '\u03be': 493, '\u03bf': 549, '\u03c0': 549, '\u03c1': 549, '\u03c2': 439, '\u03c3': 603, '\u03c4': 439, '\u03c5': 576, '\u03c6': 521, '\u03c7': 549, '\u03c8': 686, '\u03c9': 686, '\u03d1': 631, '\u03d2': 620, '\u03d5': 603, '\u03d6': 713, '\u2022': 460, '\u2026': 1000, '\u2032': 247, '\u2033': 411, '\u2044': 167, '\u20ac': 750, '\u2111': 686, '\u2118': 987, '\u211c': 795, '\u2126': 768, '\u2135': 823, '\u2190': 987, '\u2191': 603, '\u2192': 987, '\u2193': 603, '\u2194': 1042, '\u21b5': 658, '\u21d0': 987, '\u21d1': 603, '\u21d2': 987, '\u21d3': 603, '\u21d4': 1042, '\u2200': 713, '\u2202': 494, '\u2203': 549, '\u2205': 823, '\u2206': 612, '\u2207': 713, '\u2208': 713, '\u2209': 713, '\u220b': 439, '\u220f': 823, '\u2211': 713, '\u2212': 549, '\u2217': 500, '\u221a': 549, '\u221d': 713, '\u221e': 713, '\u2220': 768, '\u2227': 603, '\u2228': 603, '\u2229': 768, '\u222a': 768, '\u222b': 274, 
'\u2234': 863, '\u223c': 549, '\u2245': 549, '\u2248': 549, '\u2260': 549, '\u2261': 549, '\u2264': 549, '\u2265': 549, '\u2282': 713, '\u2283': 713, '\u2284': 713, '\u2286': 713, '\u2287': 713, '\u2295': 768, '\u2297': 768, '\u22a5': 658, '\u22c5': 250, '\u2320': 686, '\u2321': 686, '\u2329': 329, '\u232a': 329, '\u25ca': 494, '\u2660': 753, '\u2663': 753, '\u2665': 753, '\u2666': 753, '\uf6d9': 790, '\uf6da': 790, '\uf6db': 890, '\uf8e5': 500, '\uf8e6': 603, '\uf8e7': 1000, '\uf8e8': 790, '\uf8e9': 790, '\uf8ea': 786, '\uf8eb': 384, '\uf8ec': 384, '\uf8ed': 384, '\uf8ee': 384, '\uf8ef': 384, '\uf8f0': 384, '\uf8f1': 494, '\uf8f2': 494, '\uf8f3': 494, '\uf8f4': 494, '\uf8f5': 686, '\uf8f6': 384, '\uf8f7': 384, '\uf8f8': 384, '\uf8f9': 384, '\uf8fa': 384, '\uf8fb': 384, '\uf8fc': 494, '\uf8fd': 494, '\uf8fe': 494, '\uf8ff': 790}),
'Times-Bold': ({'FontName': 'Times-Bold', 'Descent': -217.0, 'FontBBox': (-168.0, -218.0, 1000.0, 935.0), 'FontWeight': 'Bold', 'CapHeight': 676.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 461.0, 'ItalicAngle': 0.0, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 555, '#': 500, '$': 500, '%': 1000, '&': 833, "'": 278, '(': 333, ')': 333, '*': 500, '+': 570, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 570, '=': 570, '>': 570, '?': 500, '@': 930, 'A': 722, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 778, 'I': 389, 'J': 500, 'K': 778, 'L': 667, 'M': 944, 'N': 722, 'O': 778, 'P': 611, 'Q': 778, 'R': 722, 'S': 556, 'T': 667, 'U': 722, 'V': 722, 'W': 1000, 'X': 722, 'Y': 722, 'Z': 667, '[': 333, '\\': 278, ']': 333, '^': 581, '_': 500, '`': 333, 'a': 500, 'b': 556, 'c': 444, 'd': 556, 'e': 444, 'f': 333, 'g': 500, 'h': 556, 'i': 278, 'j': 333, 'k': 556, 'l': 278, 'm': 833, 'n': 556, 'o': 500, 'p': 556, 'q': 556, 'r': 444, 's': 389, 't': 333, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 444, '{': 394, '|': 220, '}': 394, '~': 520, '\xa1': 333, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 220, '\xa7': 500, '\xa8': 333, '\xa9': 747, '\xaa': 300, '\xab': 500, '\xac': 570, '\xae': 747, '\xaf': 333, '\xb0': 400, '\xb1': 570, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 556, '\xb6': 540, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 330, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 389, '\xcd': 389, '\xce': 389, '\xcf': 389, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 570, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 722, '\xde': 611, '\xdf': 556, 
'\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 722, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 556, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 570, '\xf8': 500, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 722, '\u0101': 500, '\u0102': 722, '\u0103': 500, '\u0104': 722, '\u0105': 500, '\u0106': 722, '\u0107': 444, '\u010c': 722, '\u010d': 444, '\u010e': 722, '\u010f': 672, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 444, '\u0116': 667, '\u0117': 444, '\u0118': 667, '\u0119': 444, '\u011a': 667, '\u011b': 444, '\u011e': 778, '\u011f': 500, '\u0122': 778, '\u0123': 500, '\u012a': 389, '\u012b': 278, '\u012e': 389, '\u012f': 278, '\u0130': 389, '\u0131': 278, '\u0136': 778, '\u0137': 556, '\u0139': 667, '\u013a': 278, '\u013b': 667, '\u013c': 278, '\u013d': 667, '\u013e': 394, '\u0141': 667, '\u0142': 278, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 500, '\u0150': 778, '\u0151': 500, '\u0152': 1000, '\u0153': 722, '\u0154': 722, '\u0155': 444, '\u0156': 722, '\u0157': 444, '\u0158': 722, '\u0159': 444, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 667, '\u0163': 333, '\u0164': 667, '\u0165': 416, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 722, '\u0179': 667, '\u017a': 444, '\u017b': 667, '\u017c': 444, '\u017d': 667, '\u017e': 444, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 500, '\u201d': 500, '\u201e': 500, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 570, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
'Times-BoldItalic': ({'FontName': 'Times-BoldItalic', 'Descent': -217.0, 'FontBBox': (-200.0, -218.0, 996.0, 921.0), 'FontWeight': 'Bold', 'CapHeight': 669.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 462.0, 'ItalicAngle': -15.0, 'Ascent': 683.0}, {' ': 250, '!': 389, '"': 555, '#': 500, '$': 500, '%': 833, '&': 778, "'": 278, '(': 333, ')': 333, '*': 500, '+': 570, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 570, '=': 570, '>': 570, '?': 500, '@': 832, 'A': 667, 'B': 667, 'C': 667, 'D': 722, 'E': 667, 'F': 667, 'G': 722, 'H': 778, 'I': 389, 'J': 500, 'K': 667, 'L': 611, 'M': 889, 'N': 722, 'O': 722, 'P': 611, 'Q': 722, 'R': 667, 'S': 556, 'T': 611, 'U': 722, 'V': 667, 'W': 889, 'X': 667, 'Y': 611, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 570, '_': 500, '`': 333, 'a': 500, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 333, 'g': 500, 'h': 556, 'i': 278, 'j': 278, 'k': 500, 'l': 278, 'm': 778, 'n': 556, 'o': 500, 'p': 500, 'q': 500, 'r': 389, 's': 389, 't': 278, 'u': 556, 'v': 444, 'w': 667, 'x': 500, 'y': 444, 'z': 389, '{': 348, '|': 220, '}': 348, '~': 570, '\xa1': 389, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 220, '\xa7': 500, '\xa8': 333, '\xa9': 747, '\xaa': 266, '\xab': 500, '\xac': 606, '\xae': 747, '\xaf': 333, '\xb0': 400, '\xb1': 570, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 576, '\xb6': 500, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 300, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 944, '\xc7': 667, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 389, '\xcd': 389, '\xce': 389, '\xcf': 389, '\xd0': 722, '\xd1': 722, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 570, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 611, '\xde': 611, 
'\xdf': 500, '\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 722, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 556, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 570, '\xf8': 500, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 444, '\xfe': 500, '\xff': 444, '\u0100': 667, '\u0101': 500, '\u0102': 667, '\u0103': 500, '\u0104': 667, '\u0105': 500, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 608, '\u0110': 722, '\u0111': 500, '\u0112': 667, '\u0113': 444, '\u0116': 667, '\u0117': 444, '\u0118': 667, '\u0119': 444, '\u011a': 667, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 389, '\u012b': 278, '\u012e': 389, '\u012f': 278, '\u0130': 389, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 382, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 944, '\u0153': 722, '\u0154': 667, '\u0155': 389, '\u0156': 667, '\u0157': 389, '\u0158': 667, '\u0159': 389, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 366, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 611, '\u0179': 611, '\u017a': 389, '\u017b': 611, '\u017c': 389, '\u017d': 611, '\u017e': 389, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 500, '\u201d': 500, '\u201e': 500, '\u2020': 
500, '\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 606, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
'Times-Italic': ({'FontName': 'Times-Italic', 'Descent': -217.0, 'FontBBox': (-169.0, -217.0, 1010.0, 883.0), 'FontWeight': 'Medium', 'CapHeight': 653.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 441.0, 'ItalicAngle': -15.5, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 420, '#': 500, '$': 500, '%': 833, '&': 778, "'": 214, '(': 333, ')': 333, '*': 500, '+': 675, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 675, '=': 675, '>': 675, '?': 500, '@': 920, 'A': 611, 'B': 611, 'C': 667, 'D': 722, 'E': 611, 'F': 611, 'G': 722, 'H': 722, 'I': 333, 'J': 444, 'K': 667, 'L': 556, 'M': 833, 'N': 667, 'O': 722, 'P': 611, 'Q': 722, 'R': 611, 'S': 500, 'T': 556, 'U': 722, 'V': 611, 'W': 833, 'X': 611, 'Y': 556, 'Z': 556, '[': 389, '\\': 278, ']': 389, '^': 422, '_': 500, '`': 333, 'a': 500, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 278, 'g': 500, 'h': 500, 'i': 278, 'j': 278, 'k': 444, 'l': 278, 'm': 722, 'n': 500, 'o': 500, 'p': 500, 'q': 500, 'r': 389, 's': 389, 't': 278, 'u': 500, 'v': 444, 'w': 667, 'x': 444, 'y': 444, 'z': 389, '{': 400, '|': 275, '}': 400, '~': 541, '\xa1': 389, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 275, '\xa7': 500, '\xa8': 333, '\xa9': 760, '\xaa': 276, '\xab': 500, '\xac': 675, '\xae': 760, '\xaf': 333, '\xb0': 400, '\xb1': 675, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 500, '\xb6': 523, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 310, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 611, '\xc1': 611, '\xc2': 611, '\xc3': 611, '\xc4': 611, '\xc5': 611, '\xc6': 889, '\xc7': 667, '\xc8': 611, '\xc9': 611, '\xca': 611, '\xcb': 611, '\xcc': 333, '\xcd': 333, '\xce': 333, '\xcf': 333, '\xd0': 722, '\xd1': 667, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 675, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 556, '\xde': 611, '\xdf': 
500, '\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 667, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 500, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 675, '\xf8': 500, '\xf9': 500, '\xfa': 500, '\xfb': 500, '\xfc': 500, '\xfd': 444, '\xfe': 500, '\xff': 444, '\u0100': 611, '\u0101': 500, '\u0102': 611, '\u0103': 500, '\u0104': 611, '\u0105': 500, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 544, '\u0110': 722, '\u0111': 500, '\u0112': 611, '\u0113': 444, '\u0116': 611, '\u0117': 444, '\u0118': 611, '\u0119': 444, '\u011a': 611, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 333, '\u012b': 278, '\u012e': 333, '\u012f': 278, '\u0130': 333, '\u0131': 278, '\u0136': 667, '\u0137': 444, '\u0139': 556, '\u013a': 278, '\u013b': 556, '\u013c': 278, '\u013d': 611, '\u013e': 300, '\u0141': 556, '\u0142': 278, '\u0143': 667, '\u0144': 500, '\u0145': 667, '\u0146': 500, '\u0147': 667, '\u0148': 500, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 944, '\u0153': 667, '\u0154': 611, '\u0155': 389, '\u0156': 611, '\u0157': 389, '\u0158': 611, '\u0159': 389, '\u015a': 500, '\u015b': 389, '\u015e': 500, '\u015f': 389, '\u0160': 500, '\u0161': 389, '\u0162': 556, '\u0163': 278, '\u0164': 556, '\u0165': 300, '\u016a': 722, '\u016b': 500, '\u016e': 722, '\u016f': 500, '\u0170': 722, '\u0171': 500, '\u0172': 722, '\u0173': 500, '\u0178': 556, '\u0179': 556, '\u017a': 389, '\u017b': 556, '\u017c': 389, '\u017d': 556, '\u017e': 389, '\u0192': 500, '\u0218': 500, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 889, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 556, '\u201d': 556, '\u201e': 556, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 889, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 980, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 675, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
'Times-Roman': ({'FontName': 'Times-Roman', 'Descent': -217.0, 'FontBBox': (-168.0, -218.0, 1000.0, 898.0), 'FontWeight': 'Roman', 'CapHeight': 662.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 450.0, 'ItalicAngle': 0.0, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 408, '#': 500, '$': 500, '%': 833, '&': 778, "'": 180, '(': 333, ')': 333, '*': 500, '+': 564, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 278, ';': 278, '<': 564, '=': 564, '>': 564, '?': 444, '@': 921, 'A': 722, 'B': 667, 'C': 667, 'D': 722, 'E': 611, 'F': 556, 'G': 722, 'H': 722, 'I': 333, 'J': 389, 'K': 722, 'L': 611, 'M': 889, 'N': 722, 'O': 722, 'P': 556, 'Q': 722, 'R': 667, 'S': 556, 'T': 611, 'U': 722, 'V': 722, 'W': 944, 'X': 722, 'Y': 722, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 469, '_': 500, '`': 333, 'a': 444, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 333, 'g': 500, 'h': 500, 'i': 278, 'j': 278, 'k': 500, 'l': 278, 'm': 778, 'n': 500, 'o': 500, 'p': 500, 'q': 500, 'r': 333, 's': 389, 't': 278, 'u': 500, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 444, '{': 480, '|': 200, '}': 480, '~': 541, '\xa1': 333, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 200, '\xa7': 500, '\xa8': 333, '\xa9': 760, '\xaa': 276, '\xab': 500, '\xac': 564, '\xae': 760, '\xaf': 333, '\xb0': 400, '\xb1': 564, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 500, '\xb6': 453, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 310, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 444, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 889, '\xc7': 667, '\xc8': 611, '\xc9': 611, '\xca': 611, '\xcb': 611, '\xcc': 333, '\xcd': 333, '\xce': 333, '\xcf': 333, '\xd0': 722, '\xd1': 722, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 564, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 722, '\xde': 556, '\xdf': 500, 
'\xe0': 444, '\xe1': 444, '\xe2': 444, '\xe3': 444, '\xe4': 444, '\xe5': 444, '\xe6': 667, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 500, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 564, '\xf8': 500, '\xf9': 500, '\xfa': 500, '\xfb': 500, '\xfc': 500, '\xfd': 500, '\xfe': 500, '\xff': 500, '\u0100': 722, '\u0101': 444, '\u0102': 722, '\u0103': 444, '\u0104': 722, '\u0105': 444, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 588, '\u0110': 722, '\u0111': 500, '\u0112': 611, '\u0113': 444, '\u0116': 611, '\u0117': 444, '\u0118': 611, '\u0119': 444, '\u011a': 611, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 333, '\u012b': 278, '\u012e': 333, '\u012f': 278, '\u0130': 333, '\u0131': 278, '\u0136': 722, '\u0137': 500, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 344, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 500, '\u0145': 722, '\u0146': 500, '\u0147': 722, '\u0148': 500, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 889, '\u0153': 722, '\u0154': 667, '\u0155': 333, '\u0156': 667, '\u0157': 333, '\u0158': 667, '\u0159': 333, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 326, '\u016a': 722, '\u016b': 500, '\u016e': 722, '\u016f': 500, '\u0170': 722, '\u0171': 500, '\u0172': 722, '\u0173': 500, '\u0178': 722, '\u0179': 611, '\u017a': 444, '\u017b': 611, '\u017c': 444, '\u017d': 611, '\u017e': 444, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 444, '\u201d': 444, '\u201e': 444, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 980, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 564, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
'ZapfDingbats': ({'FontName': 'ZapfDingbats', 'FontBBox': (-1.0, -143.0, 981.0, 820.0), 'FontWeight': 'Medium', 'FontFamily': 'ITC', 'Flags': 0, 'ItalicAngle': 0.0}, {'\x01': 974, '\x02': 961, '\x03': 980, '\x04': 719, '\x05': 789, '\x06': 494, '\x07': 552, '\x08': 537, '\t': 577, '\n': 692, '\x0b': 960, '\x0c': 939, '\r': 549, '\x0e': 855, '\x0f': 911, '\x10': 933, '\x11': 945, '\x12': 974, '\x13': 755, '\x14': 846, '\x15': 762, '\x16': 761, '\x17': 571, '\x18': 677, '\x19': 763, '\x1a': 760, '\x1b': 759, '\x1c': 754, '\x1d': 786, '\x1e': 788, '\x1f': 788, ' ': 790, '!': 793, '"': 794, '#': 816, '$': 823, '%': 789, '&': 841, "'": 823, '(': 833, ')': 816, '*': 831, '+': 923, ',': 744, '-': 723, '.': 749, '/': 790, '0': 792, '1': 695, '2': 776, '3': 768, '4': 792, '5': 759, '6': 707, '7': 708, '8': 682, '9': 701, ':': 826, ';': 815, '<': 789, '=': 789, '>': 707, '?': 687, '@': 696, 'A': 689, 'B': 786, 'C': 787, 'D': 713, 'E': 791, 'F': 785, 'G': 791, 'H': 873, 'I': 761, 'J': 762, 'K': 759, 'L': 892, 'M': 892, 'N': 788, 'O': 784, 'Q': 438, 'R': 138, 'S': 277, 'T': 415, 'U': 509, 'V': 410, 'W': 234, 'X': 234, 'Y': 390, 'Z': 390, '[': 276, '\\': 276, ']': 317, '^': 317, '_': 334, '`': 334, 'a': 392, 'b': 392, 'c': 668, 'd': 668, 'e': 732, 'f': 544, 'g': 544, 'h': 910, 'i': 911, 'j': 667, 'k': 760, 'l': 760, 'm': 626, 'n': 694, 'o': 595, 'p': 776, 'u': 690, 'v': 791, 'w': 790, 'x': 788, 'y': 788, 'z': 788, '{': 788, '|': 788, '}': 788, '~': 788, '\x7f': 788, '\x80': 788, '\x81': 788, '\x82': 788, '\x83': 788, '\x84': 788, '\x85': 788, '\x86': 788, '\x87': 788, '\x88': 788, '\x89': 788, '\x8a': 788, '\x8b': 788, '\x8c': 788, '\x8d': 788, '\x8e': 788, '\x8f': 788, '\x90': 788, '\x91': 788, '\x92': 788, '\x93': 788, '\x94': 788, '\x95': 788, '\x96': 788, '\x97': 788, '\x98': 788, '\x99': 788, '\x9a': 788, '\x9b': 788, '\x9c': 788, '\x9d': 788, '\x9e': 788, '\x9f': 788, '\xa0': 894, '\xa1': 838, '\xa2': 924, '\xa3': 1016, '\xa4': 458, '\xa5': 924, '\xa6': 918, '\xa7': 927, 
'\xa8': 928, '\xa9': 928, '\xaa': 834, '\xab': 873, '\xac': 828, '\xad': 924, '\xae': 917, '\xaf': 930, '\xb0': 931, '\xb1': 463, '\xb2': 883, '\xb3': 836, '\xb4': 867, '\xb5': 696, '\xb6': 874, '\xb7': 760, '\xb8': 946, '\xb9': 865, '\xba': 967, '\xbb': 831, '\xbc': 873, '\xbd': 927, '\xbe': 970, '\xbf': 918, '\xc0': 748, '\xc1': 836, '\xc2': 771, '\xc3': 888, '\xc4': 748, '\xc5': 771, '\xc6': 888, '\xc7': 867, '\xc8': 696, '\xc9': 874, '\xca': 974, '\xcb': 762, '\xcc': 759, '\xcd': 509, '\xce': 410}),
}
File diff suppressed because it is too large Load Diff
-151
View File
@@ -1,151 +0,0 @@
"""Functions that can be used for the most common use-cases for pdfminer.six"""
import logging
import sys
from io import StringIO
from .converter import XMLConverter, HTMLConverter, TextConverter, \
PDFPageAggregator
from .image import ImageWriter
from .layout import LAParams
from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                       laparams=None, maxpages=0, page_numbers=None,
                       password="", scale=1.0, rotation=0, layoutmode='normal',
                       output_dir=None, strip_control=False, debug=False,
                       disable_caching=False, **kwargs):
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
        properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param kwargs: accepted for call compatibility; not used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises ValueError: if output_type is not one of the supported values.
    """
    # Reject unsupported output types before touching logging, the output
    # directory or the PDF; previously this surfaced later as an
    # UnboundLocalError on `device`.
    if output_type not in ('text', 'xml', 'html', 'tag'):
        raise ValueError(
            "Output type can be text, html, xml or tag but is %s"
            % (output_type,))

    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # The text converter is built with the caller's stream as given ...
    if output_type == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)

    # ... while every other converter is handed the binary buffer when the
    # target is the process's standard output.
    if outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=strip_control)
    elif output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif output_type == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(inf,
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=not disable_caching):
        # Apply the extra rotation on top of the page's own rotation.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Return all text found in a PDF as a single string.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as pdf_fp, StringIO() as text_buffer:
        resource_manager = PDFResourceManager(caching=caching)
        converter = TextConverter(resource_manager, text_buffer, codec=codec,
                                  laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        pages = PDFPage.get_pages(
            pdf_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for page in pages:
            page_interpreter.process_page(page)

        # Read the buffer before the `with` block closes it.
        return text_buffer.getvalue()
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Generator yielding the aggregated layout result of each page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams is used.
    :return: yields, per processed page, whatever the PDFPageAggregator
        returns from get_result().
    """
    layout_params = laparams if laparams is not None else LAParams()

    with open_filename(pdf_file, "rb") as pdf_fp:
        rsrcmgr = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(rsrcmgr, aggregator)

        page_iter = PDFPage.get_pages(pdf_fp, page_numbers,
                                      maxpages=maxpages, password=password,
                                      caching=caching)
        for page in page_iter:
            page_interpreter.process_page(page)
            yield aggregator.get_result()
-165
View File
@@ -1,165 +0,0 @@
import os
import os.path
import struct
from io import BytesIO
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
def align32(x):
    """Round ``x`` up to the next multiple of four."""
    groups_of_four = (x + 3) // 4
    return groups_of_four * 4
class BMPWriter:
    """Writes a BMP header (and colour table) to ``fp`` on construction and
    then lets the caller place pixel rows with :meth:`write_line`.
    """

    def __init__(self, fp, bits, width, height):
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height

        # Number of colour-table entries for each supported bit depth.
        for depth, entries in ((1, 2), (8, 256), (24, 0)):
            if bits == depth:
                ncols = entries
                break
        else:
            raise ValueError(bits)

        row_bytes = (self.width * self.bits + 7) // 8
        self.linesize = align32(row_bytes)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4

        info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height,
                           1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
        assert len(info) == 40, str(len(info))
        header = struct.pack('<ccIHHI', b'B', b'M',
                             headersize + self.datasize, 0, 0, headersize)
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)

        if ncols == 2:
            # B&W color table
            levels = (0, 255)
        elif ncols == 256:
            # grayscale color table
            levels = range(256)
        else:
            levels = ()
        for level in levels:
            self.fp.write(struct.pack('BBBx', level, level, level))

        # pos0 is where pixel data starts, pos1 where it ends.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y, data):
        """Write ``data`` as row ``y``; row 0 lands at the end of the pixel
        area and higher rows progressively closer to ``pos0``."""
        row_offset = self.pos1 - (y + 1) * self.linesize
        self.fp.seek(row_offset)
        self.fp.write(data)
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir):
        """Remember ``outdir`` and create it if it does not exist yet."""
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def export_image(self, image):
        """Write ``image`` into the output directory.

        The output format is chosen from the image's stream filters, bit
        depth and colour space; anything unrecognised is dumped as raw data.

        :param image: object exposing ``srcsize``, ``name``, ``bits``,
            ``colorspace`` and ``stream`` (as LTImage does).
        :return: the file name (not the full path) that was written.
        """
        (width, height) = image.srcsize

        is_jbig2 = self.is_jbig2_image(image)
        ext = self._get_image_extension(image, width, height, is_jbig2)
        name, path = self._create_unique_image_name(self.outdir,
                                                    image.name, ext)

        # The context manager closes the file even when one of the writers
        # below raises; previously the handle leaked on any error.
        with open(path, 'wb') as fp:
            if ext == '.jpg':
                raw_data = image.stream.get_rawdata()
                if LITERAL_DEVICE_CMYK in image.colorspace:
                    # Imported lazily so PIL is only needed for CMYK JPEGs.
                    from PIL import Image
                    from PIL import ImageChops
                    ifp = BytesIO(raw_data)
                    i = Image.open(ifp)
                    i = ImageChops.invert(i)
                    i = i.convert('RGB')
                    i.save(fp, 'JPEG')
                else:
                    fp.write(raw_data)
            elif is_jbig2:
                input_stream = BytesIO()
                input_stream.write(image.stream.get_data())
                input_stream.seek(0)
                reader = JBIG2StreamReader(input_stream)
                segments = reader.get_segments()

                writer = JBIG2StreamWriter(fp)
                writer.write_file(segments)
            elif image.bits == 1:
                # One bit per pixel: rows are packed into whole bytes.
                self._write_bitmap(fp, image, 1, width, height,
                                   (width + 7) // 8)
            elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
                # Three 8-bit components per pixel.
                self._write_bitmap(fp, image, 24, width, height, width * 3)
            elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
                self._write_bitmap(fp, image, 8, width, height, width)
            else:
                fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _write_bitmap(fp, image, bmp_bits, width, height, stride):
        """Write the image's decoded data as a BMP, ``stride`` source bytes
        per row."""
        bmp = BMPWriter(fp, bmp_bits, width, height)
        data = image.stream.get_data()
        offset = 0
        for y in range(height):
            bmp.write_line(y, data[offset:offset + stride])
            offset += stride

    @staticmethod
    def is_jbig2_image(image):
        """Return True if any of the stream's filters is a JBIG2 decoder."""
        filters = image.stream.get_filters()
        return any(filter_name in LITERALS_JBIG2_DECODE
                   for filter_name, params in filters)

    @staticmethod
    def _get_image_extension(image, width, height, is_jbig2):
        """Pick the file-name suffix that export_image keys its format on."""
        filters = image.stream.get_filters()
        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
            ext = '.jpg'
        elif is_jbig2:
            ext = '.jb2'
        elif (image.bits == 1 or
              image.bits == 8 and
              (LITERAL_DEVICE_RGB in image.colorspace or
               LITERAL_DEVICE_GRAY in image.colorspace)):
            ext = '.%dx%d.bmp' % (width, height)
        else:
            ext = '.%d.%dx%d.img' % (image.bits, width, height)
        return ext

    @staticmethod
    def _create_unique_image_name(dirname, image_name, ext):
        """Return ``(name, path)`` for a file in ``dirname`` that does not
        exist yet, inserting a numeric counter before ``ext`` if needed."""
        name = image_name + ext
        path = os.path.join(dirname, name)
        img_index = 0
        while os.path.exists(path):
            name = '%s.%d%s' % (image_name, img_index, ext)
            path = os.path.join(dirname, name)
            img_index += 1
        return name, path
-321
View File
@@ -1,321 +0,0 @@
import math
import os
from struct import pack, unpack, calcsize
# segment structure base
# Ordered (struct format, field name) pairs describing a segment header.
# JBIG2StreamReader.get_segments and JBIG2StreamWriter.encode_segment both
# walk this list and dispatch to a parse_<name> / encode_<name> method when
# one exists.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# segment header literals
# Bits of the one-byte "flags" field (see parse_flags / encode_flags).
HEADER_FLAG_DEFERRED = 0b10000000
# When set, the page association is read/extended to four bytes.
HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000

# Low six bits of the "flags" byte hold the segment type.
SEG_TYPE_MASK = 0b00111111

# Top three bits of the first retention byte hold the reference count ...
REF_COUNT_SHORT_MASK = 0b11100000
# ... unless that count equals REF_COUNT_LONG, in which case the count is
# taken from the low 29 bits of a four-byte field instead.
REF_COUNT_LONG_MASK = 0x1fffffff
REF_COUNT_LONG = 7

# data_length value for which parse_data_length raises NotImplementedError on
# immediate generic region segments.
DATA_LEN_UNKNOWN = 0xffffffff

# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50

# file literals
# Bytes JBIG2StreamWriter.write_file emits first, followed by one flags byte
# built from the FILE_HEAD_FLAG_* values below.
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
def bit_set(bit_pos, value):
    """Tell whether bit number ``bit_pos`` (0 = least significant) of
    ``value`` is 1."""
    shifted = value >> bit_pos
    return (shifted & 1) == 1
def check_flag(flag, value):
    """Tell whether ``flag`` and ``value`` have at least one bit in common."""
    common_bits = flag & value
    return True if common_bits else False
def masked_value(mask, value):
    """Extract the field selected by ``mask`` from ``value``.

    The result is ``value & mask`` shifted right so that the mask's lowest
    set bit lands on bit 0.

    :param mask: bit mask with at least one of bits 0..31 set.
    :param value: integer to extract from.
    :return: the extracted field as an integer.
    :raises ValueError: if none of bits 0..31 of ``mask`` is set.
    """
    # All 32 bit positions are examined; the former range(0, 31) stopped one
    # short and rejected a mask whose only set bit is bit 31.
    for bit_pos in range(32):
        if (mask >> bit_pos) & 1:
            return (value & mask) >> bit_pos

    raise ValueError("Invalid mask or value")
def mask_value(mask, value):
    """Place ``value`` into the field selected by ``mask``.

    This is the inverse of ``masked_value``: ``value`` is truncated to the
    mask's width and shifted left to the position of the mask's lowest set
    bit.

    :param mask: bit mask with at least one of bits 0..31 set.
    :param value: integer to store in the field.
    :return: the positioned field as an integer.
    :raises ValueError: if none of bits 0..31 of ``mask`` is set.
    """
    # All 32 bit positions are examined; the former range(0, 31) stopped one
    # short and rejected a mask whose only set bit is bit 31.
    for bit_pos in range(32):
        if (mask >> bit_pos) & 1:
            return (value & (mask >> bit_pos)) << bit_pos

    raise ValueError("Invalid mask or value")
class JBIG2StreamReader:
    """Read segments from a JBIG2 byte stream"""

    def __init__(self, stream):
        """:param stream: seekable binary file-like object to read from."""
        self.stream = stream

    def get_segments(self):
        """Parse the stream into a list of segment dicts.

        Each segment is keyed by the field names in SEG_STRUCT, with values
        post-processed by the matching ``parse_<name>`` method. A segment
        whose header is cut short by the end of the stream is dropped.
        """
        segments = []
        while not self.is_eof():
            segment = {}
            for field_format, name in SEG_STRUCT:
                field_len = calcsize(field_format)
                field = self.stream.read(field_len)
                if len(field) < field_len:
                    # Truncated header: mark and abandon this segment.
                    segment["_error"] = True
                    break
                value = unpack(field_format, field)
                if len(value) == 1:
                    [value] = value
                parser = getattr(self, "parse_%s" % name, None)
                if callable(parser):
                    value = parser(segment, value, field)
                segment[name] = value

            if not segment.get("_error"):
                segments.append(segment)
        return segments

    def is_eof(self):
        """Return True at end of stream; otherwise leave the position
        unchanged and return False."""
        if self.stream.read(1) == b'':
            return True
        else:
            # Undo the one-byte probe.
            self.stream.seek(-1, os.SEEK_CUR)
            return False

    def parse_flags(self, segment, flags, field):
        """Split the flags byte into its ``deferred``, ``page_assoc_long``
        and ``type`` parts."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags)
        }

    def parse_retention_flags(self, segment, flags, field):
        """Decode the reference count, the retain bits and the referred-to
        segment numbers, reading extra bytes from the stream as needed.

        :return: dict with ``ref_count``, ``retain_segments`` (list of
            bools) and ``ref_segments`` (list of ints).
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        retain_segments = []
        ref_segments = []

        if ref_count < REF_COUNT_LONG:
            # Short form: the five low bits of this byte are the retain bits.
            for bit_pos in range(5):
                retain_segments.append(bit_set(bit_pos, flags))
        else:
            # Long form: the count lives in the low bits of a 4-byte field.
            field += self.stream.read(3)
            [ref_count] = unpack(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
            ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
            for ret_byte_index in range(ret_bytes_count):
                [ret_byte] = unpack(">B", self.stream.read(1))
                # Every retain byte carries eight flags (the byte count above
                # is ceil((n + 1) / 8)); reading only seven per byte dropped
                # one flag per byte and misaligned all following flags.
                for bit_pos in range(8):
                    retain_segments.append(bit_set(bit_pos, ret_byte))

        seg_num = segment["number"]
        if seg_num <= 256:
            ref_format = ">B"
        elif seg_num <= 65536:
            # NOTE(review): ">I" is four bytes; the JBIG2 segment header
            # syntax appears to call for two-byte references (">H") in this
            # range. Left unchanged so it stays in step with
            # JBIG2StreamWriter.encode_retention_flags -- confirm and fix both
            # together.
            ref_format = ">I"
        else:
            ref_format = ">L"

        ref_size = calcsize(ref_format)

        for ref_index in range(ref_count):
            ref = self.stream.read(ref_size)
            [ref] = unpack(ref_format, ref)
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment, page, field):
        """Return the page association, re-reading it as four bytes when the
        segment's ``page_assoc_long`` flag is set."""
        if segment["flags"]["page_assoc_long"]:
            field += self.stream.read(3)
            [page] = unpack(">L", field)
        return page

    def parse_data_length(self, segment, length, field):
        """Read the segment payload into ``segment["raw_data"]`` and return
        ``length``.

        :raises NotImplementedError: for an immediate generic region segment
            whose length is DATA_LEN_UNKNOWN.
        """
        if length:
            if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \
                    and (length == DATA_LEN_UNKNOWN):
                raise NotImplementedError(
                    "Working with unknown segment length "
                    "is not implemented yet"
                )
            else:
                segment["raw_data"] = self.stream.read(length)
        else:
            # Zero-length segments still need a payload entry:
            # JBIG2StreamWriter.encode_data_length reads segment["raw_data"].
            segment["raw_data"] = b''

        return length
class JBIG2StreamWriter:
    """Write JBIG2 segments to a file in JBIG2 format"""

    def __init__(self, stream):
        """:param stream: binary file-like object to write to."""
        self.stream = stream

    def write_segments(self, segments, fix_last_page=True):
        """Encode and write every segment.

        :param segments: iterable of segment dicts (as produced by
            JBIG2StreamReader.get_segments).
        :param fix_last_page: when true and the last page seen was never
            closed by an end-of-page segment, append one.
        :return: number of bytes written.
        """
        data_len = 0
        current_page = None
        seg_num = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = segment["number"]

            if fix_last_page:
                seg_page = segment.get("page_assoc")

                if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE:
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            segment = self.get_eop_segment(seg_num + 1, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len

    def write_file(self, segments, fix_last_page=True):
        """Write a complete file: file header, the segments, then an
        end-of-file segment.

        :return: number of bytes written.
        """
        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
        header += pack(">B", header_flags)
        self.stream.write(header)
        data_len = len(header)

        data_len += self.write_segments(segments, fix_last_page)

        # Number the end-of-file segment after the last supplied segment.
        # NOTE(review): when write_segments appended an end-of-page segment
        # it used this same "last + 1" number, so the two synthesized
        # segments share a number -- confirm whether that matters.
        seg_num = 0
        for segment in segments:
            seg_num = segment["number"]

        eof_segment = self.get_eof_segment(seg_num + 1)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment):
        """Serialize one segment dict field by field, following SEG_STRUCT
        and using an ``encode_<name>`` method where one exists."""
        data = b''
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            data += field
        return data

    @staticmethod
    def _page_assoc_is_long(segment, flags=None):
        """Decide whether the page association takes four bytes.

        An explicit ``page_assoc_long`` entry in the segment's flags wins;
        otherwise the long form is used when the page number exceeds 255.
        """
        if flags is None:
            flags = segment.get("flags") or {}
        if "page_assoc_long" in flags:
            return bool(flags["page_assoc_long"])
        # "page_assoc" is the key used by SEG_STRUCT and by the synthesized
        # end-of-page/end-of-file segments; "page" is still honoured because
        # the previous implementation looked only at that key.
        page = segment.get("page_assoc") or segment.get("page") or 0
        return page > 255

    def encode_flags(self, value, segment):
        """Pack the flags dict (``deferred``, ``page_assoc_long``, ``type``)
        into the one-byte flags field."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        if self._page_assoc_is_long(segment, value):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(self, value, segment):
        """Pack the reference count, the retain bits and the referred-to
        segment numbers."""
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        retain_segments = value.get("retain_segments", [])

        if ref_count <= 4:
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            # Only the five low bits are retain bits, and a bit is set only
            # when its flag is true (it used to be set unconditionally).
            for ref_index, ref_retain in enumerate(retain_segments[:5]):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            bytes_count = math.ceil((ref_count + 1) / 8)
            flags_format = ">L" + ("B" * bytes_count)
            # Long-form marker in the top three bits plus the actual count in
            # the low 29 bits; the count used to be left out, so the field
            # always announced zero references.
            flags_dword = mask_value(
                REF_COUNT_SHORT_MASK,
                REF_COUNT_LONG
            ) << 24
            flags_dword |= ref_count & REF_COUNT_LONG_MASK
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = value.get("ref_segments", [])

        seg_num = segment["number"]
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            # NOTE(review): "I" is four bytes; the JBIG2 segment header
            # syntax appears to call for two-byte references ("H") in this
            # range. Left unchanged so it stays in step with
            # JBIG2StreamReader.parse_retention_flags -- confirm and fix both
            # together.
            ref_format = "I"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_page_assoc(self, value, segment):
        """Pack the page association as one byte, or as four bytes when the
        long form is in effect, matching the flag written by encode_flags."""
        page_format = ">L" if self._page_assoc_is_long(segment) else ">B"
        return pack(page_format, value)

    def encode_data_length(self, value, segment):
        """Pack the data length followed by the raw payload (empty when the
        segment carries none)."""
        data = pack(">L", value)
        data += segment.get("raw_data", b'')
        return data

    def get_eop_segment(self, seg_number, page_number):
        """Build an empty end-of-page segment dict."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
            'number': seg_number,
            'page_assoc': page_number,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }

    def get_eof_segment(self, seg_number):
        """Build an empty end-of-file segment dict."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
            'number': seg_number,
            'page_assoc': 0,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }
-242
View File
@@ -1,242 +0,0 @@
""" Standard encoding tables used in PDF.
This table is extracted from PDF Reference Manual 1.6, pp.925
"D.1 Latin Character Set and Encodings"
"""
ENCODING = [
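# Columns: (glyph name, std, mac, win, pdf) -- the glyph's character code in
# the StandardEncoding, MacRomanEncoding, WinAnsiEncoding and PDFDocEncoding
# columns of table D.1; None means the glyph has no code in that encoding.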
('A', 65, 65, 65, 65),
('AE', 225, 174, 198, 198),
('Aacute', None, 231, 193, 193),
('Acircumflex', None, 229, 194, 194),
('Adieresis', None, 128, 196, 196),
('Agrave', None, 203, 192, 192),
('Aring', None, 129, 197, 197),
('Atilde', None, 204, 195, 195),
('B', 66, 66, 66, 66),
('C', 67, 67, 67, 67),
('Ccedilla', None, 130, 199, 199),
('D', 68, 68, 68, 68),
('E', 69, 69, 69, 69),
('Eacute', None, 131, 201, 201),
('Ecircumflex', None, 230, 202, 202),
('Edieresis', None, 232, 203, 203),
('Egrave', None, 233, 200, 200),
('Eth', None, None, 208, 208),
('Euro', None, None, 128, 160),
('F', 70, 70, 70, 70),
('G', 71, 71, 71, 71),
('H', 72, 72, 72, 72),
('I', 73, 73, 73, 73),
('Iacute', None, 234, 205, 205),
('Icircumflex', None, 235, 206, 206),
('Idieresis', None, 236, 207, 207),
('Igrave', None, 237, 204, 204),
('J', 74, 74, 74, 74),
('K', 75, 75, 75, 75),
('L', 76, 76, 76, 76),
('Lslash', 232, None, None, 149),
('M', 77, 77, 77, 77),
('N', 78, 78, 78, 78),
('Ntilde', None, 132, 209, 209),
('O', 79, 79, 79, 79),
('OE', 234, 206, 140, 150),
('Oacute', None, 238, 211, 211),
('Ocircumflex', None, 239, 212, 212),
('Odieresis', None, 133, 214, 214),
('Ograve', None, 241, 210, 210),
('Oslash', 233, 175, 216, 216),
('Otilde', None, 205, 213, 213),
('P', 80, 80, 80, 80),
('Q', 81, 81, 81, 81),
('R', 82, 82, 82, 82),
('S', 83, 83, 83, 83),
('Scaron', None, None, 138, 151),
('T', 84, 84, 84, 84),
('Thorn', None, None, 222, 222),
('U', 85, 85, 85, 85),
('Uacute', None, 242, 218, 218),
('Ucircumflex', None, 243, 219, 219),
('Udieresis', None, 134, 220, 220),
('Ugrave', None, 244, 217, 217),
('V', 86, 86, 86, 86),
('W', 87, 87, 87, 87),
('X', 88, 88, 88, 88),
('Y', 89, 89, 89, 89),
('Yacute', None, None, 221, 221),
('Ydieresis', None, 217, 159, 152),
('Z', 90, 90, 90, 90),
('Zcaron', None, None, 142, 153),
('a', 97, 97, 97, 97),
('aacute', None, 135, 225, 225),
('acircumflex', None, 137, 226, 226),
('acute', 194, 171, 180, 180),
('adieresis', None, 138, 228, 228),
('ae', 241, 190, 230, 230),
('agrave', None, 136, 224, 224),
('ampersand', 38, 38, 38, 38),
('aring', None, 140, 229, 229),
('asciicircum', 94, 94, 94, 94),
('asciitilde', 126, 126, 126, 126),
('asterisk', 42, 42, 42, 42),
('at', 64, 64, 64, 64),
('atilde', None, 139, 227, 227),
('b', 98, 98, 98, 98),
('backslash', 92, 92, 92, 92),
('bar', 124, 124, 124, 124),
('braceleft', 123, 123, 123, 123),
('braceright', 125, 125, 125, 125),
('bracketleft', 91, 91, 91, 91),
('bracketright', 93, 93, 93, 93),
('breve', 198, 249, None, 24),
('brokenbar', None, None, 166, 166),
('bullet', 183, 165, 149, 128),
('c', 99, 99, 99, 99),
('caron', 207, 255, None, 25),
('ccedilla', None, 141, 231, 231),
('cedilla', 203, 252, 184, 184),
('cent', 162, 162, 162, 162),
('circumflex', 195, 246, 136, 26),
('colon', 58, 58, 58, 58),
('comma', 44, 44, 44, 44),
('copyright', None, 169, 169, 169),
('currency', 168, 219, 164, 164),
('d', 100, 100, 100, 100),
('dagger', 178, 160, 134, 129),
('daggerdbl', 179, 224, 135, 130),
('degree', None, 161, 176, 176),
('dieresis', 200, 172, 168, 168),
('divide', None, 214, 247, 247),
('dollar', 36, 36, 36, 36),
('dotaccent', 199, 250, None, 27),
('dotlessi', 245, 245, None, 154),
('e', 101, 101, 101, 101),
('eacute', None, 142, 233, 233),
('ecircumflex', None, 144, 234, 234),
('edieresis', None, 145, 235, 235),
('egrave', None, 143, 232, 232),
('eight', 56, 56, 56, 56),
('ellipsis', 188, 201, 133, 131),
('emdash', 208, 209, 151, 132),
('endash', 177, 208, 150, 133),
('equal', 61, 61, 61, 61),
('eth', None, None, 240, 240),
('exclam', 33, 33, 33, 33),
('exclamdown', 161, 193, 161, 161),
('f', 102, 102, 102, 102),
('fi', 174, 222, None, 147),
('five', 53, 53, 53, 53),
('fl', 175, 223, None, 148),
('florin', 166, 196, 131, 134),
('four', 52, 52, 52, 52),
('fraction', 164, 218, None, 135),
('g', 103, 103, 103, 103),
('germandbls', 251, 167, 223, 223),
('grave', 193, 96, 96, 96),
('greater', 62, 62, 62, 62),
('guillemotleft', 171, 199, 171, 171),
('guillemotright', 187, 200, 187, 187),
('guilsinglleft', 172, 220, 139, 136),
('guilsinglright', 173, 221, 155, 137),
('h', 104, 104, 104, 104),
('hungarumlaut', 205, 253, None, 28),
('hyphen', 45, 45, 45, 45),
('i', 105, 105, 105, 105),
('iacute', None, 146, 237, 237),
('icircumflex', None, 148, 238, 238),
('idieresis', None, 149, 239, 239),
('igrave', None, 147, 236, 236),
('j', 106, 106, 106, 106),
('k', 107, 107, 107, 107),
('l', 108, 108, 108, 108),
('less', 60, 60, 60, 60),
('logicalnot', None, 194, 172, 172),
('lslash', 248, None, None, 155),
('m', 109, 109, 109, 109),
('macron', 197, 248, 175, 175),
('minus', None, None, None, 138),
('mu', None, 181, 181, 181),
('multiply', None, None, 215, 215),
('n', 110, 110, 110, 110),
('nbspace', None, 202, 160, None),
('nine', 57, 57, 57, 57),
('ntilde', None, 150, 241, 241),
('numbersign', 35, 35, 35, 35),
('o', 111, 111, 111, 111),
('oacute', None, 151, 243, 243),
('ocircumflex', None, 153, 244, 244),
('odieresis', None, 154, 246, 246),
('oe', 250, 207, 156, 156),
('ogonek', 206, 254, None, 29),
('ograve', None, 152, 242, 242),
('one', 49, 49, 49, 49),
('onehalf', None, None, 189, 189),
('onequarter', None, None, 188, 188),
('onesuperior', None, None, 185, 185),
('ordfeminine', 227, 187, 170, 170),
('ordmasculine', 235, 188, 186, 186),
('oslash', 249, 191, 248, 248),
('otilde', None, 155, 245, 245),
('p', 112, 112, 112, 112),
('paragraph', 182, 166, 182, 182),
('parenleft', 40, 40, 40, 40),
('parenright', 41, 41, 41, 41),
('percent', 37, 37, 37, 37),
('period', 46, 46, 46, 46),
('periodcentered', 180, 225, 183, 183),
('perthousand', 189, 228, 137, 139),
('plus', 43, 43, 43, 43),
('plusminus', None, 177, 177, 177),
('q', 113, 113, 113, 113),
('question', 63, 63, 63, 63),
('questiondown', 191, 192, 191, 191),
('quotedbl', 34, 34, 34, 34),
('quotedblbase', 185, 227, 132, 140),
('quotedblleft', 170, 210, 147, 141),
('quotedblright', 186, 211, 148, 142),
('quoteleft', 96, 212, 145, 143),
('quoteright', 39, 213, 146, 144),
('quotesinglbase', 184, 226, 130, 145),
('quotesingle', 169, 39, 39, 39),
('r', 114, 114, 114, 114),
('registered', None, 168, 174, 174),
('ring', 202, 251, None, 30),
('s', 115, 115, 115, 115),
('scaron', None, None, 154, 157),
('section', 167, 164, 167, 167),
('semicolon', 59, 59, 59, 59),
('seven', 55, 55, 55, 55),
('six', 54, 54, 54, 54),
('slash', 47, 47, 47, 47),
('space', 32, 32, 32, 32),
('space', None, 202, 160, None),
('space', None, 202, 173, None),
('sterling', 163, 163, 163, 163),
('t', 116, 116, 116, 116),
('thorn', None, None, 254, 254),
('three', 51, 51, 51, 51),
('threequarters', None, None, 190, 190),
('threesuperior', None, None, 179, 179),
('tilde', 196, 247, 152, 31),
('trademark', None, 170, 153, 146),
('two', 50, 50, 50, 50),
('twosuperior', None, None, 178, 178),
('u', 117, 117, 117, 117),
('uacute', None, 156, 250, 250),
('ucircumflex', None, 158, 251, 251),
('udieresis', None, 159, 252, 252),
('ugrave', None, 157, 249, 249),
('underscore', 95, 95, 95, 95),
('v', 118, 118, 118, 118),
('w', 119, 119, 119, 119),
('x', 120, 120, 120, 120),
('y', 121, 121, 121, 121),
('yacute', None, None, 253, 253),
('ydieresis', None, 216, 255, 255),
('yen', 165, 180, 165, 165),
('z', 122, 122, 122, 122),
('zcaron', None, None, 158, 158),
('zero', 48, 48, 48, 48),
]
-866
View File
@@ -1,866 +0,0 @@
import heapq
import logging
from .utils import INF
from .utils import Plane
from .utils import apply_matrix_pt
from .utils import bbox2str
from .utils import fsplit
from .utils import get_bound
from .utils import matrix2str
from .utils import uniq
logger = logging.getLogger(__name__)
class IndexAssigner:
    """Hands out consecutive ``index`` values to text boxes, descending into
    text groups."""

    def __init__(self, index=0):
        self.index = index

    def run(self, obj):
        """Number ``obj`` if it is an LTTextBox; recurse into it if it is an
        LTTextGroup; ignore anything else."""
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)
class LAParams:
    """Tunable parameters for layout analysis.

    :param line_overlap: Characters overlapping by more than this are treated
        as being on the same line. Relative to the smaller of the two
        character heights.
    :param char_margin: Characters closer together than this are treated as
        part of the same line. Relative to the character width.
    :param word_margin: Characters on one line that are further apart than
        this are treated as separate words, and a space is inserted between
        them. Relative to the character width.
    :param line_margin: Lines that are close together are treated as part of
        the same paragraph. Relative to the line height.
    :param boxes_flow: Weighting of horizontal versus vertical position when
        ordering text boxes, from -1.0 (only horizontal position matters) to
        +1.0 (only vertical position matters). `None` disables advanced
        layout analysis; text is then returned based on the bottom-left
        corner of each text box.
    :param detect_vertical: Whether vertical text should be considered
        during layout analysis.
    :param all_texts: Whether layout analysis should also be performed on
        text inside figures.
    """

    def __init__(self,
                 line_overlap=0.5,
                 char_margin=2.0,
                 line_margin=0.5,
                 word_margin=0.1,
                 boxes_flow=0.5,
                 detect_vertical=False,
                 all_texts=False):
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self):
        """Check ``boxes_flow``: None is accepted, otherwise it must be an
        int or float within [-1, 1]."""
        flow = self.boxes_flow
        if flow is None:
            return
        boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
                              "number between -1 and +1")
        if not isinstance(flow, (int, float)):
            raise TypeError(boxes_flow_err_msg)
        if not (-1 <= flow <= 1):
            raise ValueError(boxes_flow_err_msg)

    def __repr__(self):
        shown = (self.char_margin, self.line_margin, self.word_margin,
                 self.all_texts)
        template = ('<LAParams: char_margin=%.1f, line_margin=%.1f, '
                    'word_margin=%.1f all_texts=%r>')
        return template % shown
class LTItem:
    """Base interface for objects that take part in layout analysis."""

    def analyze(self, laparams):
        """Run layout analysis on this item; the base version does nothing."""
        return None
class LTText:
    """Base interface for objects that carry text."""

    def __repr__(self):
        cls_name = self.__class__.__name__
        text = self.get_text()
        return '<%s %r>' % (cls_name, text)

    def get_text(self):
        """Return the text held by this object; subclasses must override."""
        raise NotImplementedError
class LTComponent(LTItem):
    """Layout item that has a bounding box ``(x0, y0, x1, y1)``."""

    def __init__(self, bbox):
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '<%s %s>' % (cls_name, bbox2str(self.bbox))

    # Ordering comparisons between components always raise.
    def __lt__(self, _):
        raise ValueError

    def __le__(self, _):
        raise ValueError

    def __gt__(self, _):
        raise ValueError

    def __ge__(self, _):
        raise ValueError

    def set_bbox(self, bbox):
        """Store ``bbox`` and the coordinates, width and height derived from
        it."""
        self.x0, self.y0, self.x1, self.y1 = bbox
        self.width = self.x1 - self.x0
        self.height = self.y1 - self.y0
        self.bbox = bbox

    def is_empty(self):
        """True when the box has no positive width or no positive height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj):
        """True when the x-ranges of the two boxes touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj):
        """Horizontal gap between the two boxes; 0 when they overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def hoverlap(self, obj):
        """Horizontal overlap measure of the two boxes; 0 when they do not
        overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def is_voverlap(self, obj):
        """True when the y-ranges of the two boxes touch or overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj):
        """Vertical gap between the two boxes; 0 when they overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))

    def voverlap(self, obj):
        """Vertical overlap measure of the two boxes; 0 when they do not
        overlap."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
class LTCurve(LTComponent):
    """A generic Bezier curve, bounded by the extent of its points."""

    def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False,
                 stroking_color=None, non_stroking_color=None):
        bounds = get_bound(pts)
        LTComponent.__init__(self, bounds)
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color

    def get_pts(self):
        """Return the points as one comma-separated string of ``x,y`` pairs
        with three decimals each."""
        formatted = ['%.3f,%.3f' % p for p in self.pts]
        return ','.join(formatted)
class LTLine(LTCurve):
    """A single straight line between two points.

    Could be used for separating text or figures.
    """

    def __init__(self, linewidth, p0, p1, stroke=False, fill=False,
                 evenodd=False, stroking_color=None, non_stroking_color=None):
        endpoints = [p0, p1]
        LTCurve.__init__(self, linewidth, endpoints, stroke, fill, evenodd,
                         stroking_color, non_stroking_color)
class LTRect(LTCurve):
    """A rectangle, stored as a curve through its four corners.

    Could be used for framing another pictures or figures.
    """

    def __init__(self, linewidth, bbox, stroke=False, fill=False,
                 evenodd=False, stroking_color=None, non_stroking_color=None):
        (x0, y0, x1, y1) = bbox
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(self, linewidth, corners, stroke,
                         fill, evenodd, stroking_color, non_stroking_color)
class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.
    """

    def __init__(self, name, stream, bbox):
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream

        # Each property is looked up under its short or long key.
        src_width = stream.get_any(('W', 'Width'))
        src_height = stream.get_any(('H', 'Height'))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(('IM', 'ImageMask'))
        self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)

        colorspace = stream.get_any(('CS', 'ColorSpace'))
        # Always expose the colour space as a list.
        self.colorspace = (colorspace if isinstance(colorspace, list)
                           else [colorspace])

    def __repr__(self):
        details = (self.__class__.__name__, self.name,
                   bbox2str(self.bbox), self.srcsize)
        return '<%s(%s) %s %r>' % details
class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Unlike LTChar, an LTAnno has no boundaries of its own: it is a "virtual"
    character inserted by a layout analyzer according to the relationship
    between two characters (e.g. a space).
    """

    def __init__(self, text):
        self._text = text

    def get_text(self):
        """Return the text this annotation was created with."""
        return self._text
class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string."""

    def __init__(self, matrix, font, fontsize, scaling, rise,
                 text, textwidth, textdisp, ncs, graphicstate):
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Two opposite corners of the glyph box, before the matrix is
        # applied.
        if font.is_vertical():
            # vertical
            (vx, vy) = textdisp
            vx = fontsize * 0.5 if vx is None else vx * fontsize * .001
            vy = (1000 - vy) * fontsize * .001
            corner_a = (-vx, vy + rise + self.adv)
            corner_b = (-vx + fontsize, vy + rise)
        else:
            # horizontal
            descent = font.get_descent() * fontsize
            corner_a = (0, descent + rise)
            corner_b = (self.adv, descent + rise + fontsize)

        (a, b, c, d, e, f) = self.matrix
        self.upright = (0 < a*d*scaling and b*c <= 0)

        (x0, y0) = apply_matrix_pt(self.matrix, corner_a)
        (x1, y1) = apply_matrix_pt(self.matrix, corner_b)
        # Order the coordinates so that x0 <= x1 and y0 <= y1.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # Size is the box extent across the writing direction.
        if font.is_vertical():
            self.size = self.width
        else:
            self.size = self.height

    def __repr__(self):
        details = (self.__class__.__name__, bbox2str(self.bbox),
                   matrix2str(self.matrix), self.fontname, self.adv,
                   self.get_text())
        return '<%s %s matrix=%s font=%r adv=%s text=%r>' % details

    def get_text(self):
        """Return the character's text."""
        return self._text

    def is_compatible(self, obj):
        """Returns True if two characters can coexist in the same line."""
        return True
class LTContainer(LTComponent):
    """A component that holds an ordered list of child objects.

    Children can be appended one at a time or in bulk, iterated over,
    counted, and analyzed recursively.
    """

    def __init__(self, bbox):
        LTComponent.__init__(self, bbox)
        self._objs = []

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def add(self, obj):
        """Append a single child."""
        self._objs.append(obj)

    def extend(self, objs):
        """Append every item of `objs`, going through add() for each one."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams):
        """Run analyze(laparams) on each child in order."""
        for child in self._objs:
            child.analyze(laparams)
class LTExpandableContainer(LTContainer):
    """A container whose bounding box grows to enclose every added child."""

    def __init__(self):
        # Start from an "inverted" infinite box so the first child's
        # coordinates win every min()/max() below.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    def add(self, obj):
        """Append `obj` and widen the bounding box to cover it."""
        LTContainer.add(self, obj)
        grown = (min(self.x0, obj.x0), min(self.y0, obj.y0),
                 max(self.x1, obj.x1), max(self.y1, obj.y1))
        self.set_bbox(grown)
class LTTextContainer(LTExpandableContainer, LTText):
    """An expandable container whose text is the concatenation of its children."""

    def __init__(self):
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self):
        """Join the text of every child that is an LTText, in stored order."""
        pieces = []
        for child in self:
            if isinstance(child, LTText):
                pieces.append(child.get_text())
        return ''.join(pieces)
class LTTextLine(LTTextContainer):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending
    on the text's writing mode.
    """

    def __init__(self, word_margin):
        LTTextContainer.__init__(self)
        self.word_margin = word_margin

    def __repr__(self):
        fields = (self.__class__.__name__, bbox2str(self.bbox),
                  self.get_text())
        return '<%s %s %r>' % fields

    def analyze(self, laparams):
        """Analyze the children, then terminate the line with a newline."""
        LTTextContainer.analyze(self, laparams)
        # Go through LTContainer.add directly: LTAnno has no coordinates,
        # so the bbox-expanding add() cannot be used for it.
        LTContainer.add(self, LTAnno('\n'))

    def find_neighbors(self, plane, ratio):
        """Subclasses must return the nearby lines found in `plane`."""
        raise NotImplementedError
class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters are laid out left to right."""

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object; +INF means "none yet".
        self._x1 = +INF

    def add(self, obj):
        """Append `obj`, inserting a space first if the gap is wide enough."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_limit:
                LTContainer.add(self, LTAnno(' '))
        self._x1 = obj.x1
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """
        Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same height as self, and also either left-,
        right-, or centrally-aligned.
        """
        tol = ratio * self.height
        search_box = (self.x0, self.y0 - tol, self.x1, self.y1 + tol)
        found = []
        for cand in plane.find(search_box):
            if not isinstance(cand, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(cand, tolerance=tol):
                continue
            if (self._is_left_aligned_with(cand, tolerance=tol)
                    or self._is_right_aligned_with(cand, tolerance=tol)
                    or self._is_centrally_aligned_with(cand, tolerance=tol)):
                found.append(cand)
        return found

    def _is_left_aligned_with(self, other, tolerance=0):
        """
        Whether the left-hand edge of `other` is within `tolerance`.
        """
        delta = other.x0 - self.x0
        return abs(delta) <= tolerance

    def _is_right_aligned_with(self, other, tolerance=0):
        """
        Whether the right-hand edge of `other` is within `tolerance`.
        """
        delta = other.x1 - self.x1
        return abs(delta) <= tolerance

    def _is_centrally_aligned_with(self, other, tolerance=0):
        """
        Whether the horizontal center of `other` is within `tolerance`.
        """
        other_mid = (other.x0 + other.x1) / 2
        self_mid = (self.x0 + self.x1) / 2
        return abs(other_mid - self_mid) <= tolerance

    def _is_same_height_as(self, other, tolerance):
        """Whether the heights of `other` and self differ by at most `tolerance`."""
        delta = other.height - self.height
        return abs(delta) <= tolerance
class LTTextLineVertical(LTTextLine):
    """A text line whose characters are laid out top to bottom."""

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        # Lower edge of the most recently added object; -INF means "none yet".
        self._y0 = -INF

    def add(self, obj):
        """Append `obj`, inserting a space first if the gap is wide enough."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + gap_limit < self._y0:
                LTContainer.add(self, LTAnno(' '))
        self._y0 = obj.y0
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """
        Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same width as self, and also either upper-,
        lower-, or centrally-aligned.
        """
        tol = ratio * self.width
        search_box = (self.x0 - tol, self.y0, self.x1 + tol, self.y1)
        found = []
        for cand in plane.find(search_box):
            if not isinstance(cand, LTTextLineVertical):
                continue
            if not self._is_same_width_as(cand, tolerance=tol):
                continue
            if (self._is_lower_aligned_with(cand, tolerance=tol)
                    or self._is_upper_aligned_with(cand, tolerance=tol)
                    or self._is_centrally_aligned_with(cand, tolerance=tol)):
                found.append(cand)
        return found

    def _is_lower_aligned_with(self, other, tolerance=0):
        """
        Whether the lower edge of `other` is within `tolerance`.
        """
        delta = other.y0 - self.y0
        return abs(delta) <= tolerance

    def _is_upper_aligned_with(self, other, tolerance=0):
        """
        Whether the upper edge of `other` is within `tolerance`.
        """
        delta = other.y1 - self.y1
        return abs(delta) <= tolerance

    def _is_centrally_aligned_with(self, other, tolerance=0):
        """
        Whether the vertical center of `other` is within `tolerance`.
        """
        other_mid = (other.y0 + other.y1) / 2
        self_mid = (self.y0 + self.y1) / 2
        return abs(other_mid - self_mid) <= tolerance

    def _is_same_width_as(self, other, tolerance):
        """Whether the widths of `other` and self differ by at most `tolerance`."""
        delta = other.width - self.width
        return abs(delta) <= tolerance
class LTTextBox(LTTextContainer):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represent a logical boundary of the text. It contains a
    list of LTTextLine objects.
    """

    def __init__(self):
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside this class.
        self.index = -1

    def __repr__(self):
        fields = (self.__class__.__name__, self.index,
                  bbox2str(self.bbox), self.get_text())
        return '<%s(%s) %s %r>' % fields
class LTTextBoxHorizontal(LTTextBox):
    """A text box made of horizontal lines."""

    def analyze(self, laparams):
        """Analyze the lines, then order them by descending top edge (y1)."""
        LTTextBox.analyze(self, laparams)

        def top_first(line):
            return -line.y1

        self._objs.sort(key=top_first)

    def get_writing_mode(self):
        """Return the writing-mode tag for this box: 'lr-tb'."""
        return 'lr-tb'
class LTTextBoxVertical(LTTextBox):
    """A text box made of vertical lines."""

    def analyze(self, laparams):
        """Analyze the lines, then order them by descending right edge (x1)."""
        LTTextBox.analyze(self, laparams)

        def right_first(line):
            return -line.x1

        self._objs.sort(key=right_first)

    def get_writing_mode(self):
        """Return the writing-mode tag for this box: 'tb-rl'."""
        return 'tb-rl'
class LTTextGroup(LTTextContainer):
    """A text container created from an initial collection of objects."""

    def __init__(self, objs):
        LTTextContainer.__init__(self)
        # extend() adds each object through add(), which grows the bbox.
        self.extend(objs)
class LTTextGroupLRTB(LTTextGroup):
    """A text group whose members are ordered left-to-right, top-to-bottom."""

    def analyze(self, laparams):
        """Analyze the members, then reorder them from top-left to bottom-right.

        laparams.boxes_flow weights the horizontal position (x0) against the
        vertical position (y0 + y1) in the sort key.
        """
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            return ((1 - laparams.boxes_flow) * member.x0
                    - (1 + laparams.boxes_flow) * (member.y0 + member.y1))

        self._objs.sort(key=reading_order)
class LTTextGroupTBRL(LTTextGroup):
    """A text group whose members are ordered top-to-bottom, right-to-left."""

    def analyze(self, laparams):
        """Analyze the members, then reorder them from top-right to bottom-left.

        laparams.boxes_flow weights the horizontal position (x0 + x1) against
        the vertical position (y1) in the sort key.
        """
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            return (- (1 + laparams.boxes_flow) * (member.x0 + member.x1)
                    - (1 - laparams.boxes_flow) * member.y1)

        self._objs.sort(key=reading_order)
class LTLayoutContainer(LTContainer):
    """A container that can run layout analysis over its characters.

    analyze() groups LTChar children into text lines, lines into text boxes
    and, unless laparams.boxes_flow is None, boxes into a hierarchy of text
    groups stored in self.groups.
    """

    def __init__(self, bbox):
        LTContainer.__init__(self, bbox)
        # Set by analyze() when boxes_flow is not None.
        self.groups = None
        return

    # group_objects: group text object to textlines.
    def group_objects(self, laparams, objs):
        """Yield text lines built from consecutive, aligned objects.

        Each object is compared only with its immediate predecessor in
        `objs`. `objs` must not be empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = \
                    obj0.is_compatible(obj1) \
                    and obj0.is_voverlap(obj1) \
                    and min(obj0.height, obj1.height) * laparams.line_overlap \
                    < obj0.voverlap(obj1) \
                    and obj0.hdistance(obj1) \
                    < max(obj0.width, obj1.width) * laparams.char_margin

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = \
                    laparams.detect_vertical \
                    and obj0.is_compatible(obj1) \
                    and obj0.is_hoverlap(obj1) \
                    and min(obj0.width, obj1.width) * laparams.line_overlap \
                    < obj0.hoverlap(obj1) \
                    and obj0.vdistance(obj1) \
                    < max(obj0.height, obj1.height) * laparams.char_margin

                if ((halign and isinstance(line, LTTextLineHorizontal)) or
                        (valign and isinstance(line, LTTextLineVertical))):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # obj1 breaks the current line; emit it.
                    yield line
                    line = None
                else:
                    # No open line: start one from the obj0/obj1 pair, or
                    # emit obj0 alone when the pair is not clearly aligned.
                    if valign and not halign:
                        line = LTTextLineVertical(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    elif halign and not valign:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    else:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        yield line
                        line = None
            obj0 = obj1
        if line is None:
            # The last object was not absorbed into any line.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line
        return

    def group_textlines(self, laparams, lines):
        """Group neighboring lines to textboxes"""
        plane = Plane(self.bbox)
        plane.extend(lines)
        # Maps every line to the box that currently contains it.
        boxes = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Merge the neighbor's existing box into the new one.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        # Emit each distinct box once, in the order of its first line.
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box
        return

    def group_textboxes(self, laparams, boxes):
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to
        appear before obj in element tuples.

        A pair with another object lying inside its joint bounding box is
        not merged on first sight: it is pushed back with skip_isany=True,
        which sorts after every entry whose flag is False, and is merged
        without the check when it is popped again.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: a list of the objects left in the plane once no pair
            remains to be merged.
        """

        def dist(obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (x1 - x0) * (y1 - y0) \
                - obj1.width*obj1.height - obj2.width*obj2.height

        def isany(obj1, obj2):
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists = []
        for i in range(len(boxes)):
            obj1 = boxes[i]
            for j in range(i+1, len(boxes)):
                obj2 = boxes[j]
                dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
                              obj1, obj2))
        heapq.heapify(dists)

        plane = Plane(self.bbox)
        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                # Every entry starts with skip_isany=False, so the check has
                # to run when the flag is NOT set. Testing the flag itself
                # (as the code used to) made this branch unreachable.
                if not skip_isany and isany(obj1, obj2):
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
                        isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
                    group = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(dists, (False, dist(group, other),
                                           id(group), id(other), group, other))
                plane.add(group)
        return list(plane)

    def analyze(self, laparams):
        """Run layout analysis and replace the children with the result.

        Afterwards self._objs holds the text boxes, followed by the
        non-character children, followed by the empty text lines. Nothing is
        regrouped when there are no LTChar children.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
                                       self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # No hierarchical grouping: order the boxes by position only.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box):
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)
            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = textboxes + otherobjs + empties
        return
class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can
    appear recursively.
    """

    def __init__(self, name, bbox, matrix):
        self.name = name
        self.matrix = matrix
        # bbox arrives as (x, y, width, height); transform its four corners
        # and take their bounding rectangle as the figure's bbox.
        (x, y, w, h) = bbox
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        bbox = get_bound(apply_matrix_pt(matrix, (p, q))
                         for (p, q) in corners)
        LTLayoutContainer.__init__(self, bbox)

    def __repr__(self):
        fields = (self.__class__.__name__, self.name,
                  bbox2str(self.bbox), matrix2str(self.matrix))
        return '<%s(%s) %s matrix=%s>' % fields

    def analyze(self, laparams):
        """Run layout analysis only when laparams.all_texts is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
class LTPage(LTLayoutContainer):
    """Represents an entire page.

    May contain child objects like LTTextBox, LTFigure, LTImage, LTRect,
    LTCurve and LTLine.
    """

    def __init__(self, pageid, bbox, rotate=0):
        LTLayoutContainer.__init__(self, bbox)
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self):
        fields = (self.__class__.__name__, self.pageid,
                  bbox2str(self.bbox), self.rotate)
        return '<%s(%r) %s rotate=%r>' % fields
-99
View File
@@ -1,99 +0,0 @@
from io import BytesIO
import logging

logger = logging.getLogger(__name__)


class CorruptDataError(Exception):
    """Raised by LZWDecoder.feed() for a code that is invalid at its position."""


class LZWDecoder:
    """Incremental LZW decoder reading variable-width codes from a file object.

    Codes start at 9 bits and widen to 10, 11 and 12 bits as the string
    table grows. Code 256 resets the table; code 257 produces no output.
    """

    def __init__(self, fp):
        """
        :param fp: binary file-like object; only read(1) is used.
        """
        self.fp = fp
        self.buff = 0       # the byte currently being consumed
        self.bpos = 8       # bits of `buff` already used (8 = need a new byte)
        self.nbits = 9      # current code width
        self.table = None   # string table; None until the first code 256
        self.prevbuf = None  # output of the previous code
        return

    def readbits(self, bits):
        """Read `bits` bits, most significant first, and return them as an int.

        :raises EOFError: when the input runs out before `bits` bits are read.
        """
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8-self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r)-1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Process one code and return the bytes it decodes to.

        :raises CorruptDataError: when `code` cannot be valid here: a data
            code before any table reset (code 256), or a code that refers
            past the end of the table.
        """
        x = b''
        if code == 256:
            # Reset: single-byte entries plus two placeholders for the
            # control codes 256 and 257.
            self.table = [bytes((c,)) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b''
            self.nbits = 9
        elif code == 257:
            pass
        elif self.table is None:
            # A data code arrived before the table was ever initialised.
            # Indexing None used to raise TypeError, which run() does not
            # handle; report it as corrupt data instead.
            raise CorruptDataError
        elif not self.prevbuf:
            # First data code after a reset: it must be an existing entry.
            if code >= len(self.table):
                raise CorruptDataError
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf+x[:1])
            elif code == len(self.table):
                # The code refers to the entry being defined right now.
                self.table.append(self.prevbuf+self.prevbuf[:1])
                x = self.table[code]
            else:
                raise CorruptDataError
            # Widen the code size when the table reaches these lengths.
            table_length = len(self.table)
            if table_length == 511:
                self.nbits = 10
            elif table_length == 1023:
                self.nbits = 11
            elif table_length == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Yield decoded chunks until the input ends or turns out corrupt."""
        # Checked once so that the (potentially large) table is only
        # sliced and formatted when debug output is actually wanted.
        debug = logger.isEnabledFor(logging.DEBUG)
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x
            if debug:
                # The table is still None if no code 256 has been seen.
                logger.debug('nbits=%d, code=%d, output=%r, table=%r',
                             self.nbits, code, x, (self.table or [])[258:])
        return


def lzwdecode(data):
    """Decode LZW-compressed bytes and return the decoded bytes.

    Decoding stops quietly at the end of the input or at the first corrupt
    code; whatever was decoded up to that point is returned.
    """
    fp = BytesIO(data)
    s = LZWDecoder(fp).run()
    return b''.join(s)
-35
View File
@@ -1,35 +0,0 @@
import collections
from .psparser import LIT
# Literal objects (created with LIT) for the three device colour space names.
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
class PDFColorSpace:
    """A colour space described by its name and number of components."""

    def __init__(self, name, ncomponents):
        self.name = name
        self.ncomponents = ncomponents

    def __repr__(self):
        fields = (self.name, self.ncomponents)
        return '<PDFColorSpace: %s, ncomponents=%d>' % fields
# Colour spaces known by name, mapped to PDFColorSpace objects. Insertion
# order is preserved, so 'DeviceGray' is the first entry.
PREDEFINED_COLORSPACE = collections.OrderedDict()
for (name, n) in (
        ('DeviceGray', 1),  # default value first
        ('CalRGB', 3),
        ('CalGray', 1),
        ('Lab', 3),
        ('DeviceRGB', 3),
        ('DeviceCMYK', 4),
        ('Separation', 1),
        ('Indexed', 1),
        ('Pattern', 1),
):
    PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
-193
View File
@@ -1,193 +0,0 @@
from . import utils
from .pdffont import PDFUnicodeNotDefined
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    This base class stores the resource manager and the current
    transformation matrix; every rendering hook is a no-op meant to be
    overridden. It can be used as a context manager, in which case close()
    is called on exit.
    """

    def __init__(self, rsrcmgr):
        self.rsrcmgr = rsrcmgr
        self.ctm = None

    def __repr__(self):
        return '<PDFDevice>'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Hook for releasing resources; does nothing here."""
        return None

    def set_ctm(self, ctm):
        """Remember `ctm` as the current transformation matrix."""
        self.ctm = ctm

    # The hooks below intentionally do nothing in the base class.

    def begin_tag(self, tag, props=None):
        return None

    def end_tag(self):
        return None

    def do_tag(self, tag, props=None):
        return None

    def begin_page(self, page, ctm):
        return None

    def end_page(self, page):
        return None

    def begin_figure(self, name, bbox, matrix):
        return None

    def end_figure(self, name):
        return None

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        return None

    def render_image(self, name, stream):
        return None

    def render_string(self, textstate, seq, ncs, graphicstate):
        return None
class PDFTextDevice(PDFDevice):
    """A device that walks a text sequence and hands each glyph to render_char()."""

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Render `seq` with the given text state and update its line matrix."""
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            wordspace = 0
        dxscale = .001 * fontsize * scaling
        # Pick the walker matching the font's writing direction.
        if font.is_vertical():
            walker = self.render_string_vertical
        else:
            walker = self.render_string_horizontal
        textstate.linematrix = walker(
            seq, matrix, textstate.linematrix, font, fontsize,
            scaling, charspace, wordspace, rise, dxscale, ncs,
            graphicstate)

    def render_string_horizontal(self, seq, matrix, pos,
                                 font, fontsize, scaling, charspace, wordspace,
                                 rise, dxscale, ncs, graphicstate):
        """Advance along x through `seq`; return the final (x, y) position.

        Numbers in `seq` shift the position backwards by obj*dxscale; other
        items are decoded by the font into character ids.
        """
        (x, y) = pos
        pending_charspace = False
        for item in seq:
            if utils.isnumber(item):
                x -= item*dxscale
                pending_charspace = True
                continue
            for cid in font.decode(item):
                if pending_charspace:
                    x += charspace
                advance = self.render_char(
                    utils.translate_matrix(matrix, (x, y)), font,
                    fontsize, scaling, rise, cid, ncs, graphicstate)
                x += advance
                if cid == 32 and wordspace:
                    x += wordspace
                pending_charspace = True
        return (x, y)

    def render_string_vertical(self, seq, matrix, pos,
                               font, fontsize, scaling, charspace, wordspace,
                               rise, dxscale, ncs, graphicstate):
        """Advance along y through `seq`; return the final (x, y) position.

        Numbers in `seq` shift the position backwards by obj*dxscale; other
        items are decoded by the font into character ids.
        """
        (x, y) = pos
        pending_charspace = False
        for item in seq:
            if utils.isnumber(item):
                y -= item*dxscale
                pending_charspace = True
                continue
            for cid in font.decode(item):
                if pending_charspace:
                    y += charspace
                advance = self.render_char(
                    utils.translate_matrix(matrix, (x, y)), font, fontsize,
                    scaling, rise, cid, ncs, graphicstate)
                y += advance
                if cid == 32 and wordspace:
                    y += wordspace
                pending_charspace = True
        return (x, y)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Render one character and return its advance; 0 in this base class."""
        return 0
class TagExtractor(PDFDevice):
    """A device that writes page/tag markup and decoded text to `outfp`."""

    def __init__(self, rsrcmgr, outfp, codec='utf-8'):
        """
        :param rsrcmgr: resource manager, passed on to PDFDevice.
        :param outfp: output file object; written to with bytes produced by
            utils.make_compat_bytes / the result of utils.enc.
        :param codec: stored on the instance; not used by the methods here.
        """
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed.
        self._stack = []
        return

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Decode the string items of `seq` with the current font and write them.

        Items that are neither str nor bytes are skipped. Character ids
        without a Unicode mapping are left out of the output.
        """
        import logging

        font = textstate.font
        pieces = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # This used to print() the decoded ids to stdout, which
                    # leaked debug output into whatever the process writes
                    # there. Report through logging instead and go on.
                    logging.getLogger(__name__).debug(
                        'no unicode mapping for cid %r', cid)
        self.outfp.write(utils.enc(''.join(pieces)))
        return

    def begin_page(self, page, ctm):
        """Write the opening <page> element for `page`."""
        output = '<page id="%s" bbox="%s" rotate="%d">' %\
            (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
        self.outfp.write(utils.make_compat_bytes(output))
        return

    def end_page(self, page):
        """Write the closing </page> element and advance the page counter."""
        self.outfp.write(utils.make_compat_bytes('</page>\n'))
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening element for `tag` and push it on the tag stack.

        When `props` is a dict its items become attributes, sorted by key.
        """
        s = ''
        if isinstance(props, dict):
            s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v)))
                        for (k, v) in sorted(props.items()))
        out_s = '<{}{}>'.format(utils.enc(tag.name), s)
        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
        return

    def end_tag(self):
        """Pop the innermost open tag and write its closing element."""
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = '</%s>' % utils.enc(tag.name)
        self.outfp.write(utils.make_compat_bytes(out_s))
        return

    def do_tag(self, tag, props=None):
        """Write an opening element for `tag` without leaving it on the stack."""
        self.begin_tag(tag, props)
        self._stack.pop(-1)
        return
-831
View File
@@ -1,831 +0,0 @@
import logging
import re
import struct
from hashlib import sha256, md5
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from . import settings
from .arcfour import Arcfour
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class PDFNoValidXRef(PDFSyntaxError):
    """Raised when a cross-reference table or stream cannot be read."""


class PDFNoValidXRefWarning(SyntaxWarning):
    """Warning category related to invalid cross-reference data."""


class PDFNoOutlines(PDFException):
    """PDFException subclass; by its name, for documents without outlines."""


class PDFDestinationNotFound(PDFException):
    """PDFException subclass; by its name, for destinations that are missing."""


class PDFEncryptionError(PDFException):
    """Raised for unsupported or inconsistent encryption parameters."""


class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when no encryption key can be derived from the password."""


class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Warning category related to text extraction not being permitted."""


class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """PDFEncryptionError subclass for text extraction not being permitted."""
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
    """Deprecated alias of PDFTextExtractionNotAllowed.

    Creating an instance emits a DeprecationWarning.
    """

    def __init__(self, *args):
        from warnings import warn
        # stacklevel=2 attributes the warning to the code that creates the
        # exception instead of to this line, so the location shown to the
        # user is the one they need to change.
        warn('PDFTextExtractionNotAllowedError will be removed in the future. '
             'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning,
             stacklevel=2)
        super().__init__(*args)
# Predefined literal objects for the /Type values this module compares
# against with `is` (object streams, xref streams) plus the catalog type.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
class PDFBaseXRef:
    """Interface shared by the cross-reference implementations."""

    def get_trailer(self):
        """Return the trailer; must be provided by subclasses."""
        raise NotImplementedError

    def get_objids(self):
        """Return the known object ids; none in the base class."""
        return []

    def get_pos(self, objid):
        """Locate `objid`.

        Must return
            (strmid, index, genno)
         or (None, pos, genno)

        The base class knows no objects and always raises KeyError(objid).
        """
        raise KeyError(objid)
class PDFXRef(PDFBaseXRef):
    """Cross-reference data read from a classic xref table and its trailer."""

    def __init__(self):
        # objid -> (None, file position, generation number)
        self.offsets = {}
        self.trailer = {}

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser):
        """Read xref subsections from `parser` up to the trailer, then load it.

        Each subsection starts with a "start count" line followed by `count`
        entry lines of three fields; only entries whose last field is b'n'
        are recorded.

        :raises PDFNoValidXRef: on premature end of input or malformed lines.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line.strip():
                continue
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith(b'trailer'):
                # Rewind so load_trailer() sees the keyword itself.
                parser.seek(pos)
                break
            header = line.strip().split(b' ')
            if len(header) != 2:
                error_msg = 'Trailer not found: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, header)
            except ValueError:
                error_msg = 'Invalid line: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                entry = line.strip().split(b' ')
                if len(entry) != 3:
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                (pos, genno, use) = entry
                if use == b'n':
                    self.offsets[objid] = (None, int(pos), int(genno))
        log.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser):
        """Read the trailer dictionary from `parser` and merge it into self.trailer."""
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b'trailer'), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF:
            # Input ended early: fall back to the last object the parser
            # has collected, if there is one.
            leftover = parser.pop(1)
            if not leftover:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = leftover[0]
        self.trailer.update(dict_value(dic))
        log.debug('trailer=%r', self.trailer)

    def get_trailer(self):
        """Return the trailer dictionary."""
        return self.trailer

    def get_objids(self):
        """Return the ids of all recorded objects."""
        return self.offsets.keys()

    def get_pos(self, objid):
        """Return the recorded location tuple for `objid`; KeyError if unknown."""
        return self.offsets[objid]
class PDFXRefFallback(PDFXRef):
    """Cross-reference data rebuilt by scanning the file for "N G obj" lines."""

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    # Matches the start of an object definition: "<objid> <genno> obj".
    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan `parser` from the start and record every object found.

        Scanning stops at end of input or at the first line starting with
        b'trailer', which is then loaded as the trailer. Objects that are
        object streams are expanded so their members are recorded too.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                log.info('trailer: %r', self.trailer)
                break
            text = line.decode('latin-1')  # default pdf encoding
            match = self.PDFOBJ_CUE.match(text)
            if not match:
                continue
            (objid, genno) = match.groups()
            objid = int(objid)
            genno = int(genno)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if not (isinstance(obj, PDFStream)
                    and obj.get('Type') is LITERAL_OBJSTM):
                continue
            stream = stream_value(obj)
            try:
                n = stream['N']
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            subparser = PDFStreamParser(stream.get_data())
            members = []
            try:
                while True:
                    (_, member) = subparser.nextobject()
                    members.append(member)
            except PSEOF:
                pass
            # Parsed objects are read as pairs; the first of each pair is
            # used as the member's object id.
            n = min(n, len(members)//2)
            for index in range(n):
                member_id = members[index*2]
                self.offsets[member_id] = (objid, index, 0)
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference data read from a cross-reference stream.

    The stream data is a sequence of fixed-length entries with three
    fields of widths fl1, fl2 and fl3 bytes. `ranges` lists the
    (first objid, count) subsections; their entries follow one another in
    the data in the same order.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Same initial value as PDFXRef, so get_trailer() is usable before
        # load(); load() replaces it with the stream's attributes.
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser):
        """Read the xref stream object from `parser`.

        :raises PDFNoValidXRef: if the object is not a stream whose Type is
            XRef (including a stream with no Type at all).
        :raises PDFSyntaxError: if Index has an odd number of items.
        """
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        (_, stream) = parser.nextobject()
        # .get() so that a stream without a Type entry is reported as an
        # invalid xref instead of escaping as a bare KeyError.
        if not isinstance(stream, PDFStream) \
                or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        log.info('xref stream: objid=%s, fields=%d,%d,%d',
                 ', '.join(map(repr, self.ranges)),
                 self.fl1, self.fl2, self.fl3)
        return

    def get_trailer(self):
        """Return the trailer (the xref stream's attributes once loaded)."""
        return self.trailer

    def get_objids(self):
        """Yield the ids of all entries whose first field is 1 or 2."""
        # Entries of successive ranges are stored back to back, so the
        # entry index has to keep counting across ranges (as get_pos does).
        # Restarting at 0 for every range made later ranges re-read the
        # entries of the first one.
        base = 0
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (base + i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            base += nobjs
        return

    def get_pos(self, objid):
        """Return the location tuple for `objid`.

        :return: (None, f2, f3) when the entry's first field is 1,
            (f2, f3, 0) when it is 2.
        :raises KeyError: if `objid` is in no range or its entry has any
            other first field (a free object).
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
class PDFStandardSecurityHandler:
    """Password check and RC4 decryption for the revisions in supported_revisions."""

    PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
                        b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
    supported_revisions = (2, 3)

    def __init__(self, docid, param, password=''):
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self):
        """Read the parameters, check the revision and derive the key."""
        self.init_params()
        if self.r not in self.supported_revisions:
            error_msg = 'Unsupported revision: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        self.init_key()

    def init_params(self):
        """Copy the V, R, P, O, U and Length entries of `param` onto self."""
        params = self.param
        self.v = int_value(params.get('V', 0))
        self.r = int_value(params['R'])
        self.p = uint_value(params['P'], 32)
        self.o = str_value(params['O'])
        self.u = str_value(params['U'])
        self.length = int_value(params.get('Length', 40))

    def init_key(self):
        """Authenticate with the stored password; raise if that fails."""
        self.key = self.authenticate(self.password)
        if self.key is None:
            raise PDFPasswordIncorrect

    # Permission checks: each tests one bit of the P value.

    def is_printable(self):
        return bool(self.p & 4)

    def is_modifiable(self):
        return bool(self.p & 8)

    def is_extractable(self):
        return bool(self.p & 16)

    def compute_u(self, key):
        """Compute the value to compare against the stored U entry."""
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        # Algorithm 3.5
        digest = md5(self.PASSWORD_PADDING)  # 2
        digest.update(self.docid[0])  # 3
        result = Arcfour(key).encrypt(digest.digest())  # 4
        for i in range(1, 20):  # 5
            round_key = bytes(c ^ i for c in key)
            result = Arcfour(round_key).encrypt(result)
        result += result  # 6
        return result

    def compute_encryption_key(self, password):
        """Derive the encryption key from a (bytes) password."""
        # Algorithm 3.2
        padded = (password + self.PASSWORD_PADDING)[:32]  # 1
        digest = md5(padded)  # 2
        digest.update(self.o)  # 3
        # See https://github.com/pdfminer/pdfminer.six/issues/186
        digest.update(struct.pack('<L', self.p))  # 4
        digest.update(self.docid[0])  # 5
        if self.r >= 4 and not self.encrypt_metadata:
            digest.update(b'\xff\xff\xff\xff')
        result = digest.digest()
        if self.r >= 3:
            n = self.length // 8
            for _ in range(50):
                result = md5(result[:n]).digest()
        else:
            n = 5
        return result[:n]

    def authenticate(self, password):
        """Return the key for `password`, tried as user then owner password.

        Returns None when neither attempt succeeds.
        """
        raw = password.encode("latin1")
        key = self.authenticate_user_password(raw)
        if key is not None:
            return key
        return self.authenticate_owner_password(raw)

    def authenticate_user_password(self, password):
        """Return the key derived from `password` if it verifies, else None."""
        key = self.compute_encryption_key(password)
        return key if self.verify_encryption_key(key) else None

    def verify_encryption_key(self, key):
        """Check `key` against the stored U entry."""
        # Algorithm 3.6
        u = self.compute_u(key)
        if self.r == 2:
            return u == self.u
        return u[:16] == self.u[:16]

    def authenticate_owner_password(self, password):
        """Recover the user password from O and authenticate with it."""
        # Algorithm 3.7
        padded = (password + self.PASSWORD_PADDING)[:32]
        digest = md5(padded)
        if self.r >= 3:
            for _ in range(50):
                digest = md5(digest.digest())
        n = self.length // 8 if self.r >= 3 else 5
        key = digest.digest()[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                round_key = bytes(c ^ i for c in key)
                user_password = Arcfour(round_key).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(self, objid, genno, data, attrs=None):
        """Decrypt `data` belonging to object (objid, genno)."""
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid, genno, data):
        """RC4-decrypt `data` with a key derived from the object's id and generation."""
        seed = (self.key + struct.pack('<L', objid)[:3]
                + struct.pack('<L', genno)[:2])
        object_key = md5(seed).digest()[:min(len(seed), 16)]
        return Arcfour(object_key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Security handler for revision 4, which selects a cipher per crypt filter."""

    supported_revisions = (4,)

    def init_params(self):
        """Read the crypt-filter parameters on top of the base parameters.

        :raises PDFEncryptionError: if StmF and StrF differ, if a crypt
            filter uses an unknown method, or if StrF names no known filter.
        """
        super().init_params()
        self.length = 128
        self.cf = dict_value(self.param.get('CF'))
        self.stmf = literal_name(self.param['StmF'])
        self.strf = literal_name(self.param['StrF'])
        self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
        if self.stmf != self.strf:
            error_msg = 'Unsupported crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        # Crypt filter name -> decrypt method.
        self.cfm = {}
        for filter_name, spec in self.cf.items():
            method = self.get_cfm(literal_name(spec['CFM']))
            if method is None:
                error_msg = 'Unknown crypt filter method: param=%r' \
                    % self.param
                raise PDFEncryptionError(error_msg)
            self.cfm[filter_name] = method
        self.cfm['Identity'] = self.decrypt_identity
        if self.strf not in self.cfm:
            error_msg = 'Undefined crypt filter: param=%r' % self.param
            raise PDFEncryptionError(error_msg)

    def get_cfm(self, name):
        """Return the decrypt method for crypt filter method `name`, or None."""
        if name == 'V2':
            return self.decrypt_rc4
        if name == 'AESV2':
            return self.decrypt_aes128
        return None

    def decrypt(self, objid, genno, data, attrs=None, name=None):
        """Decrypt `data` with crypt filter `name` (StrF when not given).

        Data whose attrs have Type 'Metadata' is returned unchanged when
        EncryptMetadata is false.
        """
        if not self.encrypt_metadata and attrs is not None:
            kind = attrs.get('Type')
            if kind is not None and literal_name(kind) == 'Metadata':
                return data
        if name is None:
            name = self.strf
        handler = self.cfm[name]
        return handler(objid, genno, data)

    def decrypt_identity(self, objid, genno, data):
        """Return `data` unchanged."""
        return data

    def decrypt_aes128(self, objid, genno, data):
        """AES-CBC decrypt `data`; its first 16 bytes are the IV."""
        seed = (self.key + struct.pack('<L', objid)[:3]
                + struct.pack('<L', genno)[:2] + b'sAlT')
        object_key = md5(seed).digest()[:min(len(seed), 16)]
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(algorithms.AES(object_key),
                        modes.CBC(initialization_vector),
                        backend=default_backend())
        return cipher.decryptor().update(ciphertext)
# Security handler for revision 5: SHA-256 password validation and the
# 'AESV3' crypt filter method.
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
supported_revisions = (5,)
# Extend the V4 parameters with the 'OE'/'UE' strings and split self.o and
# self.u into their three parts.
def init_params(self):
super().init_params()
self.length = 256
self.oe = str_value(self.param['OE'])
self.ue = str_value(self.param['UE'])
# Layout of self.o / self.u as sliced below: bytes 0-31 hash, bytes 32-39
# validation salt, bytes 40 onwards key salt.
self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:]
self.u_hash = self.u[:32]
self.u_validation_salt = self.u[32:40]
self.u_key_salt = self.u[40:]
return
# Only 'AESV3' is recognised by this revision; every other name yields None
# (the V4 methods 'V2' and 'AESV2' are not inherited here).
def get_cfm(self, name):
if name == 'AESV3':
return self.decrypt_aes256
else:
return None
# Check `password` first against the owner hash, then against the user hash.
# On a match, returns self.oe (owner) or self.ue (user) decrypted with AES-CBC
# under a zero IV; returns None when neither hash matches.
# Presumably the returned bytes become the document key - verify against the
# parent class, which is not visible here.
def authenticate(self, password):
# The password is UTF-8 encoded and truncated to 127 bytes.
password = password.encode('utf-8')[:127]
# Owner check: SHA-256(password + owner validation salt + self.u).
hash = sha256(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = sha256(password)
hash.update(self.o_key_salt)
hash.update(self.u)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.oe)
# User check: SHA-256(password + user validation salt).
hash = sha256(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = sha256(password)
hash.update(self.u_key_salt)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.ue)
return None
# AES-CBC decryption with self.key used directly (objid and genno are not
# mixed into the key); the first 16 bytes of `data` are the IV.
def decrypt_aes256(self, objid, genno, data):
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(self.key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)
class PDFDocument:
"""PDFDocument object represents a PDF document.
Since a PDF file can be very big, normally it is not loaded at
once. So PDF document has to cooperate with a PDF parser in order to
dynamically import the data as processing goes.
Typical usage:
doc = PDFDocument(parser, password)
obj = doc.getobj(objid)
"""
# Maps the encryption dictionary's 'V' value to the handler class that
# _initialize_password() instantiates.
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
4: PDFStandardSecurityHandlerV4,
5: PDFStandardSecurityHandlerV5,
}
# parser:   parser object the document reads from; it is told about this
#           document through set_document().
# password: passed to the security handler when the trailer has 'Encrypt'.
# caching:  when true, objects and parsed object streams are memoised.
# fallback: when true, a PDFXRefFallback table is loaded as well.
# Raises PDFSyntaxError when no trailer provides a 'Root' entry.
def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
self.caching = caching
self.xrefs = []
self.info = []
self.catalog = None
self.encryption = None
self.decipher = None
self._parser = None
# objid -> (obj, genno), filled by getobj() when caching is on.
self._cached_objs = {}
# stream objid -> (objs, n), filled by _getobj_objstm() when caching is on.
self._parsed_objs = {}
self._parser = parser
self._parser.set_document(self)
self.is_printable = self.is_modifiable = self.is_extractable = True
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
try:
pos = self.find_xref(parser)
self.read_xref_from(parser, pos, self.xrefs)
except PDFNoValidXRef:
pass # fallback = True
if fallback:
parser.fallback = True
xref = PDFXRefFallback()
xref.load(parser)
self.xrefs.append(xref)
# Walk the collected xref tables and stop at the first trailer that has 'Root'.
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer:
continue
# If there's an encryption info, remember it.
# NOTE(review): trailer['ID'] is indexed unconditionally, so a trailer with
# 'Encrypt' but without 'ID' raises KeyError here - confirm whether such
# files should be tolerated.
if 'Encrypt' in trailer:
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
self._initialize_password(password)
if 'Info' in trailer:
self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root'])
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
# A catalog of the wrong type is only an error in strict mode.
if self.catalog.get('Type') is not LITERAL_CATALOG:
if settings.STRICT:
raise PDFSyntaxError('Catalog not found!')
return
# Keyword token compared by identity in _getobj_parse().
KEYWORD_OBJ = KWD(b'obj')
# _initialize_password(password=b'')
# Perform the initialization with a given password.
# Picks the handler class from security_handler_registry using the 'V' entry,
# installs its decrypt method as self.decipher and copies the permission flags.
# Raises PDFEncryptionError for a non-'Standard' filter or an unknown 'V'.
def _initialize_password(self, password=''):
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
v = int_value(param.get('V', 0))
factory = self.security_handler_registry.get(v)
if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
handler = factory(docid, param, password)
self.decipher = handler.decrypt
self.is_printable = handler.is_printable()
self.is_modifiable = handler.is_modifiable()
self.is_extractable = handler.is_extractable()
self._parser.fallback = False # need to read streams with exact length
return
# Return the object at position `index` inside the object stream `stream`.
# `objid` is not used in the lookup itself.
# Raises PDFSyntaxError when the computed position is past the parsed list.
def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
else:
(objs, n) = self._get_objects(stream)
if self.caching:
self._parsed_objs[stream.objid] = (objs, n)
# The parsed list starts with 2*n entries that are skipped before indexing.
# Presumably these are the (objid, offset) header pairs - confirm.
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError('index too big: %r' % index)
return obj
# Parse every object contained in `stream` and return (objs, n), where n is
# the stream's 'N' entry (0 when it is missing in non-strict mode).
def _get_objects(self, stream):
if stream.get('Type') is not LITERAL_OBJSTM:
if settings.STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream['N']
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
# Read objects until the stream parser signals end of data.
try:
while 1:
(_, obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
return (objs, n)
# Parse the object whose definition starts at file offset `pos`.
# Raises PDFSyntaxError when the object id found there does not match `objid`
# or the 'obj' keyword is missing.
def _getobj_parse(self, pos, objid):
self._parser.seek(pos)
(_, objid1) = self._parser.nexttoken() # objid
(_, genno) = self._parser.nexttoken() # genno
(_, kwd) = self._parser.nexttoken()
# hack around malformed pdf files
# copied from https://github.com/jaepil/pdfminer3k/blob/master/
# pdfminer/pdfparser.py#L399
# to solve https://github.com/pdfminer/pdfminer.six/issues/56
# assert objid1 == objid, str((objid1, objid))
# On a mismatch, keep reading tokens until the 'obj' keyword and take the
# token two places before it as the object id.
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_, kwd) = self._parser.nexttoken()
x.append(kwd)
if len(x) >= 2:
objid1 = x[-2]
# #### end hack around malformed pdf files
if objid1 != objid:
raise PDFSyntaxError('objid mismatch: {!r}={!r}'
.format(objid1, objid))
if kwd != KWD(b'obj'):
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
(_, obj) = self._parser.nextobject()
return obj
# can raise PDFObjectNotFound
def getobj(self, objid):
"""Get object from PDF
:raises PDFException if PDFDocument is not initialized
:raises PDFObjectNotFound if objid does not exist in PDF
"""
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
log.debug('getobj: objid=%r', objid)
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
# Try each xref table in order; a table that does not know the object, or
# whose entry fails to parse, is skipped in favour of the next one.
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
except KeyError:
continue
try:
# A non-None strmid means the object lives inside object stream `strmid`;
# otherwise `index` is passed to _getobj_parse() as a file offset.
if strmid is not None:
stream = stream_value(self.getobj(strmid))
obj = self._getobj_objstm(stream, index, objid)
else:
obj = self._getobj_parse(index, objid)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno,
obj)
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
break
except (PSEOF, PDFSyntaxError):
continue
else:
raise PDFObjectNotFound(objid)
log.debug('register: objid=%r: %r', objid, obj)
if self.caching:
self._cached_objs[objid] = (obj, genno)
return obj
# Return a generator of (level, title, dest, action, se) tuples for the
# outline entries reachable from the catalog's 'Outlines' entry.
# Raises PDFNoOutlines when the catalog has no 'Outlines'.
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
# Only entries that have a title and either an action or a destination
# are reported.
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
yield (level, title, dest, action, se)
# Children are visited one level deeper, siblings at the same level.
if 'First' in entry and 'Last' in entry:
yield from search(entry['First'], level+1)
if 'Next' in entry:
yield from search(entry['Next'], level)
return
return search(self.catalog['Outlines'], 0)
# Look up `key` in the name tree stored under category `cat` of the catalog's
# 'Names' dictionary. Raises KeyError((cat, key)) when it cannot be found.
def lookup_name(self, cat, key):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
# 'Limits' gives the key range covered by this node; keys outside it are
# reported as None rather than searched for.
if 'Limits' in d:
(k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key:
return None
# 'Names' is a flat list that is chopped into (key, value) pairs.
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
return names[key]
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
# Resolve the named destination `name`, first through the 'Dests' name tree
# and then through the catalog's own 'Dests' dictionary.
# Raises PDFDestinationNotFound when neither contains it.
def get_dest(self, name):
try:
# PDF-1.2 or later
obj = self.lookup_name('Dests', name)
except KeyError:
# PDF-1.1 or prior
if 'Dests' not in self.catalog:
raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests'])
if name not in d0:
raise PDFDestinationNotFound(name)
obj = d0[name]
return obj
# find_xref
def find_xref(self, parser):
"""Internal function used to locate the first XRef."""
# search the last xref table by scanning the file backwards.
# `prev` holds the most recent non-empty line seen, i.e. the line that follows
# 'startxref' in file order once the loop breaks.
prev = None
for line in parser.revreadlines():
line = line.strip()
log.debug('find_xref: %r', line)
if line == b'startxref':
break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
log.info('xref found: pos=%r', prev)
# NOTE(review): int(prev) raises TypeError/ValueError (not PDFNoValidXRef) when
# no usable line followed 'startxref' - confirm callers are fine with that.
return int(prev)
# read xref table
def read_xref_from(self, parser, start, xrefs):
"""Reads XRefs from the given location."""
parser.seek(start)
parser.reset()
try:
(pos, token) = parser.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
log.info('read_xref_from: start=%d, token=%r', start, token)
# An integer as the first token is taken as the start of an xref stream
# object; anything else is loaded as a classic xref table.
if isinstance(token, int):
# XRefStream: PDF-1.5
parser.seek(pos)
parser.reset()
xref = PDFXRefStream()
xref.load(parser)
else:
if token is parser.KEYWORD_XREF:
parser.nextline()
xref = PDFXRef()
xref.load(parser)
xrefs.append(xref)
trailer = xref.get_trailer()
log.info('trailer: %r', trailer)
# Follow the trailer's 'XRefStm' and 'Prev' links recursively.
# NOTE(review): positions already visited are not tracked here - confirm that
# circular 'Prev' chains cannot occur or are handled elsewhere.
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(parser, pos, xrefs)
if 'Prev' in trailer:
# find previous xref
pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs)
return
-801
View File
@@ -1,801 +0,0 @@
import logging
import struct
import sys
from io import BytesIO
from . import settings
from .cmapdb import CMap
from .cmapdb import CMapDB
from .cmapdb import CMapParser
from .cmapdb import FileUnicodeMap
from .encodingdb import EncodingDB
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import num_value
from .pdftypes import resolve1, resolve_all
from .pdftypes import stream_value
from .psparser import KWD
from .psparser import LIT
from .psparser import PSEOF
from .psparser import PSLiteral
from .psparser import PSStackParser
from .psparser import literal_name
from .utils import apply_matrix_norm
from .utils import choplist
from .utils import isnumber
from .utils import nunpack
log = logging.getLogger(__name__)
def get_widths(seq):
    """Expand a flat width sequence into a ``{cid: width}`` dictionary.

    Numbers are buffered as they are read.  A list assigns its items to
    consecutive ids starting at the most recently buffered number; three
    buffered numbers ``first last w`` assign ``w`` to every id from
    ``first`` to ``last`` inclusive.  Both events empty the buffer.
    Items that are neither lists nor numbers are ignored, as is a list
    that arrives while the buffer is empty.

    :param seq: iterable of numbers and lists
    :returns: dictionary mapping ids to widths
    """
    result = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            start = pending[-1]
            result.update({start + offset: width
                           for offset, width in enumerate(item)})
            pending = []
        elif isnumber(item):
            pending.append(item)
            if len(pending) != 3:
                continue
            first, last, width = pending
            for cid in range(first, last + 1):
                result[cid] = width
            pending = []
    return result
def get_widths2(seq):
    """Expand a flat sequence into ``{cid: (w, (vx, vy))}``.

    Works like :func:`get_widths`, except that every id carries three
    numbers.  A list is chopped into ``(w, vx, vy)`` triples assigned to
    consecutive ids starting at the most recently buffered number; five
    buffered numbers ``first last w vx vy`` assign the same entry to
    every id from ``first`` to ``last`` inclusive.

    :param seq: iterable of numbers and lists
    :returns: dictionary mapping ids to ``(w, (vx, vy))`` tuples
    """
    result = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            start = pending[-1]
            for offset, triple in enumerate(choplist(3, item)):
                (w, vx, vy) = triple
                result[start + offset] = (w, (vx, vy))
            pending = []
        elif isnumber(item):
            pending.append(item)
            if len(pending) != 5:
                continue
            first, last, w, vx, vy = pending
            for cid in range(first, last + 1):
                result[cid] = (w, (vx, vy))
            pending = []
    return result
class FontMetricsDB:
    """Lookup front-end for the module-level ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the ``FONT_METRICS`` entry stored under *fontname*.

        The lookup is a plain subscript, so whatever the table raises for
        a missing name is propagated to the caller.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
class Type1FontHeaderParser(PSStackParser):
    """Stack parser that collects ``code name put`` pairs from font data."""

    KEYWORD_BEGIN = KWD(b'begin')
    KEYWORD_END = KWD(b'end')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_PUT = KWD(b'put')
    KEYWORD_DICT = KWD(b'dict')
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')

    def __init__(self, data):
        super().__init__(data)
        # cid -> unicode text, filled by get_encoding()
        self._cid2unicode = {}
        return

    def get_encoding(self):
        """Parse the font encoding.

        A Type1 font encoding maps character codes to character names.
        A name is either a standard Adobe glyph name or the name of a
        custom CharString of this font (a CharString being the sequence
        of drawing operations for one character).  Names that
        ``name2unicode`` rejects with ``KeyError`` are logged at debug
        level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode
            characters
        """
        while True:
            try:
                cid, glyph_name = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(glyph_name)
            except KeyError as err:
                log.debug(str(err))
        return self._cid2unicode

    def do_keyword(self, pos, token):
        """Record ``(code, name)`` whenever a ``put`` keyword is seen.

        Every other keyword is ignored.  The two operands below ``put``
        are popped; they are reported only when the first is an int and
        the second a literal name.
        """
        if token is not self.KEYWORD_PUT:
            return
        (_, key), (_, value) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
        return
# Text contributed by each 4-bit value of a packed real-number operand; indexed
# by nibble in getdict(). Index 13 has no text (None). Nibble 15 is not looked
# up here: getdict() treats it as the end-of-number marker.
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
None, '-')
# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
'DLIdent-H': 'Identity-H',
'DLIdent-V': 'Identity-V',
}
def getdict(data):
    """Decode DICT bytes into ``{operator: [operands]}``.

    Bytes are consumed one token at a time.  A leading byte of 21 or
    less is an operator: it is stored with the operands collected since
    the previous operator.  Anything else starts an operand:

    * ``30``       - real number packed as nibbles (see ``NIBBLES``),
      terminated by nibble 15
    * ``32..246``  - single-byte integer ``b0 - 139``
    * ``247..250`` - two-byte positive integer
    * ``251..254`` - two-byte negative integer
    * ``28``       - signed 16-bit integer in the next two bytes
    * otherwise    - signed 32-bit integer in the next four bytes

    Operands that are not followed by an operator are discarded.

    :param data: bytes of the DICT
    :returns: dictionary keyed by operator number
    """
    result = {}
    stream = BytesIO(data)
    operands = []
    while True:
        head = stream.read(1)
        if not head:
            break
        b0 = ord(head)
        if b0 <= 21:
            # operator: flush the operands gathered so far
            result[b0] = operands
            operands = []
            continue
        if b0 == 30:
            text = ''
            more = True
            while more:
                packed = ord(stream.read(1))
                # both nibbles of a byte are examined, even after the
                # end marker showed up in the high nibble
                for nibble in (packed >> 4, packed & 15):
                    if nibble == 15:
                        more = False
                    else:
                        text += NIBBLES[nibble]
            number = float(text)
        elif 32 <= b0 <= 246:
            number = b0 - 139
        elif 247 <= b0 <= 250:
            b1 = ord(stream.read(1))
            number = ((b0 - 247) << 8) + b1 + 108
        elif 251 <= b0 <= 254:
            b1 = ord(stream.read(1))
            number = -((b0 - 251) << 8) - b1 - 108
        else:
            b1 = ord(stream.read(1))
            b2 = ord(stream.read(1))
            if b1 >= 128:
                # the first payload byte carries the sign
                b1 -= 256
            if b0 == 28:
                number = b1 << 8 | b2
            else:
                low = struct.unpack('>H', stream.read(2))[0]
                number = b1 << 24 | b2 << 16 | low
        operands.append(number)
    return result
class CFFFont:
    """Reader for the header, INDEX structures, encoding and charset of a
    CFF font program.

    After construction the instance exposes ``code2gid``/``gid2code``
    (from the encoding data) and ``name2gid``/``gid2name`` (from the
    charset data).
    """

    # Predefined strings; string ids below len(STANDARD_STRINGS) index this
    # tuple, larger ids index the font's own String INDEX (see getstr).
    STANDARD_STRINGS = (
        '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
        'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
        'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
        'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
        'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
        'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
        'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
        'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
        'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
        'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
        'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
        'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
        'quotesinglbase', 'quotedblbase', 'quotedblright',
        'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
        'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
        'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
        'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
        'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
        'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
        'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
        'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
        'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
        'multiply', 'threesuperior', 'copyright', 'Aacute',
        'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
        'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
        'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
        'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
        'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
        'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
        'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
        'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
        'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
        'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
        'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
        'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
        'dollarsuperior', 'ampersandsmall', 'Acutesmall',
        'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
        'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
        'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
        'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
        'commasuperior', 'threequartersemdash', 'periodsuperior',
        'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
        'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
        'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
        'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
        'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
        'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
        'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
        'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
        'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
        'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
        'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
        'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
        'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
        'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
        'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
        'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
        'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
        'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
        'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
        'seveninferior', 'eightinferior', 'nineinferior',
        'centinferior', 'dollarinferior', 'periodinferior',
        'commainferior', 'Agravesmall', 'Aacutesmall',
        'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
        'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
        'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
        'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
        'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
        'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
        'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
        'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
        'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
        '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
        'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

    class INDEX:
        """Random-access view of one INDEX structure read from ``fp``.

        Constructing the object reads the count, the offset size and the
        offset array, then leaves ``fp`` positioned just past the INDEX
        data so that the next structure can be read directly.
        """

        def __init__(self, fp):
            self.fp = fp
            self.offsets = []
            (count,) = struct.unpack('>H', self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: there
                # is no offset-size byte and no offset array.  Reading
                # them anyway would swallow bytes of the next structure.
                # A single offset of 1 keeps len() == 0 and makes
                # base + offsets[-1] equal to the current position.
                self.offsets.append(1)
                self.base = self.fp.tell()-1
                return
            (offsize,) = struct.unpack('B', self.fp.read(1))
            for i in range(count+1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are applied relative to the byte before the data.
            self.base = self.fp.tell()-1
            self.fp.seek(self.base+self.offsets[-1])
            return

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            return len(self.offsets)-1

        def __getitem__(self, i):
            self.fp.seek(self.base+self.offsets[i])
            return self.fp.read(self.offsets[i+1]-self.offsets[i])

        def __iter__(self):
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name, fp):
        """Parse the font program available through ``fp``.

        :param name: stored as ``self.name``
        :param fp: seekable binary file object positioned at the header
        :raises ValueError: for an encoding or charset format byte other
            than the ones handled below
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
                                                           self.fp.read(4))
        self.fp.read(hdrsize-4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        # Operators 15, 16 and 17 hold the file positions of the charset,
        # encoding and CharStrings data; each defaults to 0 when absent.
        # NOTE(review): a position of 0 makes the code below read from the
        # start of the file - confirm how predefined encodings/charsets
        # are meant to be handled.
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(charstring_pos)
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(encoding_pos)
        format = self.fp.read(1)
        if format == b'\x00':
            # Format 0
            (n,) = struct.unpack('B', self.fp.read(1))
            for (code, gid) in enumerate(struct.unpack('B'*n,
                                                       self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif format == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            code = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise ValueError('unsupported encoding format: %r' % format)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(charset_pos)
        format = self.fp.read(1)
        if format == b'\x00':
            # Format 0
            n = self.nglyphs-1
            for (gid, sid) in enumerate(struct.unpack('>'+'H'*n,
                                                      self.fp.read(2*n))):
                gid += 1
                name = self.getstr(sid)
                self.name2gid[name] = gid
                self.gid2name[gid] = name
        elif format == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            sid = 0
            for i in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    name = self.getstr(sid)
                    self.name2gid[name] = gid
                    self.gid2name[gid] = name
                    sid += 1
        elif format == b'\x02':
            # Format 2
            assert False, str(('Unhandled', format))
        else:
            raise ValueError('unsupported charset format: %r' % format)
        return

    def getstr(self, sid):
        """Return the string for string id ``sid``.

        Ids below ``len(STANDARD_STRINGS)`` come from the predefined
        tuple; larger ids are read from the font's String INDEX, which
        returns the raw bytes of the entry.
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid-len(self.STANDARD_STRINGS)]
class TrueTypeFont:
    """Reads the table directory of a TrueType font program and builds a
    unicode map from its ``cmap`` table."""

    class CMapNotFound(Exception):
        """Raised when the font has no ``cmap`` table."""
        pass

    def __init__(self, name, fp):
        """Read the table directory from ``fp``.

        :param name: stored as ``self.name``
        :param fp: seekable binary file object positioned at the start
            of the font program

        ``self.tables`` maps each 4-byte table tag (``bytes``, exactly as
        unpacked) to ``(offset, length)``.
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
            for _ in range(ntables):
                (tag, _checksum, offset, length) = struct.unpack(
                    '>4sLLL', fp.read(16))
                self.tables[tag] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass
        return

    def create_unicode_map(self):
        """Build a ``FileUnicodeMap`` from the font's ``cmap`` table.

        Every subtable of format 0, 2 or 4 contributes character code to
        glyph id pairs; subtables in any other format are skipped.

        :returns: the populated ``FileUnicodeMap``
        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table
        """
        # Tags are stored as bytes by __init__, so the key has to be bytes
        # as well; a str key never matches and the table would never be
        # found.
        if b'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables[b'cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = struct.unpack('>HH', fp.read(4))
        subtables = []
        for i in range(nsubtables):
            subtables.append(struct.unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(struct.unpack('>256B',
                                                        fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = struct.unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i, k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = \
                        struct.unpack('>HHhH', fp.read(8))
                    hdrs.append((i, firstcode, entcount, delta,
                                 fp.tell()-2+offset))
                for (i, firstcode, entcount, delta, pos) in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() returns a 1-tuple; take the integer so
                        # the zero test and the delta addition work.
                        (gid,) = struct.unpack('>H', fp.read(2))
                        if gid:
                            gid += delta
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        fp.seek(pos+idr)
                        for c in range(sc, ec+1):
                            b = struct.unpack('>H', fp.read(2))[0]
                            char2gid[c] = (b + idd) & 0xffff
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
            else:
                # Unsupported subtable format: skip it instead of
                # aborting, so that the supported subtables of the same
                # font still contribute their mappings.
                continue
        # create unicode map
        unicode_map = FileUnicodeMap()
        for (char, gid) in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
# Error raised by the font classes in this module (e.g. for a missing
# BaseFont or FontDescriptor in strict mode).
class PDFFontError(PDFException):
pass
# Raised by the to_unichr() methods when a character id has no unicode mapping.
class PDFUnicodeNotDefined(PDFFontError):
pass
# Default encoding used by PDFSimpleFont when the font spec names none.
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
class PDFFont:
    """State and metric helpers shared by the font classes.

    ``hscale``/``vscale`` convert the stored numbers to the values the
    ``get_*`` and ``*_width`` methods return.  Subclasses are expected
    to provide ``to_unichr``, which ``char_width`` uses as a fallback.
    """

    def __init__(self, descriptor, widths, default_width=None):
        """
        :param descriptor: dictionary the font attributes are read from
        :param widths: mapping used by ``char_width`` (resolved first)
        :param default_width: width for characters missing from
            ``widths``; when ``None`` the descriptor's ``MissingWidth``
            (default 0) is used instead
        """
        self.descriptor = descriptor
        self.widths = resolve_all(widths)
        fontname = resolve1(descriptor.get('FontName', 'unknown'))
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)
        self.flags = int_value(descriptor.get('Flags', 0))
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
        if default_width is not None:
            self.default_width = default_width
        else:
            self.default_width = num_value(descriptor.get('MissingWidth', 0))
        self.leading = num_value(descriptor.get('Leading', 0))
        raw_bbox = descriptor.get('FontBBox', (0, 0, 0, 0))
        self.bbox = list_value(resolve_all(raw_bbox))
        self.hscale = self.vscale = .001
        # PDF RM 9.8.1 specifies /Descent should always be a negative
        # number.  PScript5.dll seems to produce a positive Descent, and
        # text analysis goes wrong if that is taken at face value, so a
        # positive value is flipped.
        if self.descent > 0:
            self.descent = -self.descent
        return

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        return False

    def is_multibyte(self):
        return False

    def decode(self, bytes):
        # one character id per input byte
        return bytearray(bytes)

    def get_ascent(self):
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self):
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self):
        """Scaled bounding-box width; when the box has zero width the
        negated default width is scaled instead."""
        width = self.bbox[2]-self.bbox[0]
        if width == 0:
            return -self.default_width * self.hscale
        return width * self.hscale

    def get_height(self):
        """Scaled bounding-box height; when the box has zero height,
        ``ascent - descent`` is scaled instead."""
        height = self.bbox[3]-self.bbox[1]
        if height == 0:
            return (self.ascent - self.descent) * self.vscale
        return height * self.vscale

    def char_width(self, cid):
        """Scaled width of ``cid``: looked up by ``cid`` first, then by
        ``to_unichr(cid)``, falling back to the default width."""
        try:
            return self.widths[cid] * self.hscale
        except KeyError:
            pass
        try:
            return self.widths[self.to_unichr(cid)] * self.hscale
        except (KeyError, PDFUnicodeNotDefined):
            return self.default_width * self.hscale

    def char_disp(self, cid):
        return 0

    def string_width(self, s):
        """Sum of ``char_width`` over the character ids decoded from ``s``."""
        return sum(map(self.char_width, self.decode(s)))
class PDFSimpleFont(PDFFont):
    """Font whose character ids map to unicode through an encoding table
    (``cid2unicode``) and, when the spec has one, a ``ToUnicode`` map."""

    def __init__(self, descriptor, widths, spec):
        """
        :param descriptor: passed on to ``PDFFont``
        :param widths: passed on to ``PDFFont``
        :param spec: font dictionary; ``Encoding`` and ``ToUnicode`` are
            read from it
        """
        # The encoding is given either as the name of a built-in
        # encoding or as a dictionary describing the differences from a
        # base encoding; without any, the standard encoding applies.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(encoding.get('BaseEncoding',
                                                  LITERAL_STANDARD_ENCODING))
            differences = list_value(encoding.get('Differences', []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        self.unicode_map = None
        if 'ToUnicode' in spec:
            to_unicode = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            source = BytesIO(to_unicode.get_data())
            CMapParser(self.unicode_map, source).run()
        super().__init__(descriptor, widths)
        return

    def to_unichr(self, cid):
        """Return the unicode text for ``cid``.

        The ``ToUnicode`` map is consulted first when present, then the
        encoding table.

        :raises PDFUnicodeNotDefined: if neither knows ``cid``
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                # fall through to the encoding table
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
class PDFType1Font(PDFSimpleFont):
    """Simple font that takes its metrics from ``FontMetricsDB`` when the
    base font is known there, and from the font dictionary otherwise."""

    def __init__(self, rsrcmgr, spec):
        """
        :param rsrcmgr: not used by this constructor
        :param spec: font dictionary
        :raises PDFFontError: in strict mode, if ``BaseFont`` is missing
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        try:
            descriptor, widths = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No stored metrics for this name: build them from the spec.
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get('Widths', [0]*256))
            widths = {offset+firstchar: width
                      for offset, width in enumerate(width_list)}
        super().__init__(descriptor, widths, spec)
        needs_recovery = 'Encoding' not in spec and 'FontFile' in descriptor
        if needs_recovery:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get('FontFile'))
            length1 = int_value(self.fontfile['Length1'])
            header = self.fontfile.get_data()[:length1]
            self.cid2unicode = \
                Type1FontHeaderParser(BytesIO(header)).get_encoding()
        return

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont
class PDFTrueTypeFont(PDFType1Font):
    """Same construction as ``PDFType1Font``; only the ``repr`` differs."""

    def __repr__(self):
        basefont = self.basefont
        return '<PDFTrueTypeFont: basefont=%r>' % basefont
class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale comes from its own ``FontMatrix`` and whose
    ascent/descent come from its bounding box."""

    def __init__(self, rsrcmgr, spec):
        """
        :param rsrcmgr: not used by this constructor
        :param spec: font dictionary
        """
        firstchar = int_value(spec.get('FirstChar', 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get('Widths', [0]*256))
        widths = {offset+firstchar: width
                  for offset, width in enumerate(width_list)}
        if 'FontDescriptor' not in spec:
            # No descriptor: synthesise one around the font's own bbox.
            descriptor = {'Ascent': 0, 'Descent': 0,
                          'FontBBox': spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        super().__init__(descriptor, widths, spec)
        self.matrix = tuple(list_value(spec.get('FontMatrix')))
        # The bbox replaces whatever ascent/descent the base class set.
        (_, bottom, _, top) = self.bbox
        self.descent = bottom
        self.ascent = top
        scale = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scale
        return

    def __repr__(self):
        return '<PDFType3Font>'
class PDFCIDFont(PDFFont):
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
try:
self.basefont = literal_name(spec['BaseFont'])
except KeyError:
if strict:
raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
cid_registry = resolve1(
self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1")
cid_ordering = resolve1(
self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
self.cmap = self.get_cmap_from_spec(spec, strict)
try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
if strict:
raise PDFFontError('FontDescriptor is missing')
descriptor = {}
ttf = None
if 'FontFile2' in descriptor:
self.fontfile = stream_value(descriptor.get('FontFile2'))
ttf = TrueTypeFont(self.basefont,
BytesIO(self.fontfile.get_data()))
self.unicode_map = None
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
if ttf:
try:
self.unicode_map = ttf.create_unicode_map()
except TrueTypeFont.CMapNotFound:
pass
else:
try:
self.unicode_map = CMapDB.get_unicode_map(
self.cidcoding, self.cmap.is_vertical())
except CMapDB.CMapNotFound:
pass
self.vertical = self.cmap.is_vertical()
if self.vertical:
# writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', [])))
self.disps = {cid: (vx, vy)
for (cid, (_, (vx, vy))) in widths.items()}
(vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None, vy)
widths = {cid: w for (cid, (w, _)) in widths.items()}
default_width = w
else:
# writing mode: horizontal
self.disps = {}
self.default_disp = 0
widths = get_widths(list_value(spec.get('W', [])))
default_width = spec.get('DW', 1000)
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return
def get_cmap_from_spec(self, spec, strict):
"""Get cmap from font specification
For certain PDFs, Encoding Type isn't mentioned as an attribute of
Encoding but as an attribute of CMapName, where CMapName is an
attribute of spec['Encoding'].
The horizontal/vertical modes are mentioned with different name
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
"""
cmap_name = self._get_cmap_name(spec, strict)
try:
return CMapDB.get_cmap(cmap_name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
return CMap()
@staticmethod
def _get_cmap_name(spec, strict):
"""Get cmap name from font specification"""
cmap_name = 'unknown' # default value
try:
spec_encoding = spec['Encoding']
if hasattr(spec_encoding, 'name'):
cmap_name = literal_name(spec['Encoding'])
else:
cmap_name = literal_name(spec_encoding['CMapName'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
if type(cmap_name) is PDFStream:
if 'CMapName' in cmap_name:
cmap_name = cmap_name.get('CMapName').name
else:
if strict:
raise PDFFontError('CMapName unspecified for encoding')
cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
return cmap_name
def __repr__(self):
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
.format(self.basefont, self.cidcoding)
def is_vertical(self):
return self.vertical
def is_multibyte(self):
return True
def decode(self, bytes):
return self.cmap.decode(bytes)
def char_disp(self, cid):
"Returns an integer for horizontal fonts, a tuple for vertical fonts."
return self.disps.get(cid, self.default_disp)
def to_unichr(self, cid):
    """Map ``cid`` to its Unicode string via ``self.unicode_map``.

    :raises PDFUnicodeNotDefined: if there is no (truthy) unicode map or
        the map raises KeyError for ``cid``.
    """
    try:
        umap = self.unicode_map
        if umap:
            return umap.get_unichr(cid)
        # No usable map: take the same path as a failed lookup.
        raise KeyError(cid)
    except KeyError:
        raise PDFUnicodeNotDefined(self.cidcoding, cid)
def main(argv):
    """Parse each file named on the command line as a CFF font and print it.

    :param argv: argument vector; ``argv[0]`` is skipped, every following
        entry is a path opened in binary mode.
    :return: None.
    """
    for fname in argv[1:]:
        # ``with`` closes the file even when parsing or printing raises.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
# Script entry point: run main() on the command-line arguments and use its
# return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
-943
View File
@@ -1,943 +0,0 @@
import re
import logging
from io import BytesIO
from .cmapdb import CMapDB
from .cmapdb import CMap
from .psparser import PSTypeError
from .psparser import PSEOF
from .psparser import PSKeyword
from .psparser import literal_name
from .psparser import keyword_name
from .psparser import PSStackParser
from .psparser import LIT
from .psparser import KWD
from . import settings
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef
from .pdftypes import resolve1
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdftypes import stream_value
from .pdffont import PDFFontError
from .pdffont import PDFType1Font
from .pdffont import PDFTrueTypeFont
from .pdffont import PDFType3Font
from .pdffont import PDFCIDFont
from .pdfcolor import PDFColorSpace
from .pdfcolor import PREDEFINED_COLORSPACE
from .utils import choplist
from .utils import mult_matrix
from .utils import MATRIX_IDENTITY
log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """PDFException subclass named for resource-related failures."""
class PDFInterpreterError(PDFException):
    """Raised by the page interpreter in strict mode (undefined font,
    colorspace or xobject, or an unknown operator)."""
# Literal names this module compares against with ``is`` (ProcSet entries,
# the font /Type, and XObject /Subtype values).
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
class PDFTextState:
    """Mutable record of the text-related parameters of a content stream.

    Holds the current font and size, character/word spacing, horizontal
    scaling, leading, rendering mode and rise, plus the text matrix and
    the text line position.
    """

    def __init__(self):
        self.font = None
        self.fontsize = 0
        self.charspace = 0
        self.wordspace = 0
        self.scaling = 100
        self.leading = 0
        self.render = 0
        self.rise = 0
        # reset() creates self.matrix and self.linematrix.
        self.reset()

    def __repr__(self):
        values = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        template = ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, '
                    'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, '
                    'matrix=%r, linematrix=%r>')
        return template % values

    def copy(self):
        """Return a new PDFTextState carrying the same attribute values."""
        clone = PDFTextState()
        for attr in ('font', 'fontsize', 'charspace', 'wordspace',
                     'scaling', 'leading', 'render', 'rise',
                     'matrix', 'linematrix'):
            setattr(clone, attr, getattr(self, attr))
        return clone

    def reset(self):
        """Set the text matrix to identity and the line position to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
class PDFGraphicState:
    """Mutable record of the graphics-state parameters set by path/colour
    operators: line width, cap, join, miter limit, dash, rendering intent,
    flatness and the stroking / non-stroking colours."""

    def __init__(self):
        self.linewidth = 0
        self.linecap = None
        self.linejoin = None
        self.miterlimit = None
        self.dash = None
        self.intent = None
        self.flatness = None
        # stroking color
        self.scolor = None
        # non stroking color
        self.ncolor = None

    def copy(self):
        """Return a new PDFGraphicState carrying the same attribute values."""
        clone = PDFGraphicState()
        for attr in ('linewidth', 'linecap', 'linejoin', 'miterlimit',
                     'dash', 'intent', 'flatness', 'scolor', 'ncolor'):
            setattr(clone, attr, getattr(self, attr))
        return clone

    def __repr__(self):
        values = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness,
                  self.scolor, self.ncolor)
        template = ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                    ' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
                    ' stroking color=%r, non stroking color=%r>')
        return template % values
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching=True):
        self.caching = caching
        self._cached_fonts = {}

    def get_procset(self, procs):
        """Walk the ProcSet entries; no entry currently has any effect."""
        for proc in procs:
            if proc is LITERAL_PDF or proc is LITERAL_TEXT:
                continue
        return

    def get_cmap(self, cmapname, strict=False):
        """Return the named CMap, or an empty CMap() when it is unknown
        and ``strict`` is false (the lookup error propagates otherwise)."""
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            return CMap()

    def get_font(self, objid, spec):
        """Return the font object for ``spec``, building it if necessary.

        :param objid: object id used as the cache key; a falsy id
            bypasses the cache entirely.
        :param spec: the font dictionary.
        :raises PDFFontError: in strict mode, when /Type is not /Font or
            the subtype is missing or unrecognised.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.info('get_font: create: objid=%r, spec=%r', objid, spec)
        if settings.STRICT and spec['Type'] is not LITERAL_FONT:
            raise PDFFontError('Type is not /Font')

        # Create a Font object.
        if 'Subtype' in spec:
            subtype = literal_name(spec['Subtype'])
        elif settings.STRICT:
            raise PDFFontError('Font Subtype is not specified.')
        else:
            subtype = 'Type1'

        if subtype in ('Type1', 'MMType1'):
            font = PDFType1Font(self, spec)
        elif subtype == 'TrueType':
            font = PDFTrueTypeFont(self, spec)
        elif subtype == 'Type3':
            font = PDFType3Font(self, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            font = PDFCIDFont(self, spec)
        elif subtype == 'Type0':
            # Composite font: build from the first descendant, carrying
            # over the parent's Encoding and ToUnicode entries.
            descendants = list_value(spec['DescendantFonts'])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ('Encoding', 'ToUnicode'):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        else:
            if settings.STRICT:
                raise PDFFontError('Invalid Font spec: %r' % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
class PDFContentParser(PSStackParser):
    """Stack parser that reads a list of content streams as one input.

    The streams are opened one after another as in-memory ``BytesIO``
    objects.  ``do_keyword`` additionally turns an inline image
    (``BI ... ID <data> EI``) into a ``PDFStream`` object.
    """

    def __init__(self, streams):
        self.streams = streams
        # Index of the next entry of ``streams`` to open.
        self.istream = 0
        # The base parser gets no file object; fillfp() supplies one.
        PSStackParser.__init__(self, None)
        return

    def fillfp(self):
        """Open the next stream as ``self.fp`` if none is currently open.

        :raises PSEOF: when every stream has already been consumed.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF('Unexpected EOF, file truncated?')
            self.fp = BytesIO(strm.get_data())
        return

    def seek(self, pos):
        """Make sure a stream is open, then seek within it."""
        self.fillfp()
        PSStackParser.seek(self, pos)
        return

    def fillbuf(self):
        """Refill ``self.buf`` once it is exhausted, moving on to the next
        stream whenever the current one yields no more data."""
        if self.charpos < len(self.buf):
            return
        while 1:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is drained; drop it so fillfp() opens the next.
            self.fp = None
        self.charpos = 0
        return

    def get_inline_data(self, pos, target=b'EI'):
        """Collect raw bytes from ``pos`` up to ``target`` followed by
        whitespace.

        :param pos: position to seek to before scanning.
        :param target: terminating keyword (``b'EI'`` by default).
        :return: ``(pos, data)`` where ``data`` excludes the terminator,
            the whitespace after it, and one trailing line break.
        """
        self.seek(pos)
        # ``i`` counts how many bytes of ``target`` have matched so far;
        # len(target)+1 means target plus one whitespace byte were seen.
        i = 0
        data = b''
        while i <= len(target):
            self.fillbuf()
            if i:
                # Mid-match: consume one byte and advance or reset ``i``.
                c = self.buf[self.charpos]
                c = bytes((c,))
                data += c
                self.charpos += 1
                if len(target) <= i and c.isspace():
                    i += 1
                elif i < len(target) and c == (bytes((target[i],))):
                    i += 1
                else:
                    i = 0
            else:
                # Not matching: jump to the next occurrence of target[0].
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos:j+1]
                    self.charpos = j+1
                    i = 1
                except ValueError:
                    # Not in this buffer: keep it all and refill.
                    data += self.buf[self.charpos:]
                    self.charpos = len(self.buf)
        data = data[:-(len(target)+1)]  # strip the last part
        # Drop a single trailing CRLF, CR or LF.
        data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
        return (pos, data)

    def flush(self):
        """Move everything on the parser stack into the results."""
        self.add_results(*self.popall())
        return

    KEYWORD_BI = KWD(b'BI')
    KEYWORD_ID = KWD(b'ID')
    KEYWORD_EI = KWD(b'EI')

    def do_keyword(self, pos, token):
        """Handle the inline-image keywords; push any other keyword as-is.

        ``BI`` opens an 'inline' context.  ``ID`` closes it, builds a dict
        from the collected key/value pairs, reads the image data and pushes
        a ``PDFStream`` followed by the ``EI`` keyword.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, 'inline')
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type('inline')
                if len(objs) % 2 != 0:
                    error_msg = 'Invalid dictionary construct: {!r}' \
                        .format(objs)
                    raise PSTypeError(error_msg)
                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
                # Image data starts after 'ID' and one separator byte.
                (pos, data) = self.get_inline_data(pos+len(b'ID '))
                obj = PDFStream(d, data)
                self.push((pos, obj))
                self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                # Outside strict mode a malformed inline image is dropped.
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
        return
class PDFPageInterpreter:
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary

    Each operator ``X`` of a content stream is dispatched by ``execute``
    to a method named ``do_X`` (with ``*``, ``"`` and ``'`` spelled
    ``_a``, ``_w`` and ``_q``).  The number of operands popped for an
    operator is taken from the method's positional parameter count, so
    the ``do_*`` signatures are part of the dispatch contract.
    """

    def __init__(self, rsrcmgr, device):
        self.rsrcmgr = rsrcmgr
        self.device = device

    def dup(self):
        """Return a new interpreter sharing this one's manager and device."""
        return self.__class__(self.rsrcmgr, self.device)

    def init_resources(self, resources):
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == 'ICCBased' and isinstance(spec, list) \
                    and 2 <= len(spec):
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            elif name == 'DeviceN' and isinstance(spec, list) \
                    and 2 <= len(spec):
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for (k, v) in dict_value(resources).items():
            log.debug('Resource: %r: %r', k, v)
            if k == 'Font':
                for (fontid, spec) in dict_value(v).items():
                    # Indirect references carry an object id that the
                    # resource manager uses as its cache key.
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
            elif k == 'ColorSpace':
                for (csid, spec) in dict_value(v).items():
                    self.csmap[csid] = get_colorspace(resolve1(spec))
            elif k == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(v))
            elif k == 'XObject':
                for (xobjid, xobjstrm) in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm
        return

    def init_state(self, ctm):
        """Initialize the text and graphic states for rendering a page."""
        self.gstack = []  # stack for graphical states.
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath = []
        # argstack: stack for command arguments.
        self.argstack = []
        # set some global states.
        self.scs = self.ncs = None
        if self.csmap:
            self.scs = self.ncs = next(iter(self.csmap.values()))
        return

    def push(self, obj):
        """Push an operand onto the argument stack."""
        self.argstack.append(obj)
        return

    def pop(self, n):
        """Remove and return the last ``n`` operands (fewer if the stack is
        shorter); ``n == 0`` returns an empty list."""
        if n == 0:
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self):
        """Return ``(ctm, textstate copy, graphicstate copy)``."""
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(self, state):
        """Restore a tuple produced by ``get_current_state``."""
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)
        return

    def do_q(self):
        """Save graphics state"""
        self.gstack.append(self.get_current_state())
        return

    def do_Q(self):
        """Restore graphics state"""
        if self.gstack:
            self.set_current_state(self.gstack.pop())
        return

    def do_cm(self, a1, b1, c1, d1, e1, f1):
        """Concatenate matrix to current transformation matrix"""
        self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
        self.device.set_ctm(self.ctm)
        return

    def do_w(self, linewidth):
        """Set line width"""
        self.graphicstate.linewidth = linewidth
        return

    def do_J(self, linecap):
        """Set line cap style"""
        self.graphicstate.linecap = linecap
        return

    def do_j(self, linejoin):
        """Set line join style"""
        self.graphicstate.linejoin = linejoin
        return

    def do_M(self, miterlimit):
        """Set miter limit"""
        self.graphicstate.miterlimit = miterlimit
        return

    def do_d(self, dash, phase):
        """Set line dash pattern"""
        self.graphicstate.dash = (dash, phase)
        return

    def do_ri(self, intent):
        """Set color rendering intent"""
        self.graphicstate.intent = intent
        return

    def do_i(self, flatness):
        """Set flatness tolerance"""
        self.graphicstate.flatness = flatness
        return

    def do_gs(self, name):
        """Set parameters from graphics state parameter dictionary"""
        # todo
        return

    def do_m(self, x, y):
        """Begin new subpath"""
        self.curpath.append(('m', x, y))
        return

    def do_l(self, x, y):
        """Append straight line segment to path"""
        self.curpath.append(('l', x, y))
        return

    def do_c(self, x1, y1, x2, y2, x3, y3):
        """Append curved segment to path (three control points)"""
        self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
        return

    def do_v(self, x2, y2, x3, y3):
        """Append curved segment to path (initial point replicated)"""
        self.curpath.append(('v', x2, y2, x3, y3))
        return

    def do_y(self, x1, y1, x3, y3):
        """Append curved segment to path (final point replicated)"""
        self.curpath.append(('y', x1, y1, x3, y3))
        return

    def do_h(self):
        """Close subpath"""
        self.curpath.append(('h',))
        return

    def do_re(self, x, y, w, h):
        """Append rectangle to path"""
        self.curpath.append(('m', x, y))
        self.curpath.append(('l', x+w, y))
        self.curpath.append(('l', x+w, y+h))
        self.curpath.append(('l', x, y+h))
        self.curpath.append(('h',))
        return

    def do_S(self):
        """Stroke path"""
        self.device.paint_path(self.graphicstate, True, False, False,
                               self.curpath)
        self.curpath = []
        return

    def do_s(self):
        """Close and stroke path"""
        self.do_h()
        self.do_S()
        return

    def do_f(self):
        """Fill path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, False, True, False,
                               self.curpath)
        self.curpath = []
        return

    def do_F(self):
        """Fill path using nonzero winding number rule (obsolete)"""
        return self.do_f()

    def do_f_a(self):
        """Fill path using even-odd rule"""
        self.device.paint_path(self.graphicstate, False, True, True,
                               self.curpath)
        self.curpath = []
        return

    def do_B(self):
        """Fill and stroke path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, True, True, False,
                               self.curpath)
        self.curpath = []
        return

    def do_B_a(self):
        """Fill and stroke path using even-odd rule"""
        self.device.paint_path(self.graphicstate, True, True, True,
                               self.curpath)
        self.curpath = []
        return

    def do_b(self):
        """Close, fill, and stroke path using nonzero winding number rule"""
        self.do_h()
        self.do_B()
        return

    def do_b_a(self):
        """Close, fill, and stroke path using even-odd rule"""
        self.do_h()
        self.do_B_a()
        return

    def do_n(self):
        """End path without filling or stroking"""
        self.curpath = []
        return

    def do_W(self):
        """Set clipping path using nonzero winding number rule"""
        return

    def do_W_a(self):
        """Set clipping path using even-odd rule"""
        return

    def do_CS(self, name):
        """Set color space for stroking operations

        Introduced in PDF 1.1
        """
        try:
            self.scs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        return

    def do_cs(self, name):
        """Set color space for nonstroking operations"""
        try:
            self.ncs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        return

    def do_G(self, gray):
        """Set gray level for stroking operations"""
        self.graphicstate.scolor = gray
        return

    def do_g(self, gray):
        """Set gray level for nonstroking operations"""
        self.graphicstate.ncolor = gray
        return

    def do_RG(self, r, g, b):
        """Set RGB color for stroking operations"""
        self.graphicstate.scolor = (r, g, b)
        return

    def do_rg(self, r, g, b):
        """Set RGB color for nonstroking operations"""
        self.graphicstate.ncolor = (r, g, b)
        return

    def do_K(self, c, m, y, k):
        """Set CMYK color for stroking operations"""
        self.graphicstate.scolor = (c, m, y, k)
        return

    def do_k(self, c, m, y, k):
        """Set CMYK color for nonstroking operations"""
        self.graphicstate.ncolor = (c, m, y, k)
        return

    def do_SCN(self):
        """Set color for stroking operations."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError('No colorspace specified!')
            n = 1
        # The operand count depends on the current stroking colorspace,
        # so the operands are popped here rather than by execute().
        self.graphicstate.scolor = self.pop(n)
        return

    def do_scn(self):
        """Set color for nonstroking operations"""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError('No colorspace specified!')
            n = 1
        self.graphicstate.ncolor = self.pop(n)
        return

    def do_SC(self):
        """Set color for stroking operations"""
        self.do_SCN()
        return

    def do_sc(self):
        """Set color for nonstroking operations"""
        self.do_scn()
        return

    def do_sh(self, name):
        """Paint area defined by shading pattern"""
        return

    def do_BT(self):
        """Begin text object

        Initializing the text matrix, Tm, and the text line matrix, Tlm, to
        the identity matrix. Text objects cannot be nested; a second BT cannot
        appear before an ET.
        """
        self.textstate.reset()
        return

    def do_ET(self):
        """End a text object"""
        return

    def do_BX(self):
        """Begin compatibility section"""
        return

    def do_EX(self):
        """End compatibility section"""
        return

    def do_MP(self, tag):
        """Define marked-content point"""
        self.device.do_tag(tag)
        return

    def do_DP(self, tag, props):
        """Define marked-content point with property list"""
        self.device.do_tag(tag, props)
        return

    def do_BMC(self, tag):
        """Begin marked-content sequence"""
        self.device.begin_tag(tag)
        return

    def do_BDC(self, tag, props):
        """Begin marked-content sequence with property list"""
        self.device.begin_tag(tag, props)
        return

    def do_EMC(self):
        """End marked-content sequence"""
        self.device.end_tag()
        return

    def do_Tc(self, space):
        """Set character spacing.

        Character spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units.
        """
        self.textstate.charspace = space
        return

    def do_Tw(self, space):
        """Set the word spacing.

        Word spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units
        """
        self.textstate.wordspace = space
        return

    def do_Tz(self, scale):
        """Set the horizontal scaling.

        :param scale: is a number specifying the percentage of the normal width
        """
        self.textstate.scaling = scale
        return

    def do_TL(self, leading):
        """Set the text leading.

        Text leading is used only by the T*, ', and " operators.

        :param leading: a number expressed in unscaled text space units
        """
        # Stored negated; do_T_a() adds it to the text position directly.
        self.textstate.leading = -leading
        return

    def do_Tf(self, fontid, fontsize):
        """Set the text font

        :param fontid: the name of a font resource in the Font subdictionary
            of the current resource dictionary
        :param fontsize: size is a number representing a scale factor.
        """
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined Font id: %r' % fontid)
            # Unknown font id: fall back to a font built from an empty spec.
            self.textstate.font = self.rsrcmgr.get_font(None, {})
        self.textstate.fontsize = fontsize
        return

    def do_Tr(self, render):
        """Set the text rendering mode"""
        self.textstate.render = render
        return

    def do_Ts(self, rise):
        """Set the text rise

        :param rise: a number expressed in unscaled text space units
        """
        self.textstate.rise = rise
        return

    def do_Td(self, tx, ty):
        """Move text position"""
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.linematrix = (0, 0)
        return

    def do_TD(self, tx, ty):
        """Move text position and set leading"""
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.leading = ty
        self.textstate.linematrix = (0, 0)
        return

    def do_Tm(self, a, b, c, d, e, f):
        """Set text matrix and text line matrix"""
        self.textstate.matrix = (a, b, c, d, e, f)
        self.textstate.linematrix = (0, 0)
        return

    def do_T_a(self):
        """Move to start of next text line"""
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
                                 self.textstate.leading*d+f)
        self.textstate.linematrix = (0, 0)
        return

    def do_TJ(self, seq):
        """Show text, allowing individual glyph positioning"""
        if self.textstate.font is None:
            if settings.STRICT:
                raise PDFInterpreterError('No font specified!')
            return
        self.device.render_string(self.textstate, seq, self.ncs,
                                  self.graphicstate.copy())
        return

    def do_Tj(self, s):
        """Show text"""
        self.do_TJ([s])
        return

    def do__q(self, s):
        """Move to next line and show text

        The ' (single quote) operator.
        """
        self.do_T_a()
        self.do_TJ([s])
        return

    def do__w(self, aw, ac, s):
        """Set word and character spacing, move to next line, and show text

        The " (double quote) operator.  Equivalent to ``aw Tw ac Tc s '``.
        """
        self.do_Tw(aw)
        self.do_Tc(ac)
        # Delegate to the ' operator so the line advance actually happens
        # before the string is shown.
        self.do__q(s)
        return

    def do_BI(self):
        """Begin inline image object"""
        return

    def do_ID(self):
        """Begin inline image data"""
        return

    def do_EI(self, obj):
        """End inline image object"""
        if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj:
            # Inline images have no name; use the object's id() as one.
            iobjid = str(id(obj))
            self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(iobjid, obj)
            self.device.end_figure(iobjid)
        return

    def do_Do(self, xobjid):
        """Invoke named XObject"""
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
            return
        log.info('Processing xobj: %r', xobj)
        subtype = xobj.get('Subtype')
        if subtype is LITERAL_FORM and 'BBox' in xobj:
            # Form XObjects are rendered by a fresh interpreter so this
            # interpreter's state is left untouched.
            interpreter = self.dup()
            bbox = list_value(xobj['BBox'])
            matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get('Resources')
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(resources, [xobj],
                                        ctm=mult_matrix(matrix, self.ctm))
            self.device.end_figure(xobjid)
        elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass
        return

    def process_page(self, page):
        """Render one page: derive the initial CTM from the page's media
        box and rotation, then interpret its content streams."""
        log.info('Processing page: %r', page)
        (x0, y0, x1, y1) = page.mediabox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)
        return

    def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
        """Render the content streams.

        This method may be called recursively.
        """
        log.info('render_contents: resources=%r, streams=%r, ctm=%r',
                 resources, streams, ctm)
        self.init_resources(resources)
        self.init_state(ctm)
        self.execute(list_value(streams))
        return

    def execute(self, streams):
        """Parse ``streams`` and dispatch every operator to its ``do_*``
        method; non-keyword objects are pushed as operands.

        :raises PDFInterpreterError: in strict mode, for an operator that
            has no matching method.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while 1:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\
                    .replace("'", '_q')
                if hasattr(self, method):
                    func = getattr(self, method)
                    # Operand count = positional parameters minus ``self``.
                    nargs = func.__code__.co_argcount-1
                    if nargs:
                        args = self.pop(nargs)
                        log.debug('exec: %s %r', name, args)
                        # Too few operands: the operator is skipped.
                        if len(args) == nargs:
                            func(*args)
                    else:
                        log.debug('exec: %s', name)
                        func()
                else:
                    if settings.STRICT:
                        error_msg = 'Unknown operator: %r' % name
                        raise PDFInterpreterError(error_msg)
            else:
                self.push(obj)
        return
-148
View File
@@ -1,148 +0,0 @@
import logging
import warnings
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFTextExtractionNotAllowedWarning
log = logging.getLogger(__name__)
# Predefined literal names for the /Type values that create_pages()
# compares against with ``is``.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a list of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        """
        self.doc = doc
        self.pageid = pageid
        page_attrs = dict_value(attrs)
        self.attrs = page_attrs
        self.lastmod = resolve1(page_attrs.get('LastModified'))
        self.resources = resolve1(page_attrs.get('Resources', dict()))
        self.mediabox = resolve1(page_attrs['MediaBox'])
        # Without an explicit CropBox the media box doubles as crop box.
        self.cropbox = (resolve1(page_attrs['CropBox'])
                        if 'CropBox' in page_attrs else self.mediabox)
        # Normalise the rotation into the range 0..359.
        self.rotate = (int_value(page_attrs.get('Rotate', 0))+360) % 360
        self.annots = page_attrs.get('Annots')
        self.beads = page_attrs.get('B')
        streams = resolve1(page_attrs['Contents']) \
            if 'Contents' in page_attrs else []
        # A single stream is wrapped so ``contents`` is always a list.
        if not isinstance(streams, list):
            streams = [streams]
        self.contents = streams

    def __repr__(self):
        template = '<PDFPage: Resources={!r}, MediaBox={!r}>'
        return template.format(self.resources, self.mediabox)

    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    @classmethod
    def create_pages(cls, document):
        """Yield a page object for every page of ``document``.

        The page tree below the catalog's /Pages entry is walked first;
        if that yields nothing, every object listed in the document's
        xrefs whose /Type is /Page is used instead.
        """
        def walk(node, parent):
            # ``node`` is either a bare object id or an object reference.
            if isinstance(node, int):
                objid = node
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = node.objid
                tree = dict_value(node).copy()
            # Copy down inheritable attributes the node does not override.
            for (key, value) in parent.items():
                if key in cls.INHERITABLE_ATTRS and key not in tree:
                    tree[key] = value

            node_type = tree.get('Type')
            if node_type is None and not settings.STRICT:  # See #64
                node_type = tree.get('type')

            if node_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                for kid in list_value(tree['Kids']):
                    yield from walk(kid, tree)
            elif node_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        found_any = False
        if 'Pages' in document.catalog:
            root = document.catalog['Pages']
            for (objid, tree) in walk(root, document.catalog):
                yield cls(document, objid, tree)
                found_any = True
        if not found_any:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        candidate = document.getobj(objid)
                        if isinstance(candidate, dict) \
                                and candidate.get('Type') is LITERAL_PAGE:
                            yield cls(document, objid, candidate)
                    except PDFObjectNotFound:
                        pass
        return

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=False):
        """Parse the PDF in ``fp`` and yield its pages.

        :param fp: file object containing the PDF.
        :param pagenos: zero-based page numbers to yield; a falsy value
            selects every page.
        :param maxpages: stop once this many pages of the document have
            been visited (0 means no limit).
        :param password: password handed to PDFDocument.
        :param caching: caching flag handed to PDFDocument.
        :param check_extractable: raise instead of warn when the document
            says text extraction is not allowed.
        :raises PDFTextExtractionNotAllowed: see ``check_extractable``.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = 'Text extraction is not allowed: %r' % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            warning_msg = 'The PDF %r contains a metadata field '\
                          'indicating that it should not allow ' \
                          'text extraction. Ignoring this field ' \
                          'and proceeding.' % fp
            warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
        # Process each page contained in the document.
        for (index, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (index not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= index+1:
                break
        return
-170
View File
@@ -1,170 +0,0 @@
import logging
from io import BytesIO
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import KWD
from . import settings
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef
from .pdftypes import int_value
from .pdftypes import dict_value
log = logging.getLogger(__name__)
class PDFSyntaxError(PDFException):
    """Raised by the parsers in strict mode for malformed PDF syntax
    (missing /Length, unexpected EOF, stray obj/endobj in a stream)."""
class PDFParser(PSStackParser):
    """
    PDFParser fetch PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    def __init__(self, fp):
        PSStackParser.__init__(self, fp)
        # Document used for object references and stream decryption;
        # assigned later through set_document().
        self.doc = None
        # When true, a stream's /Length is ignored and its data is
        # collected line by line up to 'endstream' (see do_keyword).
        self.fallback = False
        return

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc
        return

    KEYWORD_R = KWD(b'R')
    KEYWORD_NULL = KWD(b'null')
    KEYWORD_ENDOBJ = KWD(b'endobj')
    KEYWORD_STREAM = KWD(b'stream')
    KEYWORD_XREF = KWD(b'xref')
    KEYWORD_STARTXREF = KWD(b'startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords."""

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # A malformed reference is dropped without being pushed.
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # Data begins right after the 'stream' keyword line.
            pos += len(line)
            self.fp.seek(pos)
            data = bytearray(self.fp.read(objlen))
            self.seek(pos+objlen)
            # Scan forward to 'endstream', growing objlen by whatever lies
            # between the declared length and the keyword; the bytes
            # themselves are appended only in fallback mode.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if settings.STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if b'endstream' in line:
                    i = line.index(b'endstream')
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            data = bytes(data)
            self.seek(pos+objlen)
            # XXX limit objlen not to exceed object boundary
            log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
                      objlen, dic, data[:10])
            # NOTE(review): reads self.doc.decipher, so set_document() must
            # have been called before a stream is parsed — confirm callers.
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return
class PDFStreamParser(PDFParser):
    """
    PDFStreamParser is used to parse PDF content streams
    that is contained in each page and has instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data):
        PDFParser.__init__(self, BytesIO(data))

    def flush(self):
        """Move everything on the parser stack into the results."""
        pending = self.popall()
        self.add_results(*pending)

    KEYWORD_OBJ = KWD(b'obj')

    def do_keyword(self, pos, token):
        """Handle ``R`` references, reject obj/endobj in strict mode, and
        push every other keyword unchanged."""
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                ref = PDFObjRef(self.doc, int(objid), int(genno))
                self.push((pos, ref))
            except PSSyntaxError:
                pass
        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError('Keyword endobj found in stream')
        else:
            # others
            self.push((pos, token))
        return
-323
View File
@@ -1,323 +0,0 @@
import zlib
import logging
from .lzw import lzwdecode
from .ascii85 import ascii85decode
from .ascii85 import asciihexdecode
from .runlength import rldecode
from .ccitt import ccittfaxdecode
from .psparser import PSException
from .psparser import PSObject
from .psparser import LIT
from . import settings
from .utils import apply_png_predictor
from .utils import isnumber
log = logging.getLogger(__name__)
# Literal name of the Crypt filter.
LITERAL_CRYPT = LIT('Crypt')
# Filter names. Each tuple holds the full name followed by its
# abbreviation from PDF 4.8.6, "Inline Images"; JBIG2Decode is listed
# with its full name only.
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
class PDFObject(PSObject):
    """Base class for PDF-level objects such as PDFObjRef."""
class PDFException(PSException):
    """Common base class of the PDF-specific exceptions defined here."""
class PDFTypeError(PDFException):
    """Raised in strict mode by the ``*_value`` helpers when a resolved
    object does not have the required type."""
class PDFValueError(PDFException):
    """Raised in strict mode for an invalid value, e.g. object id 0."""
class PDFObjectNotFound(PDFException):
    """Signals that a referenced object could not be obtained from the
    document; PDFObjRef.resolve() turns it into a default value."""
class PDFNotImplementedError(PDFException):
    """PDFException subclass named for unimplemented features."""
class PDFObjRef(PDFObject):
    """Indirect reference to an object of a document, by object id."""

    def __init__(self, doc, objid, _):
        """
        :param doc: document that ``resolve`` asks for the object.
        :param objid: id of the referenced object.
        :param _: third positional argument (callers pass the generation
            number); it is not stored.
        :raises PDFValueError: in strict mode, when ``objid`` is 0.
        """
        if objid == 0 and settings.STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self, default=None):
        """Return the referenced object, or ``default`` if the document
        reports it as not found."""
        try:
            target = self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
        return target
def resolve1(x, default=None):
    """Resolves an object.

    If this is an array or dictionary, it may still contains
    some indirect objects inside.

    :param x: any object; references are followed until a non-reference
        is reached.
    :param default: value substituted for a reference that cannot be
        resolved.
    """
    current = x
    while True:
        if not isinstance(current, PDFObjRef):
            return current
        current = current.resolve(default=default)
def resolve_all(x, default=None):
    """Resolve an object and, recursively, everything nested inside it.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    This walks the whole structure, so it can be slow.
    """
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve(default=default)
    if isinstance(obj, list):
        return [resolve_all(item, default=default) for item in obj]
    if isinstance(obj, dict):
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for key, item in obj.items():
            obj[key] = resolve_all(item, default=default)
    return obj
def decipher_all(decipher, objid, genno, x):
    """Apply ``decipher(objid, genno, value)`` to every bytes value in ``x``.

    Lists are rebuilt, dictionaries are updated in place, and any other
    value is returned untouched.
    """
    if isinstance(x, bytes):
        return decipher(objid, genno, x)

    def recurse(item):
        return decipher_all(decipher, objid, genno, item)

    if isinstance(x, list):
        return [recurse(item) for item in x]
    if isinstance(x, dict):
        for key, item in x.items():
            x[key] = recurse(item)
    return x
def int_value(x):
    """Resolve ``x`` and return it if it is an int.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if settings.STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
def float_value(x):
    """Resolve ``x`` and return it if it is a float.

    Otherwise raise PDFTypeError in strict mode, or return 0.0.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if settings.STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
def num_value(x):
    """Resolve ``x`` and return it if ``isnumber`` accepts it.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    value = resolve1(x)
    if isnumber(value):
        return value
    if settings.STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
def uint_value(x, n_bits):
    """Resolve number and interpret it as a two's-complement unsigned number.

    :param x: value (or indirect reference) handed to ``int_value``.
    :param n_bits: width of the two's-complement representation.
    :return: ``x`` itself when it is non-negative, otherwise
        ``x + 2**n_bits``.
    """
    x = int_value(x)
    # Zero is already its own unsigned representation; adding 2**n_bits to
    # it would produce a value that does not fit in n_bits.
    if x >= 0:
        return x
    return x + 2**n_bits
def str_value(x):
    """Resolve ``x`` and return it if it is a bytes string.

    :raises PDFTypeError: in strict mode, when the value is not bytes.
    :return: the bytes value, or ``b''`` in non-strict mode when the value
        has another type.
    """
    x = resolve1(x)
    if not isinstance(x, bytes):
        if settings.STRICT:
            raise PDFTypeError('String required: %r' % x)
        # The fallback has the same type as a successful result, so callers
        # can combine it with other bytes.
        return b''
    return x
def list_value(x):
    """Resolve ``x`` and return it if it is a list or tuple.

    Otherwise raise PDFTypeError in strict mode, or return a new empty list.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if settings.STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
def dict_value(x):
    """Resolve ``x`` and return it if it is a dict.

    Otherwise, in strict mode, log the problem and raise PDFTypeError;
    in non-strict mode return a new empty dict.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if settings.STRICT:
        log.error('PDFTypeError : Dict required: %r', value)
        raise PDFTypeError('Dict required: %r' % value)
    return {}
def stream_value(x):
    """Resolve ``x`` and return it if it is a PDFStream.

    Otherwise raise PDFTypeError in strict mode, or return a new empty
    stream.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if settings.STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, b'')
class PDFStream(PDFObject):
    """A stream object: an attribute dictionary plus its bytes.

    ``rawdata`` holds the bytes as given to the constructor.  After
    ``decode`` runs, ``data`` holds the result and ``rawdata`` is set to
    None.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None

    def set_objid(self, objid, genno):
        self.objid, self.genno = objid, genno

    def __repr__(self):
        if self.data is not None:
            return '<PDFStream(%r): len=%d, %r>' % \
                (self.objid, len(self.data), self.attrs)
        assert self.rawdata is not None
        return '<PDFStream(%r): raw=%d, %r>' % \
            (self.objid, len(self.rawdata), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the attribute for the first of ``names`` that is present."""
        present = (self.attrs[name] for name in names if name in self.attrs)
        return next(present, default)

    def get_filters(self):
        """Return a list of ``(filter, params)`` pairs for this stream."""
        filters = self.get_any(('F', 'Filter'))
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # One shared parameter object per filter.
            params = [params] * len(filters)
        if settings.STRICT and len(params) != len(filters):
            raise PDFException("Parameters len filter mismatch")
        # Resolve filters that expose a ``resolve`` method.
        resolved = [
            fltr.resolve()[0] if hasattr(fltr, 'resolve') else fltr
            for fltr in filters
        ]
        # A real list is returned on purpose, see
        # https://github.com/pdfminer/pdfminer.six/issues/15
        return list(zip(resolved, params))

    def decode(self):
        """Decipher and decode ``rawdata`` into ``data``."""
        assert self.data is None \
            and self.rawdata is not None, str((self.data, self.rawdata))
        payload = self.rawdata
        if self.decipher:
            # Handle encryption
            payload = self.decipher(self.objid, self.genno, payload,
                                    self.attrs)
        # With no filters the loop body never runs and the bytes are stored
        # unchanged.
        for (name, params) in self.get_filters():
            if name in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    payload = zlib.decompress(payload)
                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
                            .format(e, payload)
                        raise PDFException(error_msg)
                    payload = b''
            elif name in LITERALS_LZW_DECODE:
                payload = lzwdecode(payload)
            elif name in LITERALS_ASCII85_DECODE:
                payload = ascii85decode(payload)
            elif name in LITERALS_ASCIIHEX_DECODE:
                payload = asciihexdecode(payload)
            elif name in LITERALS_RUNLENGTH_DECODE:
                payload = rldecode(payload)
            elif name in LITERALS_CCITTFAX_DECODE:
                payload = ccittfaxdecode(payload, params)
            elif name in LITERALS_DCT_DECODE \
                    or name in LITERALS_JBIG2_DECODE:
                # DCT is probably a JPG stream; neither filter is decoded
                # here, the bytes are handed to the user as they are.
                pass
            elif name == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % name)
            # apply predictors
            if not params or 'Predictor' not in params:
                continue
            pred = int_value(params['Predictor'])
            if pred == 1:
                # no predictor
                continue
            if pred < 10:
                error_msg = 'Unsupported predictor: %r' % pred
                raise PDFNotImplementedError(error_msg)
            # PNG predictor
            colors = int_value(params.get('Colors', 1))
            columns = int_value(params.get('Columns', 1))
            raw_bits_per_component = params.get('BitsPerComponent', 8)
            bitspercomponent = int_value(raw_bits_per_component)
            payload = apply_png_predictor(pred, colors, columns,
                                          bitspercomponent, payload)
        self.data = payload
        self.rawdata = None

    def get_data(self):
        """Return the decoded bytes, decoding on first use."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        return self.rawdata
-625
View File
@@ -1,625 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import logging
from . import settings
from .utils import choplist
log = logging.getLogger(__name__)
class PSException(Exception):
    """Root of the PostScript parser error hierarchy."""


class PSEOF(PSException):
    """Input ended while more data was required."""


class PSSyntaxError(PSException):
    """The token stream does not form a valid construct."""


class PSTypeError(PSException):
    """An object did not have the required PostScript type."""


class PSValueError(PSException):
    """An object had the right type but an unacceptable value."""


class PSObject:
    """Base class for all PS or PDF-related data types."""
class PSLiteral(PSObject):
    """A PostScript literal.

    Literals act as identifiers (variable names, property names,
    dictionary keys).  They are case sensitive and written with a leading
    slash, e.g. "/Name".

    Note: never instantiate PSLiteral directly; always go through
    PSLiteralTable.intern().
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
class PSKeyword(PSObject):
    """A PostScript keyword.

    Keywords are a small set of predefined words that express commands and
    directives and mark content boundaries.

    Note: never instantiate PSKeyword directly; always go through
    PSKeywordTable.intern().
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
class PSSymbolTable:
    """Intern table for PSLiteral/PSKeyword objects.

    Each name maps to exactly one instance, so interned objects can be
    compared with the "is" operator.
    """

    def __init__(self, klass):
        self.klass = klass
        self.dict = {}

    def intern(self, name):
        """Return the instance for ``name``, creating it on first use."""
        if name not in self.dict:
            self.dict[name] = self.klass(name)
        return self.dict[name]
# Module-wide intern tables, one per symbol class.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# Short aliases: LIT(name) / KWD(name) return the interned instance.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Pre-interned keywords for the structural delimiters that
# PSStackParser.nextobject compares tokens against.
KEYWORD_PROC_BEGIN = KWD(b'{')
KEYWORD_PROC_END = KWD(b'}')
KEYWORD_ARRAY_BEGIN = KWD(b'[')
KEYWORD_ARRAY_END = KWD(b']')
KEYWORD_DICT_BEGIN = KWD(b'<<')
KEYWORD_DICT_END = KWD(b'>>')
def literal_name(x):
    """Return the name carried by a PSLiteral.

    The literal's name is decoded as UTF-8 when that is possible and
    returned unchanged otherwise.  A non-literal argument raises
    PSTypeError in strict mode and is returned as-is in non-strict mode.
    """
    if isinstance(x, PSLiteral):
        name = x.name
        try:
            return str(name, 'utf-8')
        except Exception:
            # Best effort: keep the name as it was stored.
            return name
    if settings.STRICT:
        raise PSTypeError('Literal required: {!r}'.format(x))
    return x
def keyword_name(x):
    """Return the name of a PSKeyword decoded as UTF-8 (errors ignored).

    A non-keyword argument raises PSTypeError in strict mode and is
    returned as-is in non-strict mode.
    """
    if isinstance(x, PSKeyword):
        return str(x.name, 'utf-8', 'ignore')
    if settings.STRICT:
        raise PSTypeError('Keyword required: %r' % x)
    return x
# Byte-level patterns used by PSBaseParser's tokenizer states.
# Line terminator: a single CR or LF.
EOL = re.compile(br'[\r\n]')
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')
HEX = re.compile(br'[0-9a-fA-F]')
# First byte that terminates a literal name (delimiter or whitespace).
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
# First byte that is neither whitespace nor a hex digit.
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
# Two hex digits, or any single leftover byte.
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(br'[^0-9]')
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
# Parenthesis or backslash (\134 is the octal escape for a backslash).
END_STRING = re.compile(br'[()\134]')
OCT_STRING = re.compile(br'[0-7]')
# Byte following a backslash inside a string -> value of the byte it
# stands for.
ESC_STRING = {
b'b': 8,
b't': 9,
b'n': 10,
b'f': 12,
b'r': 13,
b'(': 40,
b')': 41,
b'\\': 92
}
class PSBaseParser:
    """Most basic PostScript parser that performs only tokenization.

    The input is read from ``fp`` in chunks of ``BUFSIZ`` bytes.
    ``bufpos`` is the file offset of the current chunk and ``charpos`` the
    read position inside it.  Tokenization is a state machine:
    ``_parse1`` always points at the handler for the current state; each
    handler takes ``(buffer, index)`` and returns the index at which
    scanning continues.
    """
    BUFSIZ = 4096

    def __init__(self, fp):
        self.fp = fp
        self.seek(0)
        return

    def __repr__(self):
        return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
                                        self.bufpos)

    def flush(self):
        return

    def close(self):
        self.flush()
        return

    def tell(self):
        return self.bufpos+self.charpos

    def poll(self, pos=None, n=80):
        """Log ``n`` bytes starting at ``pos`` without moving the file.

        :param pos: file offset to inspect; ``None`` means the parser's
            current position.
        :param n: number of bytes to read for the log message.
        """
        pos0 = self.fp.tell()
        # Compare against None so that an explicit offset of 0 is honoured.
        if pos is None:
            pos = self.bufpos+self.charpos
        self.fp.seek(pos)
        log.info('poll(%d): %r', pos, self.fp.read(n))
        self.fp.seek(pos0)
        return

    def seek(self, pos):
        """Seeks the parser to the given position.
        """
        log.debug('seek: %r', pos)
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b''
        self.charpos = 0
        # reset the status for nexttoken()
        self._parse1 = self._parse_main
        self._curtoken = b''
        self._curtokenpos = 0
        self._tokens = []
        return

    def fillbuf(self):
        """Load the next chunk once the current one is used up.

        :raises PSEOF: when the file has no more data.
        """
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF('Unexpected EOF')
        self.charpos = 0
        return

    def nextline(self):
        """Fetches a next line that ends either with \\r or \\n.

        :return: ``(position, line)`` with the terminator included.
        """
        linebuf = b''
        linepos = self.bufpos + self.charpos
        eol = False
        while 1:
            self.fillbuf()
            if eol:
                c = self.buf[self.charpos:self.charpos+1]
                # handle b'\r\n'
                if c == b'\n':
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos:m.end(0)]
                self.charpos = m.end(0)
                if linebuf[-1:] == b'\r':
                    # A following b'\n' may belong to the same terminator.
                    eol = True
                else:
                    break
            else:
                linebuf += self.buf[self.charpos:]
                self.charpos = len(self.buf)
        log.debug('nextline: %r, %r', linepos, linebuf)
        return (linepos, linebuf)

    def revreadlines(self):
        """Fetches a next line backward.

        This is used to locate the trailers at the end of a file.
        """
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = b''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos-pos)
            if not s:
                break
            while 1:
                n = max(s.rfind(b'\r'), s.rfind(b'\n'))
                if n == -1:
                    # No terminator in the rest of this chunk: keep it and
                    # prepend the previous chunk on the next round.
                    buf = s + buf
                    break
                yield s[n:] + buf
                s = s[:n]
                buf = b''
        return

    def _parse_main(self, s, i):
        """Skip whitespace and pick the state for the next token."""
        m = NONSPC.search(s, i)
        if not m:
            return len(s)
        j = m.start(0)
        c = s[j:j+1]
        self._curtokenpos = self.bufpos+j
        if c == b'%':
            self._curtoken = b'%'
            self._parse1 = self._parse_comment
            return j+1
        elif c == b'/':
            self._curtoken = b''
            self._parse1 = self._parse_literal
            return j+1
        elif c in b'-+' or c.isdigit():
            self._curtoken = c
            self._parse1 = self._parse_number
            return j+1
        elif c == b'.':
            self._curtoken = c
            self._parse1 = self._parse_float
            return j+1
        elif c.isalpha():
            self._curtoken = c
            self._parse1 = self._parse_keyword
            return j+1
        elif c == b'(':
            self._curtoken = b''
            self.paren = 1
            self._parse1 = self._parse_string
            return j+1
        elif c == b'<':
            self._curtoken = b''
            self._parse1 = self._parse_wopen
            return j+1
        elif c == b'>':
            self._curtoken = b''
            self._parse1 = self._parse_wclose
            return j+1
        else:
            self._add_token(KWD(c))
            return j+1

    def _add_token(self, obj):
        self._tokens.append((self._curtokenpos, obj))
        return

    def _parse_comment(self, s, i):
        m = EOL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        self._parse1 = self._parse_main
        # We ignore comments.
        # self._tokens.append(self._curtoken)
        return j

    def _parse_literal(self, s, i):
        m = END_LITERAL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'#':
            self.hex = b''
            self._parse1 = self._parse_literal_hex
            return j+1
        try:
            self._curtoken = str(self._curtoken, 'utf-8')
        except Exception:
            # Names that are not valid UTF-8 stay as bytes.
            pass
        self._add_token(LIT(self._curtoken))
        self._parse1 = self._parse_main
        return j

    def _parse_literal_hex(self, s, i):
        """Collect up to two hex digits after ``#`` inside a literal."""
        c = s[i:i+1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return i+1
        if self.hex:
            self._curtoken += bytes((int(self.hex, 16),))
        self._parse1 = self._parse_literal
        return i

    def _parse_number(self, s, i):
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'.':
            self._curtoken += c
            self._parse1 = self._parse_float
            return j+1
        try:
            self._add_token(int(self._curtoken))
        except ValueError:
            # Unparsable text (e.g. a lone sign) yields no token.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_float(self, s, i):
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        try:
            self._add_token(float(self._curtoken))
        except ValueError:
            # Unparsable text (e.g. a lone dot) yields no token.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_keyword(self, s, i):
        m = END_KEYWORD.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        if self._curtoken == b'true':
            token = True
        elif self._curtoken == b'false':
            token = False
        else:
            token = KWD(self._curtoken)
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def _parse_string(self, s, i):
        m = END_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j:j+1]
        if c == b'\\':
            self.oct = b''
            self._parse1 = self._parse_string_1
            return j+1
        if c == b'(':
            self.paren += 1
            self._curtoken += c
            return j+1
        if c == b')':
            self.paren -= 1
            if self.paren:
                # WTF, they said balanced parens need no special treatment.
                self._curtoken += c
                return j+1
        self._add_token(self._curtoken)
        self._parse1 = self._parse_main
        return j+1

    def _parse_string_1(self, s, i):
        """Handle the byte(s) following a backslash inside a string."""
        c = s[i:i+1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return i+1
        if self.oct:
            # Three octal digits can exceed one byte (e.g. \777); keep the
            # low eight bits instead of letting bytes() raise ValueError.
            self._curtoken += bytes((int(self.oct, 8) & 0xFF,))
            self._parse1 = self._parse_string
            return i
        if c in ESC_STRING:
            self._curtoken += bytes((ESC_STRING[c],))
        elif c == b'\r' and s[i+1:i+2] == b'\n':
            # Backslash + CRLF is a line continuation: drop both bytes.
            # (A CRLF split across two buffer chunks is not detected.)
            i += 1
        self._parse1 = self._parse_string
        return i+1

    def _parse_wopen(self, s, i):
        """After ``<``: either ``<<`` (dictionary) or a hex string."""
        c = s[i:i+1]
        if c == b'<':
            self._add_token(KEYWORD_DICT_BEGIN)
            self._parse1 = self._parse_main
            i += 1
        else:
            self._parse1 = self._parse_hexstring
        return i

    def _parse_wclose(self, s, i):
        """After ``>``: emit the dictionary end on ``>>``."""
        c = s[i:i+1]
        if c == b'>':
            self._add_token(KEYWORD_DICT_END)
            i += 1
        self._parse1 = self._parse_main
        return i

    def _parse_hexstring(self, s, i):
        m = END_HEX_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        # Drop whitespace, then turn every hex pair (or single leftover
        # digit) into one byte.
        token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
                             SPC.sub(b'', self._curtoken))
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def nexttoken(self):
        """Return the next ``(position, token)`` pair.

        :raises PSEOF: when the input ends before a token is complete.
        """
        while not self._tokens:
            self.fillbuf()
            self.charpos = self._parse1(self.buf, self.charpos)
        token = self._tokens.pop(0)
        log.debug('nexttoken: %r', token)
        return token
class PSStackParser(PSBaseParser):
    """Parser that assembles tokens into arrays, dictionaries and procs.

    ``curstack`` collects ``(pos, object)`` pairs for the construct being
    built, ``context`` is the stack of enclosing constructs, and
    ``results`` holds finished objects waiting to be returned by
    ``nextobject``.
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()
        return

    def reset(self):
        self.context = []
        self.curtype = None
        self.curstack = []
        self.results = []
        return

    def seek(self, pos):
        PSBaseParser.seek(self, pos)
        self.reset()
        return

    def push(self, *objs):
        self.curstack.extend(objs)
        return

    def pop(self, n):
        """Remove and return the last ``n`` entries of the current stack."""
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        try:
            log.debug('add_results: %r', objs)
        except Exception:
            log.debug('add_results: (unprintable object)')
        self.results.extend(objs)
        return

    def start_type(self, pos, type):
        """Open a nested construct, saving the enclosing one."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug('start_type: pos=%r, type=%r', pos, type)
        return

    def end_type(self, type):
        """Close the current construct.

        :return: ``(start position, list of collected objects)``.
        :raises PSTypeError: when the construct being closed is not of
            kind ``type``.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: {!r} != {!r}'
                              .format(self.curtype, type))
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        """Hook for keywords; does nothing in this class."""
        return

    def nextobject(self):
        """Yields a list of objects.

        Arrays and dictionaries are represented as Python lists and
        dictionaries.

        :return: keywords, literals, strings, numbers, arrays and dictionaries.
        :raises PSException: when a token of an unknown kind is seen.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        error_msg = 'Invalid dictionary construct: %r' % objs
                        raise PSSyntaxError(error_msg)
                    # Entries whose value is None are left out.
                    d = {literal_name(k): v
                         for (k, v) in choplist(2, objs) if v is not None}
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
            else:
                log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
                # No exception is active here, so a bare ``raise`` would
                # surface as RuntimeError; raise a parser error instead.
                raise PSException('unknown token: pos=%r, token=%r'
                                  % (pos, token))
            if self.context:
                # Still inside a nested construct: keep reading.
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug('nextobject: %r', obj)
        except Exception:
            log.debug('nextobject: (unprintable object)')
        return obj
-40
View File
@@ -1,40 +0,0 @@
#
# RunLength decoder (Adobe version) implementation based on PDF Reference
# version 1.4 section 3.3.4.
#
# * public domain *
#
def rldecode(data):
    """
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    Truncated input is decoded as far as it goes: a literal run that is
    cut short contributes the bytes that are present, and a repeat run
    whose data byte is missing contributes nothing.

    :param data: encoded bytes.
    :return: decoded bytes.
    """
    # A bytearray keeps appending linear; repeated bytes concatenation
    # copies the whole result for every byte.
    decoded = bytearray()
    size = len(data)
    i = 0
    while i < size:
        length = data[i]
        if length == 128:
            # EOD marker: anything after it is ignored.
            break
        if length < 128:
            # Literal run of length + 1 bytes; slicing tolerates a short
            # tail where indexing would raise IndexError.
            decoded += data[i + 1:i + 2 + length]
            i += length + 2
        else:
            # Repeat run; the slice is empty when the data byte is missing.
            decoded += data[i + 1:i + 2] * (257 - length)
            i += 2
    return bytes(decoded)
-1
View File
@@ -1 +0,0 @@
# Strictness switch read as ``settings.STRICT`` by the parser and type
# helpers: when true they raise on malformed input; when false they fall
# back to a default value or skip the problem.
STRICT = False
-406
View File
@@ -1,406 +0,0 @@
"""
Miscellaneous Routines.
"""
import io
import pathlib
import struct
from html import escape
import chardet # For str encoding detection
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
# still uses 32 bits ints
INF = (1 << 31) - 1
class open_filename(object):
    """
    Context manager for "a path or an already open file".

    A str or pathlib.PurePath is opened with ``open`` (extra arguments are
    passed through) and closed on exit; an io.IOBase object is handed back
    unchanged and left open.  Anything else raises TypeError.
    """

    def __init__(self, filename, *args, **kwargs):
        target = filename
        if isinstance(target, pathlib.PurePath):
            target = str(target)
        if isinstance(target, str):
            self.closing = True
            self.file_handler = open(target, *args, **kwargs)
        elif isinstance(target, io.IOBase):
            self.closing = False
            self.file_handler = target
        else:
            raise TypeError('Unsupported input type: %s' % type(target))

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Only close what this object opened itself.
        if self.closing:
            self.file_handler.close()
        # Never suppress an exception raised inside the with-block.
        return False
def make_compat_bytes(in_str):
    """Encode a str as UTF-8 bytes; any other type fails the assertion."""
    assert isinstance(in_str, str), str(type(in_str))
    encoded = in_str.encode('utf-8')
    return encoded
def make_compat_str(in_str):
    """Converts to string, guessing encoding.

    :param in_str: bytes or str; a str is returned unchanged.
    :return: the decoded text.
    """
    assert isinstance(in_str, (bytes, str)), str(type(in_str))
    if isinstance(in_str, bytes):
        encoding = chardet.detect(in_str)['encoding']
        if encoding is None:
            # chardet reports no encoding when it cannot guess (for
            # example on empty input); decode(None) would raise TypeError.
            return in_str.decode('utf-8', 'replace')
        in_str = in_str.decode(encoding)
    return in_str
def shorten_str(s, size):
    """Shorten ``s`` to at most ``size`` characters.

    Below a size of 7 the string is simply cut.  Otherwise a string that
    is too long keeps its head and tail, joined by ' ... '.
    """
    if size < 7:
        return s[:size]
    if len(s) <= size:
        return s
    half = (size - 5) // 2
    head, tail = s[:half], s[-half:]
    return '{} ... {}'.format(head, tail)
def compatible_encode_method(bytesorstring, encoding='utf-8',
                             erraction='ignore'):
    """Return text for either str or bytes input.

    A str is passed through unchanged; bytes are decoded with the given
    encoding and error policy.  Any other type fails the assertion.
    """
    if not isinstance(bytesorstring, str):
        assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
        return bytesorstring.decode(encoding, erraction)
    return bytesorstring
def _paeth_predictor(left, above, upper_left):
    """Return whichever neighbour is closest to left + above - upper_left."""
    estimate = left + above - upper_left
    d_left = abs(estimate - left)
    d_above = abs(estimate - above)
    d_upper_left = abs(estimate - upper_left)
    if d_left <= d_above and d_left <= d_upper_left:
        return left
    if d_above <= d_upper_left:
        return above
    return upper_left


def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
    """Undo PNG row filtering on ``data``.

    Each row is one filter-type byte followed by
    ``colors * columns * bitspercomponent // 8`` filtered bytes.  Filter
    types 0 (None), 1 (Sub), 2 (Up), 3 (Average) and 4 (Paeth) are handled.

    :param pred: predictor number; not used, the filter type is read from
        every row.
    :param colors: number of colour components per sample.
    :param columns: number of samples per row.
    :param bitspercomponent: bits per component; only 8 is supported.
    :param data: filtered bytes.
    :return: the reconstructed bytes without the filter-type bytes.
    :raises ValueError: for an unsupported bit depth or filter type.
    """
    if bitspercomponent != 8:
        # unsupported
        raise ValueError("Unsupported `bitspercomponent': %d" %
                         bitspercomponent)
    nbytes = colors * columns * bitspercomponent // 8
    # Distance between a byte and the matching byte of the pixel to its
    # left.
    bpp = colors * bitspercomponent // 8
    buf = bytearray()
    # The row above the first row is all zeros and as long as a full row
    # (sizing it by ``columns`` alone drops data when colors > 1).
    prior = bytes(nbytes)
    for start in range(0, len(data), nbytes + 1):
        ft = data[start]
        row = bytearray(data[start + 1:start + 1 + nbytes])
        if ft == 0:
            # PNG none
            pass
        elif ft == 1:
            # PNG sub
            for j in range(bpp, len(row)):
                row[j] = (row[j] + row[j - bpp]) & 255
        elif ft == 2:
            # PNG up
            for j in range(len(row)):
                row[j] = (row[j] + prior[j]) & 255
        elif ft == 3:
            # PNG average
            for j in range(len(row)):
                left = row[j - bpp] if j >= bpp else 0
                row[j] = (row[j] + (left + prior[j]) // 2) & 255
        elif ft == 4:
            # PNG paeth
            for j in range(len(row)):
                if j >= bpp:
                    left = row[j - bpp]
                    upper_left = prior[j - bpp]
                else:
                    left = upper_left = 0
                row[j] = (row[j]
                          + _paeth_predictor(left, prior[j], upper_left)) & 255
        else:
            # unsupported
            raise ValueError("Unsupported predictor value: %d" % ft)
        buf += row
        prior = row
    return bytes(buf)
# Matrix operations
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix(m1, m0):
    """Returns the multiplication of two matrices.

    Both arguments and the result are 6-tuples ``(a, b, c, d, e, f)``.
    """
    # The docstring has to be the first statement to be picked up as
    # ``__doc__``.
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
            a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
            a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
def translate_matrix(m, v):
    """Return matrix ``m`` with its offset moved by vector ``v = (x, y)``."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    moved_e = x * a + y * c + e
    moved_f = x * b + y * d + f
    return (a, b, c, d, moved_e, moved_f)
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    ``m`` is a 6-tuple ``(a, b, c, d, e, f)`` and ``v`` a pair ``(x, y)``;
    the transformed pair is returned.
    """
    # The docstring has to be the first statement to be picked up as
    # ``__doc__``.
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f
def apply_matrix_norm(m, v):
    """Apply the matrix to the vector (p, q), leaving out the offset.

    Same as apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).
    """
    (a, b, c, d, e, f) = m
    (p, q) = v
    out_x = a * p + c * q
    out_y = b * p + d * q
    return (out_x, out_y)
# Utility functions
def isnumber(x):
    """Tell whether ``x`` is an int or a float."""
    numeric_types = (int, float)
    return isinstance(x, numeric_types)
def uniq(objs):
    """Yield each distinct element once, in order of first appearance."""
    seen = set()
    for item in objs:
        if item not in seen:
            seen.add(item)
            yield item
def fsplit(pred, objs):
    """Partition ``objs`` into (matching, non-matching) lists by ``pred``."""
    accepted, rejected = [], []
    for item in objs:
        bucket = accepted if pred(item) else rejected
        bucket.append(item)
    return accepted, rejected
def drange(v0, v1, d):
    """Return the range of step-``d`` cell indices covering v0 to v1."""
    first = int(v0) // d
    stop = int(v1 + d) // d
    return range(first, stop)
def get_bound(pts):
    """Return (x0, y0, x1, y1), the smallest rectangle holding all points.

    With no points the result is (INF, INF, -INF, -INF).
    """
    lo_x, lo_y = INF, INF
    hi_x, hi_y = -INF, -INF
    for (x, y) in pts:
        if x < lo_x:
            lo_x = x
        if y < lo_y:
            lo_y = y
        if x > hi_x:
            hi_x = x
        if y > hi_y:
            hi_y = y
    return lo_x, lo_y, hi_x, hi_y
def pick(seq, func, maxobj=None):
    """Return the element of ``seq`` with the highest ``func`` score.

    The first element wins a tie; ``maxobj`` is returned for an empty
    sequence.
    """
    best_obj = maxobj
    best_score = None
    for candidate in seq:
        score = func(candidate)
        if best_score is None or best_score < score:
            best_score = score
            best_obj = candidate
    return best_obj
def choplist(n, seq):
    """Yield consecutive n-tuples of ``seq``; a short remainder is dropped."""
    chunk = []
    for item in seq:
        chunk.append(item)
        if len(chunk) != n:
            continue
        yield tuple(chunk)
        chunk = []
def nunpack(s, default=0):
    """Decode a big-endian unsigned integer of 1, 2, 3, 4 or 8 bytes.

    Empty input gives ``default``; any other length raises TypeError.
    """
    size = len(s)
    if size == 0:
        return default
    if size == 1:
        return ord(s)
    if size == 3:
        # Pad to four bytes so the 32-bit format can be used.
        return struct.unpack('>L', b'\x00' + s)[0]
    formats = {2: '>H', 4: '>L', 8: '>Q'}
    if size not in formats:
        raise TypeError('invalid length: %d' % size)
    return struct.unpack(formats[size], s)[0]
# 256-character lookup string: index = byte value, character = the code
# point listed at that position.  decode_text maps every byte of a string
# through this table when it does not start with the b'\xfe\xff' marker.
PDFDocEncoding = ''.join(chr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
    """Decode a PDF text string to str.

    Input starting with the b'\\xfe\\xff' marker is decoded as UTF-16BE
    (undecodable parts ignored); anything else is mapped byte by byte
    through PDFDocEncoding.
    """
    if not s.startswith(b'\xfe\xff'):
        chars = [PDFDocEncoding[byte] for byte in s]
        return ''.join(chars)
    return str(s[2:], 'utf-16be', 'ignore')
def enc(x):
    """Escape text for SGML/XML/HTML output; bytes become ''."""
    return '' if isinstance(x, bytes) else escape(x)
def bbox2str(bbox):
    """Format a 4-element box as comma-separated values, 3 decimals each."""
    (x0, y0, x1, y1) = bbox
    parts = ['{:.3f}'.format(value) for value in (x0, y0, x1, y1)]
    return ','.join(parts)
def matrix2str(m):
    """Format a 6-element matrix, 2 decimals each, offset in parentheses."""
    (a, b, c, d, e, f) = m
    template = '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'
    return template.format(a, b, c, d, e, f)
def vecBetweenBoxes(obj1, obj2):
    """A distance function between two TextBoxes.

    Take the rectangle that bounds both boxes.  If the boxes overlap on
    both axes, the vector between their centres is returned; otherwise the
    gap between their boundaries, clamped to zero on each axis.

     +------+..........+ (x1, y1)
     | obj1 |          :
     +------+www+------+
     :          | obj2 |
    (x0, y0) +..........+------+
    """
    left = min(obj1.x0, obj2.x0)
    bottom = min(obj1.y0, obj2.y0)
    right = max(obj1.x1, obj2.x1)
    top = max(obj1.y1, obj2.y1)
    outer_w = right - left
    outer_h = top - bottom
    gap_w = outer_w - obj1.width - obj2.width
    gap_h = outer_h - obj1.height - obj2.height
    if not (gap_w < 0 and gap_h < 0):
        return max(0, gap_w), max(0, gap_h)
    # Overlap on both axes: fall back to the centre-to-centre vector.
    (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
    (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
    return xc1 - xc2, yc1 - yc2
class Plane:
    """A set-like container for objects placed on a plane.

    Objects are indexed in a grid of ``gridsize`` cells so that the ones
    touching a rectangle can be looked up without scanning everything.
    Iteration follows insertion order.
    """

    def __init__(self, bbox, gridsize=50):
        (self.x0, self.y0, self.x1, self.y1) = bbox
        self.gridsize = gridsize
        self._grid = {}
        self._objs = set()
        self._seq = []  # preserve the object order.

    def __repr__(self):
        return '<Plane objs=%r>' % list(self)

    def __iter__(self):
        # _seq is never pruned; membership in _objs filters out removed
        # objects.
        return (item for item in self._seq if item in self._objs)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the grid cells touched by ``bbox``, clipped to the plane."""
        (bx0, by0, bx1, by1) = bbox
        if bx1 <= self.x0 or self.x1 <= bx0 \
                or by1 <= self.y0 or self.y1 <= by0:
            return
        bx0 = max(self.x0, bx0)
        by0 = max(self.y0, by0)
        bx1 = min(self.x1, bx1)
        by1 = min(self.y1, by1)
        for cell_y in drange(by0, by1, self.gridsize):
            for cell_x in drange(bx0, bx1, self.gridsize):
                yield (cell_x, cell_y)

    def extend(self, objs):
        for obj in objs:
            self.add(obj)

    def add(self, obj):
        """place an object."""
        for cell in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(cell, []).append(obj)
        self._seq.append(obj)
        self._objs.add(obj)

    def remove(self, obj):
        """displace an object."""
        for cell in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[cell].remove(obj)
            except (KeyError, ValueError):
                pass
        self._objs.remove(obj)

    def find(self, bbox):
        """finds objects that are in a certain area."""
        (x0, y0, x1, y1) = bbox
        visited = set()
        for cell in self._getrange(bbox):
            for obj in self._grid.get(cell, ()):
                if obj in visited:
                    continue
                visited.add(obj)
                outside = (obj.x1 <= x0 or x1 <= obj.x0
                           or obj.y1 <= y0 or y1 <= obj.y0)
                if not outside:
                    yield obj
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
-177
View File
@@ -1,177 +0,0 @@
#!/bin/bash
# Terminal colour escape sequences.
RESET=$'\x1b[39m'
GREY=$'\x1b[90m'
GREEN=$'\x1b[32m'
RED=$'\x1b[31m'

# Without at least a needle there is nothing to search for: print the
# usage text and stop.
if [[ $# -lt 1 ]]; then
  echo "$0 'needle' where/ [/usr/bin/find options]"
  echo "example: $0 's3cr3t' /mnt/share/ -size -10M ! -iname '*.wav' ! -iname '*.mp3'"
  exit
fi
#######################################
# Re-run this script on an unpacked directory.
# A symlink to the script is placed inside the directory and executed
# from there with "." as the search root.
# Globals:   opts (read) - extra find options passed on to the child
# Arguments: $1 - needle to search for
#            $2 - directory to search
# Returns:   status of the child run; non-zero when the script cannot be
#            resolved or the directory cannot be entered
#######################################
function fork(){
  local needle="$1"
  local tempdir="$2"
  local self name
  self=$(realpath -- "$0") || return
  name=$(basename -- "$0")
  ln -s -- "$self" "$tempdir/$name"
  # Only start the child when the cd worked; otherwise it would search
  # the current directory instead.
  ( cd -- "$tempdir" && "./$name" "$needle" "." "${opts[@]}" )
}
needle="$1"
shift
where="$1"
shift
opts=("$@")

#######################################
# Print a hit header and the matching lines when CONTENT contains the
# needle (case-insensitive, binary treated as text).
# Globals:   needle, GREEN, RESET (read)
# Arguments: $1 - tag shown in brackets, $2 - file path, $3 - content
# Outputs:   header line plus matching lines on stdout; nothing on a miss
#######################################
function report(){
  local tag="$1" path="$2" content="$3"
  # -e keeps a needle that starts with "-" from being read as an option.
  if grep -q -ai -e "$needle" <<< "$content"; then
    printf '%s %s %s\n' "$GREEN" "[$tag] $path" "$RESET"
    grep -ai --color=auto -e "$needle" <<< "$content"
  fi
}

#######################################
# Create a fresh private directory with a sub-path mirroring PATH.
# Arguments: $1 - original file path
# Outputs:   the temporary root directory on stdout
# Returns:   non-zero when the directory cannot be created
#######################################
function make_workdir(){
  local root
  root=$(mktemp -d) || return
  mkdir -p -- "$root/$1" || return
  printf '%s\n' "$root"
}

# IFS= and -r keep leading blanks and backslashes in file names intact.
find "$where" "${opts[@]}" -type f -print 2> /dev/null |
while IFS= read -r path
do
  # "file -bi" prints "type/subtype; charset=..."; dropping everything
  # from the last space leaves "type/subtype;", which is why most
  # patterns below end in an escaped semicolon.
  mime=$(file -bi -- "$path")
  mime=${mime%' '*}
  #echo "$path"
  case $mime in
    */xml\;)
      report xml "$path" "$(cat -- "$path")"
      ;;
    */*html*)
      codepage=$(uchardet "$path")
      report html "$path" "$(iconv -f "$codepage" < "$path" | lynx -nolist -dump -stdin)"
      ;;
    text/*|*/*script\;)
      report text "$path" "$(cat -- "$path")"
      ;;
    application/msword\;)
      report doc "$path" "$(catdoc "$path")"
      ;;
    application/vnd.openxmlformats-officedocument.wordprocessingml.document\;)
      content=$(unzip -p "$path" | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G)
      report docx "$path" "$content"
      ;;
    application/vnd.ms-excel\;)
      report xls "$path" "$(xls2csv -x "$path")"
      ;;
    application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\;)
      content=$(unzip -p "$path" | grep -a -e '<si><t>' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g')
      report xlsx "$path" "$content"
      ;;
    application/pdf\;)
      report pdf "$path" "$(pdf2txt -t text "$path" 2> /dev/null)"
      ;;
    application/x-executable\;|application/x*dos*)
      report exe "$path" "$(rabin2 -z "$path" 2> /dev/null)"
      ;;
    # x-sharedlib needs the trailing ";" like every other exact pattern;
    # x-pie-executable is what newer file(1) versions report for PIE
    # binaries.
    application/x-object\;|application/x-sharedlib\;|application/x-pie-executable\;)
      report elf "$path" "$(rabin2 -z "$path" 2> /dev/null)"
      ;;
    application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*)
      # stdin is the file list from find: keep 7z (password prompts) and
      # the child run away from it.
      report archive "$path" "$(7z l "$path" < /dev/null | tail -n +13)"
      temp=$(make_workdir "$path") || continue
      7z x "$path" -o"$temp/$path" < /dev/null 1> /dev/null 2> /dev/null
      fork "$needle" "$temp" < /dev/null
      rm -rf -- "${temp:?}"
      ;;
    image/*)
      #content=$(tesseract "$path" stdout -l eng; tesseract "$path" stdout -l rus)
      report img "$path" "$(identify -verbose "$path" 2> /dev/null)"
      ;;
    message/*)
      report message "$path" "$(mu view "$path" < /dev/null)"
      temp=$(make_workdir "$path") || continue
      base=${path##*/}
      cp -- "$path" "$temp/$path/"
      munpack -t -f -C "$(realpath -- "$temp/$path")" "$base" > /dev/null
      # Drop the copied message so only the unpacked parts are searched.
      rm -- "$temp/$path/$base"
      fork "$needle" "$temp" < /dev/null
      rm -rf -- "${temp:?}"
      ;;
    application/octet-stream\;)
      # Opaque binary data is skipped on purpose.
      :
      ;;
    application/x-raw-disk-image\;)
      report disk "$path" "$(binwalk "$path")"
      ;;
    *)
      # Unknown type: read it as text when file(1) says so, otherwise
      # fall back to the printable strings.
      if file -- "$path" | grep -q text; then
        content=$(cat -- "$path")
      else
        content=$(strings "$path")
      fi
      report unknown "$path" "$content"
      ;;
  esac
done
-32
View File
@@ -1,32 +0,0 @@
#!/bin/bash
# Search the "words" table of a SQLite database for texts containing QUERY.
#
# Usage: search.sh [-c count] [-o offset] words.db QUERY...
#   -c  maximum number of rows to return (default 10)
#   -o  number of leading rows to skip   (default 0)
#
# Output: the URIs of the matching rows (in green), followed by up to 100
# characters of context around every occurrence of QUERY in the matching texts.

GREEN=$'\x1b[32m'
RESET=$'\x1b[39m'

LIMIT=10
# SQL OFFSET is the number of rows to skip; a default of 1 dropped the first hit.
OFFSET=0

while getopts "c:o:" opt
do
  case $opt in
    c) LIMIT=$OPTARG;;
    o) OFFSET=$OPTARG;;
  esac
done

[[ $(($#-$OPTIND)) -lt 1 ]] && {
  # Quoted so that "[opts]" is not expanded as a glob.
  echo "$0 [opts] words.db QUERY"
  echo "opts:"
  echo " -c count"
  echo " -o offset"
  exit
}

# Both values are interpolated into the SQL statement, so accept digits only.
[[ $LIMIT =~ ^[0-9]+$ && $OFFSET =~ ^[0-9]+$ ]] || {
  echo "count and offset must be non-negative integers" >&2
  exit 1
}

DB="${@:$OPTIND:1}"
shift "$OPTIND"

# All remaining words form the query.
query="$*"
# Double every single quote so the query cannot terminate the SQL string literal.
sq="'"
sql_query=${query//$sq/$sq$sq}

echo "$GREEN"
#echo "SELECT uri FROM words WHERE text MATCH '$*' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB"
printf '%s\n' "SELECT uri FROM words WHERE text LIKE '%${sql_query}%' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB"
echo "$RESET"

#echo "SELECT text FROM words WHERE text MATCH '$*' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB" | grep -i -o -P ".{0,100}$*..{0,100}" | grep -i --color=auto "$*"
# LIKE matched the query literally, so grep must do the same: \Q...\E switches
# off PCRE metacharacters and -F does so for the highlighting pass. The context
# pattern no longer demands a character after the match, so a hit at the very
# end of a line is shown as well.
printf '%s\n' "SELECT text FROM words WHERE text LIKE '%${sql_query}%' limit $LIMIT offset $OFFSET;" \
  | sqlite3 "$DB" \
  | grep -i -o -P -- ".{0,100}\Q${query}\E.{0,100}" \
  | grep -i -F --color=auto -- "$query"
-21
View File
@@ -1,21 +0,0 @@
#!/bin/bash
# Thin wrapper around wget that mirrors a site and prints every URL wget reports.
#
# Usage: spider.sh [wget options] url
# All arguments are handed to wget unchanged.

USERAGENT="Mozilla"
IGNORE_EXT="gif,GIF,jpg,JPG,png,PNG,ico,ICO,svg,SVG,woff,ttf,eot"

[ $# -lt 1 ] && {
  echo "$0 url [/usr/bin/wget options]"
  echo "example: $0 --level 5 --wait 2 --domains www.site.com --quota=10000000 -A html,php -R pdf,jpg -X uploads --no-parent http://site.com/path/to"
  exit
}

# Walk the site in --spider mode (nothing is kept) and print the visited URLs.
# "$@" keeps each caller argument as one word; the unquoted $* used before
# re-split arguments on whitespace and expanded glob characters in URLs.
function crawl(){
  wget --no-check-certificate --recursive --spider -e robots=off -U "$USERAGENT" -O "/tmp/spider" --no-verbose "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

# Download the site recursively (skipping IGNORE_EXT suffixes, -N re-fetches
# only newer files) and print the URLs wget reports.
function save(){
  wget --no-check-certificate --recursive -N -e robots=off -U "$USERAGENT" --no-verbose -R "$IGNORE_EXT" "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

#crawl "$@"
save "$@"
Executable
+194
View File
@@ -0,0 +1,194 @@
#!/usr/bin/python3
import csv
import json
from hashlib import md5
from opensearchpy import OpenSearch
from os import path
from datetime import datetime
from colorama import Fore
import argparse
# NOTE(review): credentials are hard-coded here; confirm they match the target cluster.
CREDS = ('admin', 'admin')

parser = argparse.ArgumentParser( description='search machine control tool' )
# nargs="?" makes the address optional so that the declared default is really
# used; without it argparse treats the positional as required and the default
# is dead code.
parser.add_argument("opensearch", type=str, nargs="?", default="localhost:9200", help="opensearch address (localhost:9200)")
parser.add_argument("-i", "--index", type=str, metavar="index", default="", help="index where to search")
parser.add_argument("-o", "--offset", type=int, metavar="offset", default=0, help="offset results in query")
parser.add_argument("-c", "--count", type=int, metavar="count", default=10, help="count results in query")
parser.add_argument("-init", action="store_true", help="init index")
parser.add_argument("-drop", action="store_true", help="drop index")
parser.add_argument("-import", dest="file_import", metavar="input.csv", help="import data")
parser.add_argument("-delete", dest="file_delete", metavar="input.csv", help="delete data")
parser.add_argument("-query", metavar="query", help="search query")
parser.add_argument("-cache", metavar="cache", help="get cache of a document")
args = parser.parse_args()

# Split on the last colon only and report a malformed address as a usage error
# instead of an unpacking/int() traceback.
host, sep, port = args.opensearch.rpartition(":")
if not sep or not host or not port.isdigit():
    parser.error("opensearch address must be given as host:port")

# Shared client used by every action below. TLS is enabled but certificate and
# hostname verification are switched off.
client = OpenSearch(
    hosts = [{'host': host, 'port': int(port)}],
    http_compress = True,
    http_auth = CREDS,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)
def indexes():
    """Print the name of every index together with the cat-count output for it."""
    # The index is passed by keyword: that form is accepted regardless of
    # whether the client declares the parameter as positional or keyword-only.
    for index in client.indices.get(index="*"):
        print(index, client.cat.count(index=index))
def info(index):
    """Pretty-print the settings of `index` as JSON indented by four spaces."""
    settings = client.indices.get_settings(index=index)
    rendered = json.dumps(settings, indent=4)
    print(rendered)
    #json.dumps(client.indices.get_mapping(index=index))
def init(index):
    """Create `index` with the text mappings and analysis settings below, then print the response."""
    # Every document field is mapped as plain text.
    text_field = {"type": "text"}
    properties = {
        "timestamp": dict(text_field),
        "inurl": dict(text_field),
        "site": dict(text_field),
        "ext": dict(text_field),
        "intitle": dict(text_field),
        "intext": dict(text_field),
        "filetype": dict(text_field)
    }

    # Token filters referenced by the custom analyzers.
    filters = {
        "russian_stop": {
            "type": "stop",
            "stopwords": "_russian_"
        },
        "autocomplete_filter": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 20
        }
    }

    # "russian" lowercases and removes Russian stop words; "autocomplete"
    # additionally emits edge n-grams of 1 to 20 characters.
    analyzers = {
        "russian": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "russian_stop"],
        },
        "autocomplete": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "russian_stop", "autocomplete_filter"]
        }
    }

    SETTINGS = {
        "mappings": {"properties": properties},
        "settings": {
            "analysis": {
                "analyzer": analyzers,
                "filter": filters
            }
        }
    }
    response = client.indices.create(index, body=SETTINGS)
    print(response)
def add(index, source):
    """Index every row of the CSV file `source` into `index`.

    Each row must hold at least five columns: timestamp (epoch seconds), file
    path, extension, file type and content; extra columns are ignored. The
    document id is the MD5 of the file path, so re-importing a path overwrites
    its previous document. The "site" field is the CSV file name without its
    extension. A row that fails is reported on stdout and skipped.
    """
    csv.field_size_limit(2**32)
    site = path.splitext(path.basename(source))[0]
    # The context manager closes the file even when a row raises; newline=""
    # is what the csv module requires so that line breaks embedded in quoted
    # fields are parsed correctly.
    with open(source, errors="surrogateescape", newline="") as handle:
        reader = csv.reader(handle, delimiter=',', quotechar='"')
        for row in reader:
            try:
                timestamp,filepath,ext,filetype,content,*_ = row
                document = {
                    "timestamp": datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S'),
                    "inurl": filepath,
                    "site": site,
                    "ext": ext,
                    "intitle": "",
                    "intext": content,
                    "filetype": filetype
                }
                response = client.index(
                    index = index,
                    id = md5(filepath.encode()).hexdigest(),
                    body = document,
                    refresh = True
                )
                #print(response)
            except Exception as e:
                # Best effort: report the bad row and continue with the next one.
                print(str(e))
def query(index, text):
    """Run the query-string search `text` against `index` and print the hits.

    Page size and offset come from the global `args` (`-c` / `-o`). For every
    hit the (highlighted, when available) URL and the document id are printed,
    followed by the highlighted content fragments joined with " ... ".
    """
    # Named `body` so the local no longer shadows this function's own name.
    body = {
        "size": args.count,
        "from": args.offset,
        "query": {
            "query_string": {
                "query": text,
                "fields": ["inurl^100","intitle^50","intext^5"],
                "default_operator": "AND",
                "fuzziness": "AUTO",
                "analyzer": "russian"
            }
        },
        "highlight": {
            "order": "score",
            "fields": {
                "*": {
                    "pre_tags" : [ Fore.RED ],
                    "post_tags" : [ Fore.RESET ],
                    "fragment_size": 50,
                    "number_of_fragments": 3
                }
            }
        }
    }
    response = client.search(
        index = index,
        body = body
    )
    for result in response['hits']['hits']:
        # A hit may arrive without a "highlight" section; fall back to the
        # stored URL instead of raising KeyError.
        highlight = result.get('highlight', {})
        uri = highlight['inurl'][0] if highlight.get('inurl') else result['_source']['inurl']
        print("{G}{uri} {B}{cache}{R}".format(
            uri=uri,
            cache=result['_id'],
            G=Fore.GREEN, B=Fore.LIGHTBLACK_EX, R=Fore.RESET))
        print(" ... ".join(highlight.get('intext',[])))
def cache(index, _id):
    """Print the stored text ("intext") of the document `_id` in `index`."""
    # Look the document up in the requested index; the index name used to be
    # hard-coded to 'test', so the `index` argument was ignored.
    result = client.get(index=index, id=_id)
    print(result["_source"]["intext"])
def delete(index, source):
    """Delete from `index` the document of every row listed in the CSV file `source`.

    Rows have the same layout as for import (at least five columns); only the
    file path is used, because the document id is the MD5 of that path. Each
    response is printed. A row that fails is reported on stdout and skipped.
    """
    csv.field_size_limit(2**32)
    # The context manager closes the file even when a row raises; newline=""
    # is what the csv module requires for quoted fields containing line breaks.
    with open(source, errors="surrogateescape", newline="") as handle:
        reader = csv.reader(handle, delimiter=',', quotechar='"')
        for row in reader:
            try:
                # Same five-column minimum as before; unused columns are discarded.
                _timestamp,filepath,_ext,_filetype,_content,*_ = row
                response = client.delete(
                    index = index,
                    id = md5(filepath.encode()).hexdigest(),
                )
                print(response)
            except Exception as e:
                # Best effort: report the bad row and continue with the next one.
                print(str(e))
def drop(index):
    """Delete the whole index `index` and print the server's response."""
    outcome = client.indices.delete(index = index)
    print(outcome)
# Run the first action requested on the command line. When no action flag is
# given, show the settings of the selected index, or list all indexes if no
# index was selected.
if args.init:
    init(index=args.index)
elif args.drop:
    drop(index=args.index)
elif args.file_import:
    add(index=args.index, source=args.file_import)
elif args.file_delete:
    delete(index=args.index, source=args.file_delete)
elif args.query:
    query(index=args.index, text=args.query)
elif args.cache:
    cache(index=args.index, _id=args.cache)
elif args.index:
    info(index=args.index)
else:
    indexes()
Executable
+38
View File
@@ -0,0 +1,38 @@
#!/bin/bash
# Search the "words" table of a SQLite database and show matches with context.
#
# Usage: search.sh [-m match] [-c count] [-o offset] [-u fragment] words.db QUERY...
#   -m  characters of context printed on each side of a match (default 50)
#   -c  maximum number of rows to return                        (default 10)
#   -o  number of leading rows to skip                          (default 0)
#   -u  SQL LIKE pattern the row's uri must match               (default %)
#
# For every matching row the uri is printed in green, followed by the matching
# fragments of its text.

GREEN=$'\x1b[32m'
RESET=$'\x1b[39m'
MATCH=50
LIMIT=10
OFFSET=0
URI='%'

usage() {
  echo "$0 [opts] words.db QUERY"
  echo "opts:"
  echo " -m match"
  echo " -c count"
  echo " -o offset"
  echo " -u fragment"
  exit
}

while getopts "m:c:o:u:" opt
do
  case $opt in
    m) MATCH=$OPTARG;;
    c) LIMIT=$OPTARG;;
    o) OFFSET=$OPTARG;;
    u) URI=$OPTARG;;
  esac
done

# A query is required unless the search is narrowed by a uri fragment.
[[ $(($#-$OPTIND)) -lt 1 ]] && [[ $URI = '%' ]] && usage

# These values are interpolated into the SQL statement and the grep pattern,
# so accept digits only.
[[ $MATCH =~ ^[0-9]+$ && $LIMIT =~ ^[0-9]+$ && $OFFSET =~ ^[0-9]+$ ]] || {
  echo "match, count and offset must be non-negative integers" >&2
  exit 1
}

DB="${@:$OPTIND:1}"
[[ -n $DB ]] || usage
shift "$OPTIND"

# Join the query words with the default IFS (a space). Setting IFS='=%='
# beforehand, as was done previously, made "$*" join the words with '='.
query="$*"

# Double every single quote so user input cannot terminate the SQL string literal.
sq="'"
sql_query=${query//$sq/$sq$sq}
sql_uri=${URI//$sq/$sq$sq}

# Column separator requested from sqlite3. It is split off as a whole string:
# IFS is a set of single characters, so IFS='=%=' also broke every uri that
# contained '=' or '%'.
SEP='=%='

printf '%s\n' "SELECT uri,text FROM words WHERE uri LIKE '${sql_uri}' and text LIKE '%${sql_query}%' limit $LIMIT offset $OFFSET;" \
  | sqlite3 -separator "$SEP" "$DB" \
  | while IFS= read -r row
do
  if [[ $row == *"$SEP"* ]]; then
    uri=${row%%"$SEP"*}
    text=${row#*"$SEP"}
    printf '%s\n' "$GREEN$uri$RESET"
  else
    # No separator: treat the line as a continuation of a multi-line text.
    text=$row
  fi
  # LIKE matched the query literally, so grep does too (\Q...\E and -F). The
  # context pattern no longer demands a character after the match, so a hit at
  # the very end of a line is shown as well.
  printf '%s\n' "$text" \
    | grep -i -o -P -- ".{0,$MATCH}\Q${query}\E.{0,$MATCH}" \
    | grep -i -F --color=auto -- "$query"
done
Executable
+24
View File
@@ -0,0 +1,24 @@
#!/bin/bash
# Thin wrapper around the wget binary shipped in ./bin (next to this script):
# mirrors a site and prints every URL wget reports.
#
# Usage: spider.sh [wget options] url
# All arguments are handed to wget unchanged.

USERAGENT="Mozilla"
IGNORE_EXT="gif,GIF,jpg,JPG,png,PNG,ico,ICO,svg,SVG,woff,ttf,eot"

[ $# -lt 1 ] && {
  echo "$0 url [/usr/bin/wget options]"
  echo "example: $0 --level 5 --wait 2 --domains www.site.com --limit-size=10000000 -A html,php -R pdf,jpg -X uploads --no-parent http://site.com/path/to"
  echo "example: $0 --level 2 --wait 1 --limit-size=500k ftp://target.com/"
  exit
}

# Walk the site in --spider mode (nothing is kept) and print the visited URLs.
# "$@" keeps each caller argument as one word, and the quoted path lets the
# script live in a directory whose name contains spaces; the unquoted $* used
# before re-split arguments and expanded glob characters in URLs.
function crawl(){
  "$(dirname "$0")/bin/wget" --no-check-certificate --recursive --spider -e robots=off -U "$USERAGENT" -O "/tmp/spider" --no-verbose "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

# Download the site recursively (skipping IGNORE_EXT suffixes, -N re-fetches
# only newer files) and print the URLs wget reports.
function save(){
  "$(dirname "$0")/bin/wget" --no-check-certificate --recursive -N -e robots=off -U "$USERAGENT" --no-verbose -R "$IGNORE_EXT" "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

#crawl "$@"
save "$@"

#https://yurichev.com/wget.html
-7
View File
@@ -1,7 +0,0 @@
cd c:\path\to\crawl\windows
.\crawl.ps1 ..\path\to > out.log
.\grep.ps1 ..\path\to s3cr3t
cme smb -d dom -u adm -p pas -X '.\grep.ps1 c:\users s3cr3t > c:\grep.log' targets.txt
sleep 3600
cme smb -d dom -u adm -p pas -x 'type c:\grep.log' targets.txt
-66
View File
@@ -1,66 +0,0 @@
# Recursively search the files below a directory for a pattern.
#
# Usage: .\grep.ps1 <haystack> <needle>
#   haystack  directory to walk
#   needle    pattern handed to Select-String
#
# Each file's text is extracted by a helper script from .\lib, chosen by the
# file extension and run as a background job limited to $TIMEOUT seconds.
echo "begin $PID"
$ErrorActionPreference = 'SilentlyContinue'
$TIMEOUT=15
$haystack = $args[0]
$needle = $args[1]
$files = 0

# File name patterns that are collected from the haystack.
$exts = @()
$exts += @("*.doc","*.docx")
$exts += @("*.xls","*.xlsx")
$exts += @("*.pdf")
$exts += @("*.zip")
$exts += @("*.txt","*.bat","*.vbs","*.ps1","*.reg","*.cfg","*.conf","*.xml","*.log")
#$exts += @("*.exe","*.dll")

$opts = @{
    "Path" = $haystack
    "Recurse" = $true
    "Include" = $exts
}

# First pass: count the files so progress can be shown as [i/total].
Get-ChildItem @opts 2> $null | % {
    if((Get-Item $_.FullName) -isnot [System.IO.DirectoryInfo]) {
        $files += 1
    }
}

# Second pass: extract each file's text and look for the needle.
$i = 1
Get-ChildItem @opts 2> $null | % {
    if((Get-Item $_.FullName) -isnot [System.IO.DirectoryInfo]) {
        $file = @{}
        $file.name = $_.Name
        $file.path = $_.FullName
        $file.ext = $_.Extension
        $file.content = ""
        #echo "[*] $($file.path)"
        $job = $null
        # The patterns are anchored and the dot is escaped so that each arm
        # matches real extensions only. ".log" is now handled as plain text:
        # it is collected by $exts above but previously matched no arm, so log
        # files were never read.
        switch -regex ($file.ext) {
            '^\.(txt|bat|vbs|ps1|reg|cfg|conf|xml|log)$' { $job = Start-Job -FilePath .\lib\plaintext.ps1 -argumentlist $file.path }
            '^\.doc' { $job = Start-Job -FilePath .\lib\word.ps1 -argumentlist $file.path }
            '^\.xls' { $job = Start-Job -FilePath .\lib\excel.ps1 -argumentlist $file.path }
            '^\.pdf$' { $job = Start-Job -FilePath .\lib\pdf.ps1 -argumentlist $file.path -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
            '^\.(zip|7z|tar|gz|gzip)$' { $job = Start-Job -FilePath .\lib\archive.ps1 -argumentlist $file.path,"grep.ps1",$needle -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
            '^\.(exe|dll)$' { $job = Start-Job -FilePath .\lib\executable.ps1 -argumentlist $file.path -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
        }
        if($job)
        {
            # Give the extractor at most $TIMEOUT seconds, take whatever it
            # produced, then stop and discard the job.
            Wait-Job -timeout $TIMEOUT $job > $null
            $file.content = Receive-Job $job
            #echo $file.content
            Stop-Job $job
            Remove-Job $job
        }
        if(echo $file.content | select-string $needle) {
            Write-Output "[+] [$i/$files] $($file.path)"
            echo $file.content | select-string -Pattern $needle
            #Write-Host -ForegroundColor green (echo $file.content | select-string -Pattern $needle)
            #highlight(echo $file.content | select-string $needle)
        }
        # NOTE(review): presumably meant to flag files whose extractor returned nothing - confirm.
        elseif($file.content -eq 0) {
            echo "[!] [$i/$files] $($file.path)"
        }
        # $i % 1 is always 0, so every remaining file is reported here.
        elseif($i % 1 -eq 0) {
            echo "[*] [$i/$files] $($file.path)"
        }
        $i += 1
    }
}
echo 'done'
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More