reordering structure, removed windows scripts
This commit is contained in:
+37
@@ -0,0 +1,37 @@
|
||||
# Image for the crawl toolkit: the extractors invoked by crawl.sh, the Node.js
# web UI from www/, and an OpenSearch bundle unpacked under /opt.
FROM debian

WORKDIR /opt/crawl

# Extraction toolchain. apt-get is used instead of apt: apt's command line is
# not a stable scripting interface and warns about that on every call.
RUN apt-get update && \
    apt-get install -y --no-install-recommends sudo tmux iproute2 nano less iputils-ping locales && \
    apt-get install -y --no-install-recommends wget curl file sqlite3 cifs-utils python3 python3-pip xz-utils && \
    apt-get install -y --no-install-recommends lynx uchardet catdoc unzip python3-pdfminer p7zip-full && \
    apt-get install -y --no-install-recommends maildir-utils mpack libemail-outlook-message-perl libemail-sender-perl binwalk && \
    apt-get install -y --no-install-recommends graphicsmagick-imagemagick-compat tesseract-ocr tesseract-ocr-eng tesseract-ocr-rus ffmpeg && \
    pip3 install --break-system-packages vosk && \
    wget https://github.com/radareorg/radare2/releases/download/5.8.8/radare2-5.8.8-static.tar.xz -O /tmp/radare2.tar.xz && tar xvf /tmp/radare2.tar.xz -C /opt/ && rm /tmp/radare2.tar.xz && ln -s /opt/r2-static/usr/bin/rabin2 /usr/local/bin/rabin2

COPY bin bin
COPY cron cron
COPY www www
COPY spider.sh .
COPY crawl.sh .
COPY import.sh .
COPY search.sh .
COPY opensearch.py .

# Search GUI dependencies. The package index is refreshed in this layer as
# well, so a rebuild that reuses the cached layer above does not install from
# a stale index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends nodejs npm openjdk-17-jre && \
    pip3 install --break-system-packages opensearch-py colorama && \
    cd www/ && npm install && cd - && \
    wget https://artifacts.opensearch.org/releases/bundle/opensearch/2.11.0/opensearch-2.11.0-linux-x64.tar.gz -O /tmp/opensearch.tar.gz && tar xvf /tmp/opensearch.tar.gz -C /opt/ && rm /tmp/opensearch.tar.gz

# Locale, unprivileged user and passwordless sudo.
# NOTE(review): 241 is fed to dpkg-reconfigure as a menu answer; presumably it
# selects ru_RU.UTF-8 in the current locales package - verify after base image
# upgrades, since the menu numbering is not stable.
RUN echo 'LANG="ru_RU.UTF-8"' > /etc/default/locale && \
    localedef -i ru_RU -f UTF-8 ru_RU.UTF-8 && \
    locale-gen && \
    echo 241 | dpkg-reconfigure locales && \
    echo "LANG=ru_RU.UTF-8" > /etc/default/locale && \
    useradd -s /bin/bash -g users -N -M -d /opt/crawl user && \
    chown -R user:users /opt/ && \
    chmod +w /etc/sudoers && echo 'user ALL=(root) NOPASSWD: ALL' >> /etc/sudoers && chmod -w /etc/sudoers

EXPOSE 8080
|
||||
@@ -0,0 +1,127 @@
|
||||
## Crawling
|
||||
|
||||
Each crawler walks through a source and extracts only the useful data: text. The file type is detected from the content, not from the extension. Easily customizable.
|
||||
Supported file types: `text`, `html`, `doc`/`docx`, `xls`/`xlsx`, `pdf`, `archives`, `exe`/`bin`, `eml`/`msg`, `images`, `sounds`.
|
||||
You can easily add your own file types (GNU power)
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## Installation
|
||||
|
||||
### System
|
||||
|
||||
Depends:
|
||||
|
||||
* lynx, uchardet - html
|
||||
* catdoc - doc
|
||||
* xls2csv - xls
|
||||
* unzip - docx,xlsx
|
||||
* pdf2txt - pdf
|
||||
* rabin2 - exe,dll
|
||||
* 7z - archives
|
||||
* identify, tesseract - images
|
||||
* vosk-transcriber - audios
|
||||
* msgconvert, munpack, mu - emails
|
||||
* binwalk - disk images
|
||||
|
||||
```
|
||||
sudo apt install sqlite3 cifs-utils
|
||||
sudo apt install file uchardet cifs-utils lynx catdoc unzip python3-pdfminer radare2 p7zip-full
|
||||
sudo apt install maildir-utils mpack libemail-outlook-message-perl libemail-sender-perl binwalk
|
||||
sudo apt install graphicsmagick-imagemagick-compat tesseract-ocr tesseract-ocr-eng tesseract-ocr-rus ffmpeg
|
||||
sudo pip3 install vosk
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```
|
||||
sudo docker build -t crawl .
|
||||
sudo docker run --privileged --cap-add SYS_ADMIN --cap-add DAC_READ_SEARCH --cap-add NET_BIND_SERVICE --cap-add CAP_SYSLOG -u 1000 -p 8080:8080 --name crawl -it crawl /bin/bash
|
||||
```
|
||||
|
||||
### SMB crawling
|
||||
|
||||
Mount the network share locally and crawl it:
|
||||
|
||||
```
|
||||
mount.cifs "//10.10.10.10/Docs" /mnt/Docs -o ro,dom=corp.net,user=username,pass=password
|
||||
./crawl.sh /mnt/Docs -size -10M
|
||||
```
|
||||
|
||||
It will create `Docs.csv` index file.
|
||||
|
||||
### Web crawling
|
||||
|
||||
Depends:
|
||||
|
||||
* wget with controllable download limit (https://yurichev.com/wget.html)
|
||||
|
||||
Mirror the site content locally and crawl it:
|
||||
|
||||
```
|
||||
./spider.sh --limit-size=500k http://target.com/
|
||||
./crawl.sh target.com/
|
||||
```
|
||||
|
||||
It will create `target.com.csv` index file.
|
||||
|
||||
### FTP crawling
|
||||
|
||||
Mirror the FTP content locally and crawl it:
|
||||
|
||||
```
|
||||
./spider.sh --limit-size=500k ftp://target.com/
|
||||
./crawl.sh target.com/
|
||||
```
|
||||
|
||||
It will create `target.com.csv` index file.
|
||||
|
||||
## Searching
|
||||
|
||||
After crawling, the extracted text is stored in `csv` files.
|
||||
Data can be searched using simple `grep`:
|
||||
|
||||
`grep -ia -o -P ".{0,100}password..{0,100}" *.csv | grep -ai --color=auto "password"`
|
||||
|
||||
Or use a fuzzy search, which also matches misspelled words:
|
||||
|
||||
`tre-agrep -i -E 2 passw *.csv`
|
||||
|
||||
### Searching CLI (pentesters)
|
||||
|
||||
Data can be converted into a `sqlite3` database with full-text search support:
|
||||
|
||||
`./import.sh INBOX.csv`
|
||||
|
||||
Searching for data in the database is now more convenient:
|
||||
|
||||
```
|
||||
./search.sh INBOX.db 's3cr3t'
|
||||
./search.sh INBOX.db 'password' -c 10 -o 20
|
||||
./search.sh INBOX.db 'password' -m 'admin'
|
||||
```
|
||||
|
||||
### Searching GUI (enterprise)
|
||||
|
||||
Depends:
|
||||
|
||||
```
|
||||
sudo apt install nodejs npm openjdk-17-jre
|
||||
cd www && npm install
|
||||
wget https://artifacts.opensearch.org/releases/bundle/opensearch/2.11.0/opensearch-2.11.0-linux-x64.tar.gz -O /tmp/opensearch.tar.gz && tar xvf /tmp/opensearch.tar.gz -C /opt/
|
||||
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/opensearch-tar-install.sh
|
||||
```
|
||||
|
||||
Searching for data using opensearch:
|
||||
|
||||
```
|
||||
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/bin/opensearch
|
||||
./opensearch.py localhost:9200 -i test -init
|
||||
./opensearch.py localhost:9200 -i test -import INBOX.csv
|
||||
cd www && node index.js
|
||||
chrome http://localhost:8080/test/
|
||||
```
|
||||
|
||||
Continuous crawling (your own Google for the local network): use the cron scripts described in cron/README.md.
|
||||
+74
-45
@@ -7,7 +7,9 @@ RESET=$'\x1b[39m'
|
||||
|
||||
[[ $# -lt 1 ]] && {
|
||||
echo "$0 where/ [/usr/bin/find options]"
|
||||
echo "example: $0 /mnt/share/ -size -10M ! -iname '*.wav' ! -iname '*.mp3'"
|
||||
echo "example: $0 /mnt/share/ -size -10M -not -iname '*.wav' -not -iname '*.mp3'"
|
||||
echo "example: $0 /mnt/share/ -not -ipath '*/Program Files*/*' -not -ipath '*/Windows/*'"
|
||||
echo "example: $0 /mnt/share/ -newermt '2012-12-21 00:00'"
|
||||
exit
|
||||
}
|
||||
|
||||
@@ -64,10 +66,10 @@ find "$where" "${opts[@]}" -type f -print 2> /dev/null |
|
||||
while read path
|
||||
do
|
||||
[[ $is_resume = 1 && $(session_is_file_done $path) = 1 ]] && {
|
||||
echo "(skip $path)"
|
||||
echo $GREY"$path"$RESET
|
||||
continue
|
||||
}
|
||||
printf "\n" >> "$index"
|
||||
[[ -s "$index" ]] && printf "\n" >> "$index"
|
||||
echo -n "$(date +%s)," >> "$index"
|
||||
echo -n "$path"
|
||||
echo -n "$path" | escape >> "$index"
|
||||
@@ -78,115 +80,142 @@ do
|
||||
[[ $filename = $ext ]] && ext=''
|
||||
echo -n "$ext" | escape >> "$index"
|
||||
echo -n "," >> "$index"
|
||||
mime=$(file -bi "$path")
|
||||
mime=${mime%' '*}
|
||||
mime=$(file -b --mime-type "$path")
|
||||
case $mime in
|
||||
*/xml\;)
|
||||
echo -n "xml," >> "$index"
|
||||
cat "$path" | escape >> "$index"
|
||||
echo $GREEN " [xml]" $RESET
|
||||
;;
|
||||
*/*html*)
|
||||
*/*html*|application/javascript)
|
||||
echo -n "html," >> "$index"
|
||||
codepage=$(uchardet "$path")
|
||||
cat "$path" | iconv -f $codepage | lynx -nolist -dump -stdin | escape >> "$index"
|
||||
echo $GREEN " [html]" $RESET
|
||||
;;
|
||||
text/*|*/*script\;)
|
||||
text/*|*/*script|*/xml|*/json|*-ini)
|
||||
echo -n "text," >> "$index"
|
||||
cat "$path" | escape >> "$index"
|
||||
codepage=$(uchardet "$path")
|
||||
cat "$path" | iconv -f $codepage | escape >> "$index"
|
||||
echo $GREEN " [text]" $RESET
|
||||
;;
|
||||
application/msword\;)
|
||||
application/msword)
|
||||
echo -n "doc," >> "$index"
|
||||
catdoc "$path" | escape >> "$index"
|
||||
echo $GREEN " [doc]" $RESET
|
||||
;;
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document\;)
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document)
|
||||
echo -n "doc," >> "$index"
|
||||
unzip -p "$path" | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G | escape >> "$index"
|
||||
unzip -p "$path" 2> /dev/null | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G | escape >> "$index"
|
||||
echo $GREEN " [docx]" $RESET
|
||||
if unzip -l "$path" | grep -q 'word/media/'; then
|
||||
temp=$(tempfile 2>/dev/null)
|
||||
rm $temp && mkdir -p "$temp/$path"
|
||||
unzip "$path" 'word/media/*' -d "$temp/$path" > /dev/null
|
||||
fork "$temp"
|
||||
rm -r "$temp"
|
||||
#session_file_done $path
|
||||
fi
|
||||
;;
|
||||
application/vnd.ms-excel\;)
|
||||
application/vnd.ms-excel)
|
||||
echo -n "xls," >> "$index"
|
||||
xls2csv -x "$path" | escape >> "$index"
|
||||
echo $GREEN " [xls]" $RESET
|
||||
;;
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\;)
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet)
|
||||
echo -n "xlsx," >> "$index"
|
||||
unzip -p "$path" | grep -a -e '<si><t>' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | escape >> "$index"
|
||||
#libreoffice --convert-to csv "$path" out.csv
|
||||
unzip -p "$path" 2> /dev/null | grep -a -e '<si><t' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | escape >> "$index"
|
||||
echo $GREEN " [xlsx]" $RESET
|
||||
;;
|
||||
application/pdf\;)
|
||||
application/pdf)
|
||||
echo -n "pdf," >> "$index"
|
||||
pdf2txt -t text "$path" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [pdf]" $RESET
|
||||
;;
|
||||
application/x-executable\;|application/x*dos*)
|
||||
application/x-executable|application/*microsoft*-executable|application/x*dos*)
|
||||
echo -n "exe," >> "$index"
|
||||
rabin2 -z "$path" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [exe]" $RESET
|
||||
;;
|
||||
application/x-object\;|application/x-sharedlib|application/x-executable\;)
|
||||
application/x-object|application/x-sharedlib|application/x-executable)
|
||||
echo -n "elf," >> "$index"
|
||||
rabin2 -z "$path" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [elf]" $RESET
|
||||
;;
|
||||
application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*)
|
||||
application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*|application/*-msi|*/java-archive)
|
||||
echo -n "zip," >> "$index"
|
||||
7z l "$path" | tail -n +13 | escape >> "$index"
|
||||
7z l -p '' "$path" 2> /dev/null | tail -n +13 | escape >> "$index"
|
||||
echo $GREEN " [archive]" $RESET
|
||||
temp=$(tempfile)
|
||||
temp=$(tempfile 2>/dev/null)
|
||||
rm $temp && mkdir -p "$temp/$path"
|
||||
7z x "$path" -o"$temp/$path" 1> /dev/null 2> /dev/null
|
||||
7z x -p '' "$path" -o"$temp/$path" 1> /dev/null 2> /dev/null
|
||||
fork "$temp"
|
||||
rm -r "$temp"
|
||||
session_file_done $path
|
||||
#break
|
||||
#session_file_done $path
|
||||
;;
|
||||
image/*)
|
||||
echo -n "image," >> "$index"
|
||||
identify -verbose "$path" 2> /dev/null | escape >> "$index"
|
||||
#tesseract "$path" stdout -l eng >> "$index"
|
||||
#tesseract "$path" stdout -l rus >> "$index"
|
||||
#identify -verbose "$path" 2> /dev/null | escape >> "$index"
|
||||
tesseract "$path" stdout -l eng 2> /dev/null | escape >> "$index"
|
||||
tesseract "$path" stdout -l rus 2> /dev/null | escape >> "$index"
|
||||
#curl -X POST --form "photo=@$path" http://10.250.153.11/ | escape >> "$index"
|
||||
echo $GREEN " [img]" $RESET
|
||||
;;
|
||||
audio/*)
|
||||
echo -n "audio," >> "$index"
|
||||
vosk-transcriber --lang en-us --input "$path" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [snd]" $RESET
|
||||
;;
|
||||
application/vnd.ms-outlook)
|
||||
echo -n "message," >> "$index"
|
||||
temp=$(tempfile 2>/dev/null)
|
||||
rm $temp && mkdir -p "$temp/$path"
|
||||
msgconvert --outfile "$temp/$path/out.eml" "$path" 2> /dev/null
|
||||
mu view "$temp/$path/out.eml" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [message]" $RESET
|
||||
munpack -t -f -C "$(realpath $temp/$path)" 'out.eml' > /dev/null
|
||||
rm "$temp/$path/out.eml"
|
||||
fork "$temp"
|
||||
rm -r "$temp"
|
||||
#session_file_done $path
|
||||
;;
|
||||
message/*)
|
||||
echo -n "message," >> "$index"
|
||||
mu view "$path" | escape >> "$index"
|
||||
mu view "$path" 2> /dev/null | escape >> "$index"
|
||||
echo $GREEN " [message]" $RESET
|
||||
temp=$(tempfile)
|
||||
temp=$(tempfile 2>/dev/null)
|
||||
rm $temp && mkdir -p "$temp/$path"
|
||||
cp "$path" "$temp/$path/"
|
||||
munpack -t -f -C "$(realpath $temp/$path)" "$(basename $path)" > /dev/null
|
||||
rm "$temp/$path/$(basename $path)"
|
||||
fork "$temp"
|
||||
rm -r "$temp"
|
||||
session_file_done $path
|
||||
#break
|
||||
#session_file_done $path
|
||||
;;
|
||||
application/octet-stream\;)
|
||||
echo -n "raw," >> "$index"
|
||||
#strings "$path" | escape >> "$index"
|
||||
echo -n "," >> "$index"
|
||||
echo $GREEN " [raw]" $RESET
|
||||
*.tcpdump.pcap)
|
||||
echo -n "pcap," >> "$index"
|
||||
tcpdump -r "$path" -nn -A | escape >> "$index"
|
||||
echo $GREEN " [pcap]" $RESET
|
||||
;;
|
||||
application/x-raw-disk-image\;)
|
||||
application/x-raw-disk-image)
|
||||
echo -n "disk," >> "$index"
|
||||
binwalk "$path" | escape >> "$index"
|
||||
echo $GREEN " [disk]" $RESET
|
||||
;;
|
||||
application/octet-stream)
|
||||
echo -n "raw," >> "$index"
|
||||
#strings "$path" | escape >> "$index"
|
||||
echo -n "" >> "$index"
|
||||
echo $GREEN " [raw]" $RESET
|
||||
;;
|
||||
*)
|
||||
echo -n "unknown," >> "$index"
|
||||
file "$path" | grep text > /dev/null &&
|
||||
{
|
||||
echo -n "text," >> "$index"
|
||||
cat "$path" | escape >> "$index"
|
||||
echo $GREY " [unknown]" $RESET
|
||||
echo $GREEN " [text]" $RESET
|
||||
} || {
|
||||
echo -n "unknown," >> "$index"
|
||||
#strings "$path" >> "$index"
|
||||
echo -n "," >> "$index"
|
||||
echo -n "" >> "$index"
|
||||
echo $RED " [unknown]" $RESET
|
||||
echo "$path $mime" >> unknown_mime.log
|
||||
echo $RED " [error]" $RESET
|
||||
}
|
||||
;;
|
||||
esac
|
||||
Binary file not shown.
Binary file not shown.
Executable
BIN
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 150 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,15 @@
|
||||
## Continuous crawling
|
||||
|
||||
```
|
||||
JAVA_LIBRARY_PATH=/opt/opensearch/plugins/opensearch-knn/lib /opt/opensearch/bin/opensearch
|
||||
cd /opt/crawl/www && node index.js
|
||||
```
|
||||
|
||||
`/opt/crawl/opensearch.py localhost:9200 -i $INDEX -init`
|
||||
|
||||
```
|
||||
crontab -e
|
||||
30 11 * * * tmux new-session -d '/opt/crawl/cron/targets.sh ; timeout 3600 /opt/crawl/cron/scan.sh ; tmux new-window -d 'timeout $[3600*8] /opt/crawl/cron/www.sh' & tmux new-window -d 'timeout $[3600*8] /opt/crawl/cron/ftp.sh' & tmux new-window -d 'timeout $[3600*8] /opt/crawl/cron/smb.sh'
|
||||
0 23 * * * tmux new-session -d '/opt/crawl/cron/import.sh'
|
||||
0 0 * * 1 /opt/crawl/cron/clean.sh
|
||||
```
|
||||
Executable
+9
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Cleanup job: removes the host lists, CSV indexes, crawl log and `.*.sess`
# files (presumably crawl.sh resume sessions) from the current directory.
#
# `rm -f` keeps the job quiet when a file is already gone or a glob matches
# nothing; `--` stops a file name starting with `-` from being read as an option.

rm -f -- smb-hosts.txt www-hosts.txt ftp-hosts.txt

rm -f -- *.csv
rm -f -- crawl.log
rm -f -- .*.sess
|
||||
Executable
+7
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Mirrors and indexes every FTP host listed in ftp-hosts.txt (one address per
# line). Each step is capped at 300 seconds.

while read -r ip; do
  # A blank line would otherwise be spidered as "ftp:///".
  [[ -n "$ip" ]] || continue
  echo "$ip"
  timeout 300 /opt/crawl/spider.sh "ftp://$ip/"
  timeout 300 /opt/crawl/crawl.sh "$ip"
done < ftp-hosts.txt
|
||||
Executable
+8
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Imports every CSV index in the current directory into the OpenSearch index
# named by INDEX.

INDEX="company"

for csv in *.csv; do
  # With no matches the glob stays literal ("*.csv"); skip it instead of
  # handing a non-existent file to the importer.
  [[ -e "$csv" ]] || continue
  echo "$csv"
  /opt/crawl/opensearch.py localhost:9200 -i "$INDEX" -import "$csv"
done
|
||||
Executable
+13
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Port-scans every network listed in nets.txt (one per line) and appends the
# hosts with an open SMB port to smb-hosts.txt. The WWW and FTP scans are
# currently disabled (commented out below).

#PORTS_WWW="80,443,8080,8443,8000,8088,8880,8808,8888,6443,7443,9443,10443,8081"
PORTS_WWW="80,8080"
PORTS_FTP='21'
PORTS_SMB='445'

# Read the list line by line into an array: unlike `for net in $(cat ...)`,
# entries are neither word-split nor glob-expanded, and nmap keeps the
# script's own stdin.
nets=()
mapfile -t nets < nets.txt

for net in "${nets[@]}"; do
  [[ -n "$net" ]] || continue
  echo "$net"
  #nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_WWW" --open -oG - | grep 'open' | tr '/' ' ' | awk '{print $2 " " $5}' >> www-hosts.txt
  #nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_FTP" --open -oG - | grep 'open' | tr '/' ' ' | awk '{print $2}' >> ftp-hosts.txt
  nmap -Pn -n --max-retries 0 --max-rate 5 "$net" -p "$PORTS_SMB" --open -oG - | grep 'open' | tr '/' ' ' | awk '{print $2}' >> smb-hosts.txt
done
|
||||
Executable
+23
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Crawls readable SMB shares level by level.
#
# 1. Asks cme for the shares on the hosts in smb-hosts.txt and keeps the lines
#    marked READ as "ip<TAB>name<TAB>share" in shares-user.txt.
# 2. For depth 1..10, mounts every share read-only, runs crawl.sh limited to
#    that depth and to files under 100k, then unmounts.
# 3. Records the outcome in crawl.log: "+ depth //ip/share" once a level has
#    been crawled, "- depth //ip/share" when the mount failed. Levels already
#    marked "+" are skipped on the next run.
#
# NOTE(review): the password is passed on the command line of cme and
# mount.cifs and is therefore visible in the process list.

DOMAIN='company.org'
USER='iivanov'
PASS='password'

# [^[:space:]] is used instead of [^\s]: inside a bracket expression "\s" is a
# literal backslash and "s", so names containing "s" never matched. The share
# capture ends on a non-blank character so column padding before READ is not
# carried into the share name.
#cme -t 1 smb --shares smb-hosts.txt | grep ' READ ' | sed -rn 's/SMB\s+([^[:space:]]+)\s+445\s+([^[:space:]]+)\s+(.*[^[:space:]])\s+READ.+/\1\t\2\t\3/p' > shares-anon.txt
cme -t 1 smb -d "$DOMAIN" -u "$USER" -p "$PASS" --shares smb-hosts.txt \
  | grep ' READ ' \
  | sed -rn 's/SMB\s+([^[:space:]]+)\s+445\s+([^[:space:]]+)\s+(.*[^[:space:]])\s+READ.+/\1\t\2\t\3/p' \
  > shares-user.txt

for depth in {1..10}; do
  # -F: "IPC$" is a literal share name; as a regex the "$" would be an anchor.
  grep -vF 'IPC$' shares-user.txt | while IFS=$'\t' read -r ip name share; do
    echo "$ip" "$share"
    mountpoint="/mnt/$ip-$share"

    # Whole-line fixed-string match, so "Docs" does not also match "Docs2".
    grep -qxF -- "+ $depth //$ip/$share" crawl.log 2> /dev/null && continue

    # -p: the directory may be left over from an earlier failed mount.
    mkdir -p "$mountpoint"
    if ! sudo timeout 5 mount.cifs "//$ip/$share" "$mountpoint" -o ro,dom="$DOMAIN",user="$USER",pass="$PASS"; then
      echo "- $depth //$ip/$share" >> crawl.log
      continue
    fi

    timeout 300 /opt/crawl/crawl.sh "$mountpoint" -mindepth "$depth" -maxdepth "$depth" -size -100k

    sudo umount "$mountpoint"
    # rmdir only removes an empty directory, so a failed umount can never turn
    # into a recursive delete on the remote share.
    rmdir "$mountpoint"

    # Must use the same variable as the resume check above ($depth).
    echo "+ $depth //$ip/$share" >> crawl.log
  done
done
|
||||
Executable
+14
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Builds nets.txt, the list of /24 networks that contain domain computers.
#
# 1. Reads the first namingContexts value from the domain controller's LDAP
#    root.
# 2. Lists the dNSHostName of every computer object into hosts.txt.
# 3. Resolves each name through $DNS, collapses the addresses to /24 networks
#    and writes them to nets.txt, most populated network first.
#
# NOTE(review): the password is passed on the ldapsearch command line and is
# therefore visible in the process list.

USER='iivanov'
PASS='password'
DOMAIN='company.org'
DC='192.168.12.6'
DNS=$DC

namespace=$(curl -s "ldap://$DC" | grep 'namingContexts:' | head -n 1 | awk '{print $2}')
if [[ -z "$namespace" ]]; then
  # Without a search base ldapsearch would query the wrong subtree.
  echo "cannot read namingContexts from ldap://$DC" >&2
  exit 1
fi

ldapsearch -o ldif-wrap=no -E pr=10000/noprompt -D "$USER@$DOMAIN" -w "$PASS" -x -H ldap://"$DC" -b "$namespace" '(objectClass=computer)' dnshostname \
  | grep dNSHostName \
  | awk '{print $2}' \
  > hosts.txt

# The sed pattern is anchored on the whole address. The previous pattern ended
# in "[0-9]+." and so needed one extra character after the last octet, which
# silently dropped every address whose last octet has a single digit.
while read -r name; do
  host "$name" "$DNS" | grep 'has address' | awk '{print $4}'
done < hosts.txt \
  | sed -rn 's/^([0-9]+\.[0-9]+\.[0-9]+)\.[0-9]+$/\1.0\/24/p' \
  | sort | uniq -c | sort -n -r \
  | awk '{print $2}' \
  > nets.txt
|
||||
Executable
+7
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Mirrors and indexes every web server listed in www-hosts.txt
# ("address port" per line). Each step is capped at 300 seconds.

while read -r ip port; do
  # A blank line would otherwise be spidered as "http://:/".
  [[ -n "$ip" ]] || continue
  echo "$ip $port"
  timeout 300 /opt/crawl/spider.sh "http://$ip:$port/"
  timeout 300 /opt/crawl/crawl.sh "$ip:$port"
done < www-hosts.txt
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 83 KiB |
@@ -1,25 +0,0 @@
|
||||
cd path/to/crawl/linux
|
||||
|
||||
### Local crawling
|
||||
|
||||
PATH=$PATH:bin ./crawl.sh /home/ -size -10M
|
||||
|
||||
PATH=$PATH:bin ./grep.sh 'pass' / -size -10M
|
||||
|
||||
./import.sh results.csv
|
||||
|
||||
./search.sh results.db 's3cr3t'
|
||||
|
||||
### Web crawling
|
||||
|
||||
./spider.sh http://target.com/
|
||||
|
||||
cd /tmp/spider/
|
||||
|
||||
./crawl.sh target.com/ -size -10M
|
||||
|
||||
### Mails crawling
|
||||
|
||||
./imap.sh imap://server.com user:pass
|
||||
|
||||
./crawl.sh INBOX
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,205 +0,0 @@
|
||||
#!/usr/bin/python3
|
||||
"""A command line tool for extracting text and images from PDF and
|
||||
output it to plain text, html, xml or tags."""
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
sys.path = ['.'] + sys.path
|
||||
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
|
||||
logging.basicConfig()
|
||||
|
||||
OUTPUT_TYPES = ((".htm", "html"),
|
||||
(".html", "html"),
|
||||
(".xml", "xml"),
|
||||
(".tag", "tag"))
|
||||
|
||||
|
||||
def float_or_disabled(x):
    """argparse ``type=`` converter that accepts a float or the word "disabled".

    :param x: the raw command-line string.
    :return: ``x`` unchanged when it spells "disabled" (case and surrounding
        whitespace are ignored for the comparison), otherwise ``float(x)``.
    :raises argparse.ArgumentTypeError: if ``x`` is neither.
    """
    if x.lower().strip() == "disabled":
        return x
    try:
        # The converted value has to be returned: falling off the end of the
        # function made every numeric argument parse as None.
        return float(x)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
|
||||
|
||||
|
||||
def extract_text(files=[], outfile='-',
                 no_laparams=False, all_texts=None, detect_vertical=None,
                 word_margin=None, char_margin=None, line_margin=None,
                 boxes_flow=None, output_type='text', codec='utf-8',
                 strip_control=False, maxpages=0, page_numbers=None,
                 password="", scale=1.0, rotation=0, layoutmode='normal',
                 output_dir=None, debug=False, disable_caching=False,
                 **kwargs):
    """Extract the content of each file in ``files`` into one output stream.

    :param files: paths of the PDF files to process; must not be empty.
    :param outfile: output path, or "-" for ``sys.stdout``.
    :return: the output stream, still open; the caller closes it.
    :raises ValueError: if ``files`` is empty.

    NOTE: the whole local namespace is forwarded to
    ``pdfminer.high_level.extract_text_to_fp`` via ``**locals()``, and the
    layout parameters are looked up by name in ``locals()``. Renaming or
    adding a local variable therefore changes what is passed on.
    """
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed,
    # create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    # A plain "text" request is upgraded according to the output file suffix.
    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                pdfminer.high_level.extract_text_to_fp(fp, **locals())
    except BaseException:
        # Do not leak the output file when extraction fails part-way. No name
        # is bound here, so the locals() forwarding above is unaffected.
        if outfp is not sys.stdout:
            outfp.close()
        raise
    return outfp
|
||||
|
||||
|
||||
def maketheparser():
|
||||
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
||||
parser.add_argument(
|
||||
"files", type=str, default=None, nargs="+",
|
||||
help="One or more paths to PDF files.")
|
||||
|
||||
parser.add_argument(
|
||||
"--version", "-v", action="version",
|
||||
version="pdfminer.six v{}".format(pdfminer.__version__))
|
||||
parser.add_argument(
|
||||
"--debug", "-d", default=False, action="store_true",
|
||||
help="Use debug logging level.")
|
||||
parser.add_argument(
|
||||
"--disable-caching", "-C", default=False, action="store_true",
|
||||
help="If caching or resources, such as fonts, should be disabled.")
|
||||
|
||||
parse_params = parser.add_argument_group(
|
||||
'Parser', description='Used during PDF parsing')
|
||||
parse_params.add_argument(
|
||||
"--page-numbers", type=int, default=None, nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.")
|
||||
parse_params.add_argument(
|
||||
"--pagenos", "-p", type=str,
|
||||
help="A comma-separated list of page numbers to parse. "
|
||||
"Included for legacy applications, use --page-numbers "
|
||||
"for more idiomatic argument entry.")
|
||||
parse_params.add_argument(
|
||||
"--maxpages", "-m", type=int, default=0,
|
||||
help="The maximum number of pages to parse.")
|
||||
parse_params.add_argument(
|
||||
"--password", "-P", type=str, default="",
|
||||
help="The password to use for decrypting PDF file.")
|
||||
parse_params.add_argument(
|
||||
"--rotation", "-R", default=0, type=int,
|
||||
help="The number of degrees to rotate the PDF "
|
||||
"before other types of processing.")
|
||||
|
||||
la_params = parser.add_argument_group(
|
||||
'Layout analysis', description='Used during layout analysis.')
|
||||
la_params.add_argument(
|
||||
"--no-laparams", "-n", default=False, action="store_true",
|
||||
help="If layout analysis parameters should be ignored.")
|
||||
la_params.add_argument(
|
||||
"--detect-vertical", "-V", default=False, action="store_true",
|
||||
help="If vertical text should be considered during layout analysis")
|
||||
la_params.add_argument(
|
||||
"--char-margin", "-M", type=float, default=2.0,
|
||||
help="If two characters are closer together than this margin they "
|
||||
"are considered to be part of the same line. The margin is "
|
||||
"specified relative to the width of the character.")
|
||||
la_params.add_argument(
|
||||
"--word-margin", "-W", type=float, default=0.1,
|
||||
help="If two characters on the same line are further apart than this "
|
||||
"margin then they are considered to be two separate words, and "
|
||||
"an intermediate space will be added for readability. The margin "
|
||||
"is specified relative to the width of the character.")
|
||||
la_params.add_argument(
|
||||
"--line-margin", "-L", type=float, default=0.5,
|
||||
help="If two lines are are close together they are considered to "
|
||||
"be part of the same paragraph. The margin is specified "
|
||||
"relative to the height of a line.")
|
||||
la_params.add_argument(
|
||||
"--boxes-flow", "-F", type=float_or_disabled, default=0.5,
|
||||
help="Specifies how much a horizontal and vertical position of a "
|
||||
"text matters when determining the order of lines. The value "
|
||||
"should be within the range of -1.0 (only horizontal position "
|
||||
"matters) to +1.0 (only vertical position matters). You can also "
|
||||
"pass `disabled` to disable advanced layout analysis, and "
|
||||
"instead return text based on the position of the bottom left "
|
||||
"corner of the text box.")
|
||||
la_params.add_argument(
|
||||
"--all-texts", "-A", default=False, action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.")
|
||||
|
||||
output_params = parser.add_argument_group(
|
||||
'Output', description='Used during output generation.')
|
||||
output_params.add_argument(
|
||||
"--outfile", "-o", type=str, default="-",
|
||||
help="Path to file where output is written. "
|
||||
"Or \"-\" (default) to write to stdout.")
|
||||
output_params.add_argument(
|
||||
"--output_type", "-t", type=str, default="text",
|
||||
help="Type of output to generate {text,html,xml,tag}.")
|
||||
output_params.add_argument(
|
||||
"--codec", "-c", type=str, default="utf-8",
|
||||
help="Text encoding to use in output file.")
|
||||
output_params.add_argument(
|
||||
"--output-dir", "-O", default=None,
|
||||
help="The output directory to put extracted images in. If not given, "
|
||||
"images are not extracted.")
|
||||
output_params.add_argument(
|
||||
"--layoutmode", "-Y", default="normal",
|
||||
type=str, help="Type of layout to use when generating html "
|
||||
"{normal,exact,loose}. If normal,each line is"
|
||||
" positioned separately in the html. If exact"
|
||||
", each character is positioned separately in"
|
||||
" the html. If loose, same result as normal "
|
||||
"but with an additional newline after each "
|
||||
"text line. Only used when output_type is html.")
|
||||
output_params.add_argument(
|
||||
"--scale", "-s", type=float, default=1.0,
|
||||
help="The amount of zoom to use when generating html file. "
|
||||
"Only used when output_type is html.")
|
||||
output_params.add_argument(
|
||||
"--strip-control", "-S", default=False, action="store_true",
|
||||
help="Remove control statement from text. "
|
||||
"Only used when output_type is xml.")
|
||||
return parser
|
||||
|
||||
|
||||
# main
|
||||
|
||||
|
||||
def main(args=None):
    """Command-line entry point.

    :param args: argument list to parse; ``None`` lets argparse use
        ``sys.argv``.
    :return: 0 on success.
    """
    parser = maketheparser()
    options = parser.parse_args(args=args)

    # Page numbers are 1-based on the command line and 0-based internally.
    if options.page_numbers:
        options.page_numbers = {x - 1 for x in options.page_numbers}
    if options.pagenos:
        options.page_numbers = {int(x) - 1 for x in options.pagenos.split(",")}

    if options.output_type == "text" and options.outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if options.outfile.endswith(override):
                options.output_type = alttype

    outfp = extract_text(**vars(options))
    if outfp is sys.stdout:
        # Closing the interpreter's stdout would break any caller that
        # invokes main() programmatically and prints afterwards.
        outfp.flush()
    else:
        outfp.close()
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
@@ -1,12 +0,0 @@
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
|
||||
__version__ = '20201018'
|
||||
|
||||
if sys.version_info < (3, 6):
|
||||
warnings.warn('Python 3.4 and 3.5 are deprecated. '
|
||||
'Please upgrade to Python 3.6 or newer.')
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,35 +0,0 @@
|
||||
""" Python implementation of Arcfour encryption algorithm.
|
||||
See https://en.wikipedia.org/wiki/RC4
|
||||
This code is in the public domain.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class Arcfour:
    """Arcfour (RC4) stream cipher.

    The key schedule runs once in the constructor. ``process`` then consumes
    the key stream, so consecutive calls continue where the previous one
    stopped. Encryption and decryption are the same operation.
    """

    def __init__(self, key):
        """Initialise the cipher state from ``key`` (a non-empty bytes-like
        object; iterating it must yield ints)."""
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            s[i], s[j] = s[j], s[i]
        self.s = s
        self.i, self.j = 0, 0

    def process(self, data):
        """XOR ``data`` with the next ``len(data)`` key-stream bytes.

        :param data: bytes-like input.
        :return: the transformed data as ``bytes``.
        """
        i, j = self.i, self.j
        s = self.s
        # Accumulate in a bytearray: repeated ``bytes +=`` copies the whole
        # result on every step and is quadratic in the input length.
        out = bytearray()
        for c in data:
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            s[i], s[j] = s[j], s[i]
            out.append(c ^ s[(s[i] + s[j]) % 256])
        self.i, self.j = i, j
        return bytes(out)

    encrypt = decrypt = process
|
||||
@@ -1,71 +0,0 @@
|
||||
""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
|
||||
|
||||
This code is in the public domain.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import struct
|
||||
|
||||
|
||||
# ascii85decode(data)
|
||||
def ascii85decode(data):
    """Decode ASCII85 data (Adobe variant).

    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    :param data: bytes-like ASCII85 text. ``~`` ends the data; characters
        outside ``!``..``u``, ``z`` and ``~`` are ignored.
    :return: the decoded ``bytes``.
    """
    n = b = 0
    # A bytearray keeps decoding linear; ``bytes +=`` re-copies the output
    # for every group.
    out = bytearray()
    for c in data:
        if 33 <= c <= 117:  # '!' .. 'u': one base-85 digit
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                out += struct.pack('>L', b)
                n = b = 0
        elif c == 122:  # 'z': shorthand for four zero bytes
            assert n == 0, str(n)
            out += b'\0\0\0\0'
        elif c == 126:  # '~': end of data
            if n:
                # Pad the partial group with the highest digit ('u') and
                # keep only the bytes that were actually encoded.
                for _ in range(5 - n):
                    b = b * 85 + 84
                out += struct.pack('>L', b)[:n - 1]
            break
    return bytes(out)
|
||||
|
||||
|
||||
# asciihexdecode(data)
|
||||
# Complete pairs of hex digits, and a single trailing hex digit (optionally
# followed by whitespace or the '>' end-of-data marker).
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    :param data: bytes containing the hex text.
    :return: the decoded ``bytes``.
    """
    def decode(x):
        return bytes((int(x, 16),))

    # One join instead of repeated ``bytes +=``, which is quadratic.
    out = b''.join(decode(pair) for pair in hex_re.findall(data))

    # An odd trailing digit is decoded as if followed by "0".
    m = trail_re.search(data)
    if m:
        out += decode(m.group(1) + b'0')
    return out
|
||||
@@ -1,593 +0,0 @@
|
||||
# CCITT Fax decoder
|
||||
#
|
||||
# Bugs: uncompressed mode untested.
|
||||
#
|
||||
# cf.
|
||||
# ITU-T Recommendation T.4
|
||||
# "Standardization of Group 3 facsimile terminals
|
||||
# for document transmission"
|
||||
# ITU-T Recommendation T.6
|
||||
# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
|
||||
# FOR GROUP 4 FACSIMILE APPARATUS"
|
||||
|
||||
|
||||
import sys
|
||||
import array
|
||||
|
||||
|
||||
def get_bytes(data):
    """Yield the items of ``data`` one at a time (integers for ``bytes``)."""
    for item in data:
        yield item
|
||||
|
||||
|
||||
class BitParser:
    """Feeds a bit stream through a binary code trie.

    A trie node is a two-element list indexed by bit value (0 or 1); an
    element is either another node or a leaf value. Subclasses are expected
    to set ``self._state`` (the current node) and ``self._accept`` (called
    with each leaf reached; it returns the node to continue from).
    """

    def __init__(self):
        # Number of bits consumed so far.
        self._pos = 0

    @classmethod
    def add(cls, root, v, bits):
        """Store ``v`` in the trie ``root`` under the '0'/'1' string ``bits``."""
        node = root
        branch = None
        for (depth, bit) in enumerate(bits):
            if depth > 0:
                # Step into the slot chosen by the previous bit, creating
                # the child node on first use.
                if node[branch] is None:
                    node[branch] = [None, None]
                node = node[branch]
            branch = 1 if bit == '1' else 0
        node[branch] = v

    def feedbytes(self, data):
        """Feed every bit of ``data``, most significant bit first."""
        masks = (128, 64, 32, 16, 8, 4, 2, 1)
        for byte in get_bytes(data):
            for mask in masks:
                self._parse_bit(byte & mask)

    def _parse_bit(self, x):
        """Advance by one bit; ``x`` is truthy for a 1 bit."""
        reached = self._state[1] if x else self._state[0]
        self._pos += 1
        if not isinstance(reached, list):
            # Leaf: let the handler consume it and pick the next trie.
            reached = self._accept(reached)
        self._state = reached
|
||||
|
||||
|
||||
class CCITTG4Parser(BitParser):
    """Bit-driven state machine that decodes CCITT G4 coded scan lines.

    Bits fed through ``feedbytes`` walk one of the class-level code tries
    (``MODE``, ``WHITE``, ``BLACK``, ``UNCOMPRESSED``). Each completed code
    is handed to the current ``_accept`` handler, which paints pixels into
    ``_curline`` and returns the trie to use next. Every completed line is
    passed to ``output_line``.
    """

    # Mode codes, dispatched by _parse_mode: integers are vertical offsets,
    # 'h' starts a horizontal run pair, 'p' is pass mode, 'u' enters
    # uncompressed mode and 'e' raises EOFB. 'x1'..'x7' have no handler in
    # _parse_mode and therefore end in InvalidData.
    MODE = [None, None]
    BitParser.add(MODE, 0, '1')
    BitParser.add(MODE, +1, '011')
    BitParser.add(MODE, -1, '010')
    BitParser.add(MODE, 'h', '001')
    BitParser.add(MODE, 'p', '0001')
    BitParser.add(MODE, +2, '000011')
    BitParser.add(MODE, -2, '000010')
    BitParser.add(MODE, +3, '0000011')
    BitParser.add(MODE, -3, '0000010')
    BitParser.add(MODE, 'u', '0000001111')
    BitParser.add(MODE, 'x1', '0000001000')
    BitParser.add(MODE, 'x2', '0000001001')
    BitParser.add(MODE, 'x3', '0000001010')
    BitParser.add(MODE, 'x4', '0000001011')
    BitParser.add(MODE, 'x5', '0000001100')
    BitParser.add(MODE, 'x6', '0000001101')
    BitParser.add(MODE, 'x7', '0000001110')
    BitParser.add(MODE, 'e', '000000000001000000000001')

    # Run-length codes selected while the current color is 1. Values below
    # 64 finish a run; larger values are added to the code that follows
    # (see _parse_horiz1 / _parse_horiz2).
    WHITE = [None, None]
    BitParser.add(WHITE, 0, '00110101')
    BitParser.add(WHITE, 1, '000111')
    BitParser.add(WHITE, 2, '0111')
    BitParser.add(WHITE, 3, '1000')
    BitParser.add(WHITE, 4, '1011')
    BitParser.add(WHITE, 5, '1100')
    BitParser.add(WHITE, 6, '1110')
    BitParser.add(WHITE, 7, '1111')
    BitParser.add(WHITE, 8, '10011')
    BitParser.add(WHITE, 9, '10100')
    BitParser.add(WHITE, 10, '00111')
    BitParser.add(WHITE, 11, '01000')
    BitParser.add(WHITE, 12, '001000')
    BitParser.add(WHITE, 13, '000011')
    BitParser.add(WHITE, 14, '110100')
    BitParser.add(WHITE, 15, '110101')
    BitParser.add(WHITE, 16, '101010')
    BitParser.add(WHITE, 17, '101011')
    BitParser.add(WHITE, 18, '0100111')
    BitParser.add(WHITE, 19, '0001100')
    BitParser.add(WHITE, 20, '0001000')
    BitParser.add(WHITE, 21, '0010111')
    BitParser.add(WHITE, 22, '0000011')
    BitParser.add(WHITE, 23, '0000100')
    BitParser.add(WHITE, 24, '0101000')
    BitParser.add(WHITE, 25, '0101011')
    BitParser.add(WHITE, 26, '0010011')
    BitParser.add(WHITE, 27, '0100100')
    BitParser.add(WHITE, 28, '0011000')
    BitParser.add(WHITE, 29, '00000010')
    BitParser.add(WHITE, 30, '00000011')
    BitParser.add(WHITE, 31, '00011010')
    BitParser.add(WHITE, 32, '00011011')
    BitParser.add(WHITE, 33, '00010010')
    BitParser.add(WHITE, 34, '00010011')
    BitParser.add(WHITE, 35, '00010100')
    BitParser.add(WHITE, 36, '00010101')
    BitParser.add(WHITE, 37, '00010110')
    BitParser.add(WHITE, 38, '00010111')
    BitParser.add(WHITE, 39, '00101000')
    BitParser.add(WHITE, 40, '00101001')
    BitParser.add(WHITE, 41, '00101010')
    BitParser.add(WHITE, 42, '00101011')
    BitParser.add(WHITE, 43, '00101100')
    BitParser.add(WHITE, 44, '00101101')
    BitParser.add(WHITE, 45, '00000100')
    BitParser.add(WHITE, 46, '00000101')
    BitParser.add(WHITE, 47, '00001010')
    BitParser.add(WHITE, 48, '00001011')
    BitParser.add(WHITE, 49, '01010010')
    BitParser.add(WHITE, 50, '01010011')
    BitParser.add(WHITE, 51, '01010100')
    BitParser.add(WHITE, 52, '01010101')
    BitParser.add(WHITE, 53, '00100100')
    BitParser.add(WHITE, 54, '00100101')
    BitParser.add(WHITE, 55, '01011000')
    BitParser.add(WHITE, 56, '01011001')
    BitParser.add(WHITE, 57, '01011010')
    BitParser.add(WHITE, 58, '01011011')
    BitParser.add(WHITE, 59, '01001010')
    BitParser.add(WHITE, 60, '01001011')
    BitParser.add(WHITE, 61, '00110010')
    BitParser.add(WHITE, 62, '00110011')
    BitParser.add(WHITE, 63, '00110100')
    BitParser.add(WHITE, 64, '11011')
    BitParser.add(WHITE, 128, '10010')
    BitParser.add(WHITE, 192, '010111')
    BitParser.add(WHITE, 256, '0110111')
    BitParser.add(WHITE, 320, '00110110')
    BitParser.add(WHITE, 384, '00110111')
    BitParser.add(WHITE, 448, '01100100')
    BitParser.add(WHITE, 512, '01100101')
    BitParser.add(WHITE, 576, '01101000')
    BitParser.add(WHITE, 640, '01100111')
    BitParser.add(WHITE, 704, '011001100')
    BitParser.add(WHITE, 768, '011001101')
    BitParser.add(WHITE, 832, '011010010')
    BitParser.add(WHITE, 896, '011010011')
    BitParser.add(WHITE, 960, '011010100')
    BitParser.add(WHITE, 1024, '011010101')
    BitParser.add(WHITE, 1088, '011010110')
    BitParser.add(WHITE, 1152, '011010111')
    BitParser.add(WHITE, 1216, '011011000')
    BitParser.add(WHITE, 1280, '011011001')
    BitParser.add(WHITE, 1344, '011011010')
    BitParser.add(WHITE, 1408, '011011011')
    BitParser.add(WHITE, 1472, '010011000')
    BitParser.add(WHITE, 1536, '010011001')
    BitParser.add(WHITE, 1600, '010011010')
    BitParser.add(WHITE, 1664, '011000')
    BitParser.add(WHITE, 1728, '010011011')
    BitParser.add(WHITE, 1792, '00000001000')
    BitParser.add(WHITE, 1856, '00000001100')
    BitParser.add(WHITE, 1920, '00000001101')
    BitParser.add(WHITE, 1984, '000000010010')
    BitParser.add(WHITE, 2048, '000000010011')
    BitParser.add(WHITE, 2112, '000000010100')
    BitParser.add(WHITE, 2176, '000000010101')
    BitParser.add(WHITE, 2240, '000000010110')
    BitParser.add(WHITE, 2304, '000000010111')
    BitParser.add(WHITE, 2368, '000000011100')
    BitParser.add(WHITE, 2432, '000000011101')
    BitParser.add(WHITE, 2496, '000000011110')
    BitParser.add(WHITE, 2560, '000000011111')

    # Run-length codes selected while the current color is 0; same value
    # convention as WHITE.
    BLACK = [None, None]
    BitParser.add(BLACK, 0, '0000110111')
    BitParser.add(BLACK, 1, '010')
    BitParser.add(BLACK, 2, '11')
    BitParser.add(BLACK, 3, '10')
    BitParser.add(BLACK, 4, '011')
    BitParser.add(BLACK, 5, '0011')
    BitParser.add(BLACK, 6, '0010')
    BitParser.add(BLACK, 7, '00011')
    BitParser.add(BLACK, 8, '000101')
    BitParser.add(BLACK, 9, '000100')
    BitParser.add(BLACK, 10, '0000100')
    BitParser.add(BLACK, 11, '0000101')
    BitParser.add(BLACK, 12, '0000111')
    BitParser.add(BLACK, 13, '00000100')
    BitParser.add(BLACK, 14, '00000111')
    BitParser.add(BLACK, 15, '000011000')
    BitParser.add(BLACK, 16, '0000010111')
    BitParser.add(BLACK, 17, '0000011000')
    BitParser.add(BLACK, 18, '0000001000')
    BitParser.add(BLACK, 19, '00001100111')
    BitParser.add(BLACK, 20, '00001101000')
    BitParser.add(BLACK, 21, '00001101100')
    BitParser.add(BLACK, 22, '00000110111')
    BitParser.add(BLACK, 23, '00000101000')
    BitParser.add(BLACK, 24, '00000010111')
    BitParser.add(BLACK, 25, '00000011000')
    BitParser.add(BLACK, 26, '000011001010')
    BitParser.add(BLACK, 27, '000011001011')
    BitParser.add(BLACK, 28, '000011001100')
    BitParser.add(BLACK, 29, '000011001101')
    BitParser.add(BLACK, 30, '000001101000')
    BitParser.add(BLACK, 31, '000001101001')
    BitParser.add(BLACK, 32, '000001101010')
    BitParser.add(BLACK, 33, '000001101011')
    BitParser.add(BLACK, 34, '000011010010')
    BitParser.add(BLACK, 35, '000011010011')
    BitParser.add(BLACK, 36, '000011010100')
    BitParser.add(BLACK, 37, '000011010101')
    BitParser.add(BLACK, 38, '000011010110')
    BitParser.add(BLACK, 39, '000011010111')
    BitParser.add(BLACK, 40, '000001101100')
    BitParser.add(BLACK, 41, '000001101101')
    BitParser.add(BLACK, 42, '000011011010')
    BitParser.add(BLACK, 43, '000011011011')
    BitParser.add(BLACK, 44, '000001010100')
    BitParser.add(BLACK, 45, '000001010101')
    BitParser.add(BLACK, 46, '000001010110')
    BitParser.add(BLACK, 47, '000001010111')
    BitParser.add(BLACK, 48, '000001100100')
    BitParser.add(BLACK, 49, '000001100101')
    BitParser.add(BLACK, 50, '000001010010')
    BitParser.add(BLACK, 51, '000001010011')
    BitParser.add(BLACK, 52, '000000100100')
    BitParser.add(BLACK, 53, '000000110111')
    BitParser.add(BLACK, 54, '000000111000')
    BitParser.add(BLACK, 55, '000000100111')
    BitParser.add(BLACK, 56, '000000101000')
    BitParser.add(BLACK, 57, '000001011000')
    BitParser.add(BLACK, 58, '000001011001')
    BitParser.add(BLACK, 59, '000000101011')
    BitParser.add(BLACK, 60, '000000101100')
    BitParser.add(BLACK, 61, '000001011010')
    BitParser.add(BLACK, 62, '000001100110')
    BitParser.add(BLACK, 63, '000001100111')
    BitParser.add(BLACK, 64, '0000001111')
    BitParser.add(BLACK, 128, '000011001000')
    BitParser.add(BLACK, 192, '000011001001')
    BitParser.add(BLACK, 256, '000001011011')
    BitParser.add(BLACK, 320, '000000110011')
    BitParser.add(BLACK, 384, '000000110100')
    BitParser.add(BLACK, 448, '000000110101')
    BitParser.add(BLACK, 512, '0000001101100')
    BitParser.add(BLACK, 576, '0000001101101')
    BitParser.add(BLACK, 640, '0000001001010')
    BitParser.add(BLACK, 704, '0000001001011')
    BitParser.add(BLACK, 768, '0000001001100')
    BitParser.add(BLACK, 832, '0000001001101')
    BitParser.add(BLACK, 896, '0000001110010')
    BitParser.add(BLACK, 960, '0000001110011')
    BitParser.add(BLACK, 1024, '0000001110100')
    BitParser.add(BLACK, 1088, '0000001110101')
    BitParser.add(BLACK, 1152, '0000001110110')
    BitParser.add(BLACK, 1216, '0000001110111')
    BitParser.add(BLACK, 1280, '0000001010010')
    BitParser.add(BLACK, 1344, '0000001010011')
    BitParser.add(BLACK, 1408, '0000001010100')
    BitParser.add(BLACK, 1472, '0000001010101')
    BitParser.add(BLACK, 1536, '0000001011010')
    BitParser.add(BLACK, 1600, '0000001011011')
    BitParser.add(BLACK, 1664, '0000001100100')
    BitParser.add(BLACK, 1728, '0000001100101')
    BitParser.add(BLACK, 1792, '00000001000')
    BitParser.add(BLACK, 1856, '00000001100')
    BitParser.add(BLACK, 1920, '00000001101')
    BitParser.add(BLACK, 1984, '000000010010')
    BitParser.add(BLACK, 2048, '000000010011')
    BitParser.add(BLACK, 2112, '000000010100')
    BitParser.add(BLACK, 2176, '000000010101')
    BitParser.add(BLACK, 2240, '000000010110')
    BitParser.add(BLACK, 2304, '000000010111')
    BitParser.add(BLACK, 2368, '000000011100')
    BitParser.add(BLACK, 2432, '000000011101')
    BitParser.add(BLACK, 2496, '000000011110')
    BitParser.add(BLACK, 2560, '000000011111')

    # Uncompressed-mode codes. Leaf values are literal bit strings written
    # by _do_uncompressed; a leading 'T' marks a code that leaves
    # uncompressed mode, and the character after 'T' becomes the new color
    # (see _parse_uncompressed).
    UNCOMPRESSED = [None, None]
    BitParser.add(UNCOMPRESSED, '1', '1')
    BitParser.add(UNCOMPRESSED, '01', '01')
    BitParser.add(UNCOMPRESSED, '001', '001')
    BitParser.add(UNCOMPRESSED, '0001', '0001')
    BitParser.add(UNCOMPRESSED, '00001', '00001')
    BitParser.add(UNCOMPRESSED, '00000', '000001')
    BitParser.add(UNCOMPRESSED, 'T00', '00000011')
    BitParser.add(UNCOMPRESSED, 'T10', '00000010')
    BitParser.add(UNCOMPRESSED, 'T000', '000000011')
    BitParser.add(UNCOMPRESSED, 'T100', '000000010')
    BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
    BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
    BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
    BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')

    class EOFB(Exception):
        """Raised by _parse_mode for the 'e' code; feedbytes stops on it."""
        pass

    class InvalidData(Exception):
        """Raised when a decoded value cannot be handled in the current state."""
        pass

    class ByteSkip(Exception):
        """Raised by _flush_line after a completed line when bytealign is set."""
        pass

    def __init__(self, width, bytealign=False):
        """Create a parser for lines of ``width`` pixels.

        :param width: number of pixels per line.
        :param bytealign: when truthy, the remaining bits of the current
            input byte are skipped after each completed line.
        """
        BitParser.__init__(self)
        self.width = width
        self.bytealign = bytealign
        self.reset()
        return

    def feedbytes(self, data):
        """Feed ``data`` bit by bit (MSB first) until it ends or EOFB is raised."""
        for byte in get_bytes(data):
            try:
                for m in (128, 64, 32, 16, 8, 4, 2, 1):
                    self._parse_bit(byte & m)
            except self.ByteSkip:
                # Drop the rest of this byte and restart in mode state.
                self._accept = self._parse_mode
                self._state = self.MODE
            except self.EOFB:
                break
        return

    def _parse_mode(self, mode):
        """Handle one MODE code and return the trie to parse next.

        :raises EOFB: for the 'e' code.
        :raises InvalidData: for any value without a handler, including
            ``None`` from an unfilled trie slot.
        """
        if mode == 'p':
            self._do_pass()
            self._flush_line()
            return self.MODE
        elif mode == 'h':
            self._n1 = 0
            self._accept = self._parse_horiz1
            if self._color:
                return self.WHITE
            else:
                return self.BLACK
        elif mode == 'u':
            self._accept = self._parse_uncompressed
            return self.UNCOMPRESSED
        elif mode == 'e':
            raise self.EOFB
        elif isinstance(mode, int):
            self._do_vertical(mode)
            self._flush_line()
            return self.MODE
        else:
            raise self.InvalidData(mode)

    def _parse_horiz1(self, n):
        """Accumulate the first run of a horizontal pair into ``_n1``.

        A value below 64 ends the run: the color is flipped and parsing
        continues with the second run.
        """
        if n is None:
            raise self.InvalidData
        self._n1 += n
        if n < 64:
            self._n2 = 0
            self._color = 1-self._color
            self._accept = self._parse_horiz2
        if self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_horiz2(self, n):
        """Accumulate the second run of a horizontal pair into ``_n2``.

        A value below 64 ends the run: the color is flipped back, both runs
        are painted and parsing returns to mode state.
        """
        if n is None:
            raise self.InvalidData
        self._n2 += n
        if n < 64:
            self._color = 1-self._color
            self._accept = self._parse_mode
            self._do_horizontal(self._n1, self._n2)
            self._flush_line()
            return self.MODE
        elif self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_uncompressed(self, bits):
        """Handle one UNCOMPRESSED code (a literal bit string, see the table)."""
        if not bits:
            raise self.InvalidData
        if bits.startswith('T'):
            # Terminating code: bits[1] is the next color, the rest is data.
            self._accept = self._parse_mode
            self._color = int(bits[1])
            self._do_uncompressed(bits[2:])
            return self.MODE
        else:
            self._do_uncompressed(bits)
            return self.UNCOMPRESSED

    def _get_bits(self):
        """Return the current line up to the cursor as a string of digits."""
        return ''.join(str(b) for b in self._curline[:self._curpos])

    def _get_refline(self, i):
        """Return the reference line as digits, with position ``i`` bracketed.

        An out-of-range ``i`` puts an empty ``[]`` before or after the line.
        """
        if i < 0:
            return '[]'+''.join(str(b) for b in self._refline)
        elif len(self._refline) <= i:
            return ''.join(str(b) for b in self._refline)+'[]'
        else:
            return (''.join(str(b) for b in self._refline[:i]) +
                    '['+str(self._refline[i])+']' +
                    ''.join(str(b) for b in self._refline[i+1:]))

    def reset(self):
        """Return to line 0 with all-ones lines and mode-state parsing."""
        self._y = 0
        self._curline = array.array('b', [1]*self.width)
        self._reset_line()
        self._accept = self._parse_mode
        self._state = self.MODE
        return

    def output_line(self, y, bits):
        """Receive completed line ``y``; this base version prints it."""
        print(y, ''.join(str(b) for b in bits))
        return

    def _reset_line(self):
        """Make the current line the reference line and start a new one."""
        self._refline = self._curline
        self._curline = array.array('b', [1]*self.width)
        # -1 means nothing has been placed on this line yet.
        self._curpos = -1
        self._color = 1
        return

    def _flush_line(self):
        """Emit the current line once the cursor has reached ``width``.

        :raises ByteSkip: after emitting a line when ``bytealign`` is truthy.
        """
        if self.width <= self._curpos:
            self.output_line(self._y, self._curline)
            self._y += 1
            self._reset_line()
            if self.bytealign:
                raise self.ByteSkip
        return

    def _do_vertical(self, dx):
        """Paint up to a reference-line transition shifted by ``dx``.

        Scans the reference line from ``_curpos+1`` for the next position
        where it changes from the current color to the other one (or its
        end), adds ``dx``, fills the current line between the cursor and that
        position with the current color, moves the cursor there and flips
        the color.
        """
        x1 = self._curpos+1
        while 1:
            if x1 == 0:
                if (self._color == 1 and self._refline[x1] != self._color):
                    break
            elif x1 == len(self._refline):
                break
            elif (self._refline[x1-1] == self._color and
                  self._refline[x1] != self._color):
                break
            x1 += 1
        x1 += dx
        x0 = max(0, self._curpos)
        # Clamp the target position to the line.
        x1 = max(0, min(self.width, x1))
        if x1 < x0:
            for x in range(x1, x0):
                self._curline[x] = self._color
        elif x0 < x1:
            for x in range(x0, x1):
                self._curline[x] = self._color
        self._curpos = x1
        self._color = 1-self._color
        return

    def _do_pass(self):
        """Paint past the next two reference-line transitions.

        Finds the next change away from the current color on the reference
        line, then the following change back to it, and fills the current
        line up to that position. The color is left unchanged.
        """
        x1 = self._curpos+1
        while 1:
            if x1 == 0:
                if (self._color == 1 and self._refline[x1] != self._color):
                    break
            elif x1 == len(self._refline):
                break
            elif (self._refline[x1-1] == self._color and
                  self._refline[x1] != self._color):
                break
            x1 += 1
        while 1:
            if x1 == 0:
                if (self._color == 0 and self._refline[x1] == self._color):
                    break
            elif x1 == len(self._refline):
                break
            elif (self._refline[x1-1] != self._color and
                  self._refline[x1] == self._color):
                break
            x1 += 1
        # NOTE(review): when _curpos is still -1 this range starts at index
        # -1, which addresses the last element of the array — confirm that
        # this is intended.
        for x in range(self._curpos, x1):
            self._curline[x] = self._color
        self._curpos = x1
        return

    def _do_horizontal(self, n1, n2):
        """Paint ``n1`` pixels of the current color, then ``n2`` of the other.

        Painting stops at the end of the line; the cursor ends after the
        last painted pixel.
        """
        if self._curpos < 0:
            self._curpos = 0
        x = self._curpos
        for _ in range(n1):
            if len(self._curline) <= x:
                break
            self._curline[x] = self._color
            x += 1
        for _ in range(n2):
            if len(self._curline) <= x:
                break
            self._curline[x] = 1-self._color
            x += 1
        self._curpos = x
        return

    def _do_uncompressed(self, bits):
        """Write each character of ``bits`` as a pixel at the cursor."""
        # NOTE(review): with _curpos == -1 the first write goes to index -1
        # (the last element). The file header lists uncompressed mode as
        # untested — confirm before relying on it.
        for c in bits:
            self._curline[self._curpos] = int(c)
            self._curpos += 1
            self._flush_line()
        return
|
||||
|
||||
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
    """CCITTG4Parser that packs decoded lines into a byte string.

    Each completed line is packed eight pixels per byte, first pixel in the
    most significant bit, and appended to an internal buffer that
    ``close()`` returns.
    """

    def __init__(self, width, bytealign=False, reversed=False):
        """
        :param width: number of pixels per line.
        :param bytealign: forwarded to CCITTG4Parser.
        :param reversed: when truthy, every pixel is inverted before packing
            (ccittfaxdecode passes the ``BlackIs1`` parameter here).
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        self._buf = b''
        return

    def close(self):
        """Return all bytes produced so far."""
        return self._buf

    def output_line(self, y, bits):
        """Pack one decoded line and append it to the buffer.

        :param y: line number (unused here).
        :param bits: sequence of 0/1 pixel values.
        """
        # One byte per 8 pixels, rounded up; unused low bits stay 0.
        packed = array.array('B', [0]*((len(bits)+7)//8))
        if self.reversed:
            bits = [1-b for b in bits]
        for (i, b) in enumerate(bits):
            if b:
                packed[i//8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
        # array.tostring() was removed in Python 3.9; tobytes() is the
        # equivalent that exists on every Python 3 version.
        self._buf += packed.tobytes()
        return
|
||||
|
||||
|
||||
def ccittfaxdecode(data, params):
    """Decode CCITT fax ``data`` according to the filter ``params``.

    Only ``K == -1`` is handled. ``Columns``, ``EncodedByteAlign`` and
    ``BlackIs1`` are read from ``params`` and passed to CCITTFaxDecoder.

    :param data: encoded bytes.
    :param params: mapping with a ``get`` method holding the parameters.
    :return: the decoder's accumulated output bytes.
    :raises ValueError: carrying the ``K`` value when it is not -1.
    """
    k_value = params.get('K')
    if k_value != -1:
        raise ValueError(k_value)
    decoder = CCITTFaxDecoder(
        params.get('Columns'),
        bytealign=params.get('EncodedByteAlign'),
        reversed=params.get('BlackIs1'))
    decoder.feedbytes(data)
    return decoder.close()
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
    """Manual test driver.

    Without arguments it runs ``unittest.main()``. Otherwise every argument
    is a file whose name has six dot-separated fields, the fourth being the
    image width; each file is decoded and rendered to ``out.bmp`` with
    pygame.

    :param argv: argument list in ``sys.argv`` form.
    """
    if not argv[1:]:
        import unittest
        return unittest.main()

    class Parser(CCITTG4Parser):
        """Parser that draws each decoded line onto a pygame surface."""

        def __init__(self, width, bytealign=False):
            import pygame
            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            self.img = pygame.Surface((self.width, 1000))
            return

        def output_line(self, y, bits):
            for (x, b) in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))
            return

        def close(self):
            import pygame
            pygame.image.save(self.img, 'out.bmp')
            return

    for path in argv[1:]:
        # The context manager closes the file even when the name does not
        # split into six fields or decoding raises.
        with open(path, 'rb') as fp:
            (_, _, k, w, h, _) = path.split('.')
            parser = Parser(int(w))
            parser.feedbytes(fp.read())
            parser.close()
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
@@ -1,425 +0,0 @@
|
||||
""" Adobe character mapping (CMap) support.
|
||||
|
||||
CMaps provide the mapping between character codes and Unicode
|
||||
code-points to character ids (CIDs).
|
||||
|
||||
More information is available on the Adobe website:
|
||||
|
||||
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
|
||||
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import os.path
|
||||
import gzip
|
||||
import pickle as pickle
|
||||
import struct
|
||||
import logging
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import PSSyntaxError
|
||||
from .psparser import PSEOF
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import literal_name
|
||||
from .psparser import KWD
|
||||
from .encodingdb import name2unicode
|
||||
from .utils import choplist
|
||||
from .utils import nunpack
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CMapError(Exception):
    """Base exception of this module; CMapDB.CMapNotFound derives from it."""
|
||||
|
||||
|
||||
class CMapBase:
    """Common base of the CMap classes: an attribute dict plus no-op hooks.

    The ``add_code2cid``, ``add_cid2unichr`` and ``use_cmap`` hooks do
    nothing here; subclasses override the ones they support.
    """

    debug = 0

    def __init__(self, **kwargs):
        # Keep a private copy of the keyword attributes.
        self.attrs = dict(kwargs)

    def is_vertical(self):
        """Return True when the ``WMode`` attribute is set and non-zero."""
        return self.attrs.get('WMode', 0) != 0

    def set_attr(self, k, v):
        """Store attribute ``k`` with value ``v``."""
        self.attrs[k] = v

    def add_code2cid(self, code, cid):
        """Hook: ignored by this base class."""

    def add_cid2unichr(self, cid, code):
        """Hook: ignored by this base class."""

    def use_cmap(self, cmap):
        """Hook: ignored by this base class."""
|
||||
|
||||
|
||||
class CMap(CMapBase):
    """Character-code to CID mapping stored as nested dicts.

    ``code2cid`` maps a code unit either to an int CID or to another dict
    that is keyed by the next unit of the code.
    """

    def __init__(self, **kwargs):
        CMapBase.__init__(self, **kwargs)
        self.code2cid = {}

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def use_cmap(self, cmap):
        """Copy every entry of ``cmap`` into this map.

        Nested dicts are rebuilt as new dicts, so the two maps share no
        dict objects afterwards. An existing entry under the same key is
        replaced.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone_into(target, source):
            for (key, value) in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                subtree = {}
                target[key] = subtree
                clone_into(subtree, value)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code):
        """Yield the CID of every code found while walking ``code``.

        A unit with no entry at the current level restarts the walk at the
        root with the next unit.
        """
        log.debug('decode: %r, %r', self, code)
        node = self.code2cid
        for unit in code:
            if unit not in node:
                node = self.code2cid
                continue
            node = node[unit]
            if isinstance(node, int):
                yield node
                node = self.code2cid

    def dump(self, out=sys.stdout, code2cid=None, code=None):
        """Write one ``code ... = cid ...`` line per entry to ``out``.

        ``code2cid`` and ``code`` carry the recursion state; callers
        normally leave them unset.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for (unit, entry) in sorted(code2cid.items()):
            path = code+(unit,)
            if not isinstance(entry, int):
                self.dump(out=out, code2cid=entry, code=path)
            else:
                out.write('code %r = cid %d\n' % (path, entry))
|
||||
|
||||
|
||||
class IdentityCMap(CMapBase):
    """CMap that reads codes as big-endian 16-bit values."""

    def decode(self, code):
        """Unpack ``code`` into a tuple of unsigned 16-bit integers.

        Returns an empty tuple when ``code`` is shorter than two bytes.
        """
        count = len(code)//2
        if not count:
            return ()
        return struct.unpack('>%dH' % count, code)
|
||||
|
||||
|
||||
class IdentityCMapByte(IdentityCMap):
    """CMap that reads every single byte as one value."""

    def decode(self, code):
        """Unpack ``code`` into a tuple with one integer per byte.

        Returns an empty tuple for empty input.
        """
        count = len(code)
        if not count:
            return ()
        return struct.unpack('>%dB' % count, code)
|
||||
|
||||
|
||||
class UnicodeMap(CMapBase):
    """CID to Unicode string mapping kept in ``cid2unichr``."""

    def __init__(self, **kwargs):
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr = {}

    def __repr__(self):
        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')

    def get_unichr(self, cid):
        """Return the entry stored for ``cid``; raises KeyError if absent."""
        log.debug('get_unichr: %r, %r', self, cid)
        return self.cid2unichr[cid]

    def dump(self, out=sys.stdout):
        """Write one ``cid ... = unicode ...`` line per entry, sorted by CID."""
        for (cid, text) in sorted(self.cid2unichr.items()):
            out.write('cid %d = unicode %r\n' % (cid, text))
|
||||
|
||||
|
||||
class FileCMap(CMap):
    """CMap that is filled entry by entry through ``add_code2cid``."""

    def add_code2cid(self, code, cid):
        """Register ``cid`` for the character sequence ``code``.

        Every character except the last selects (or creates) a nested dict
        keyed by its ordinal; the last character's ordinal maps to ``cid``.

        :param code: ``str`` with at least one character.
        :param cid: integer CID.
        """
        assert isinstance(code, str) and isinstance(cid, int),\
            str((type(code), type(cid)))
        node = self.code2cid
        for ch in code[:-1]:
            node = node.setdefault(ord(ch), {})
        node[ord(code[-1])] = cid
|
||||
|
||||
|
||||
class FileUnicodeMap(UnicodeMap):
    """UnicodeMap that is filled entry by entry through ``add_cid2unichr``."""

    def add_cid2unichr(self, cid, code):
        """Store the text for ``cid``, converting ``code`` by its type.

        :param cid: integer CID.
        :param code: a PSLiteral (its name goes through ``name2unicode``),
            ``bytes`` (decoded as UTF-16BE, ignoring errors) or an int
            (converted with ``chr``).
        :raises TypeError: for any other type of ``code``.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode('UTF-16BE', 'ignore')
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise TypeError(code)
        self.cid2unichr[cid] = text
|
||||
|
||||
|
||||
class PyCMap(CMap):
    """CMap backed by the ``CODE2CID`` table of a loaded data object."""

    def __init__(self, name, module):
        """
        :param name: stored as the ``CMapName`` attribute.
        :param module: object that provides ``CODE2CID`` and ``IS_VERTICAL``.
        """
        CMap.__init__(self, CMapName=name)
        # The table is used as is, not copied.
        self.code2cid = module.CODE2CID
        if module.IS_VERTICAL:
            self.attrs['WMode'] = 1
|
||||
|
||||
|
||||
class PyUnicodeMap(UnicodeMap):
    """UnicodeMap backed by a table of a loaded data object."""

    def __init__(self, name, module, vertical):
        """
        :param name: stored as the ``CMapName`` attribute.
        :param module: object that provides ``CID2UNICHR_H`` and
            ``CID2UNICHR_V``.
        :param vertical: truthy selects the ``_V`` table and sets ``WMode``
            to 1; otherwise the ``_H`` table is used.
        """
        UnicodeMap.__init__(self, CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs['WMode'] = 1
|
||||
|
||||
|
||||
class CMapDB:
    """Loader and cache for predefined CMaps and Unicode maps.

    Data files are named ``<name>.pickle.gz`` and are looked up first in the
    directory given by the ``CMAP_PATH`` environment variable (default
    ``/usr/share/pdfminer/``), then in the ``cmap`` directory next to this
    module.
    """

    _cmap_cache = {}
    _umap_cache = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested name."""
        pass

    @classmethod
    def _load_data(cls, name):
        """Load the data file for ``name`` and return it as a new class.

        The unpickled dict becomes the attribute namespace of the class.

        :param name: CMap name; NUL characters are removed first.
        :raises CMapDB.CMapNotFound: if the name contains a path component
            or no search directory holds the file.
        """
        name = name.replace("\0", "")
        filename = '%s.pickle.gz' % name
        log.info('loading: %r', name)
        # A name containing path separators could point outside the search
        # directories; accept plain file names only.
        if os.path.basename(filename) != filename:
            raise CMapDB.CMapNotFound(name)
        cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
                      os.path.join(os.path.dirname(__file__), 'cmap'),)
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                with gzip.open(path) as gzfile:
                    # NOTE(security): pickle.loads can execute arbitrary
                    # code from the file; the search directories must only
                    # contain trusted data.
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name):
        """Return the CMap called ``name``.

        The four identity names return a new identity map on every call;
        other names are loaded from disk once and cached.

        :raises CMapDB.CMapNotFound: if no data file exists for ``name``.
        """
        if name == 'Identity-H':
            return IdentityCMap(WMode=0)
        elif name == 'Identity-V':
            return IdentityCMap(WMode=1)
        elif name == 'OneByteIdentityH':
            return IdentityCMapByte(WMode=0)
        elif name == 'OneByteIdentityV':
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name, vertical=False):
        """Return the Unicode map ``to-unicode-<name>``.

        Both the horizontal and the vertical variant are built on first use
        and cached as a two-element list indexed by ``vertical``.

        :raises CMapDB.CMapNotFound: if no data file exists for ``name``.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data('to-unicode-%s' % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
                                 for v in (False, True)]
        return cls._umap_cache[name][vertical]
|
||||
|
||||
|
||||
class CMapParser(PSStackParser):
    """Reads a CMap program and feeds its entries into a CMap object.

    Operands are collected on the parser stack; each ``end...`` keyword
    consumes them and calls the matching ``add_*`` hook of ``self.cmap``.
    Malformed entries are skipped.
    """

    def __init__(self, cmap, fp):
        """
        :param cmap: object that receives ``set_attr``, ``use_cmap``,
            ``add_code2cid`` and ``add_cid2unichr`` calls.
        :param fp: file object handed to PSStackParser.
        """
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        return

    def run(self):
        """Parse the input; reaching end of input (PSEOF) ends it normally."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    KEYWORD_BEGINCMAP = KWD(b'begincmap')
    KEYWORD_ENDCMAP = KWD(b'endcmap')
    KEYWORD_USECMAP = KWD(b'usecmap')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange')
    KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange')
    KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange')
    KEYWORD_ENDCIDRANGE = KWD(b'endcidrange')
    KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar')
    KEYWORD_ENDCIDCHAR = KWD(b'endcidchar')
    KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange')
    KEYWORD_ENDBFRANGE = KWD(b'endbfrange')
    KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar')
    KEYWORD_ENDBFCHAR = KWD(b'endbfchar')
    KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
    KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')

    def do_keyword(self, pos, token):
        """Handle one keyword token.

        CMap keywords are processed here; any other keyword is pushed on
        the stack as an operand. Nothing but ``begincmap`` is processed
        after ``endcmap`` has been seen.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return
        #
        if token is self.KEYWORD_DEF:
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, cid) in choplist(3, objs):
                # NOTE(review): this accepts only ``str`` operands while the
                # bfrange/bfchar branches below accept ``bytes`` — presumably
                # a Python 2 leftover; confirm against what the tokenizer
                # produces before changing it.
                if (not isinstance(s, str) or not isinstance(e, str) or
                        not isinstance(cid, int) or len(s) != len(e)):
                    continue
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix:
                    continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                for i in range(e1-s1+1):
                    x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                    self.cmap.add_code2cid(x, cid+i)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return
        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                # NOTE(review): same ``str``-only check as in the cidrange
                # branch — confirm.
                if isinstance(code, str) and isinstance(cid, str):
                    self.cmap.add_code2cid(code, nunpack(cid))
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDBFRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, code) in choplist(3, objs):
                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                        len(s) != len(e)):
                    continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                if isinstance(code, list):
                    # zip() stops at the shorter side, so an array with
                    # fewer entries than the range no longer raises
                    # IndexError; the entries that exist are still used.
                    for (cid, value) in zip(range(s1, e1+1), code):
                        self.cmap.add_cid2unichr(cid, value)
                elif isinstance(code, bytes):
                    # The last (up to) four bytes are a counter that is
                    # incremented along the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(e1-s1+1):
                        x = prefix+struct.pack('>L', base+i)[-vlen:]
                        self.cmap.add_cid2unichr(s1+i, x)
                # Any other destination type is malformed and is skipped,
                # like the other malformed entries above.
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return
        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        self.push((pos, token))
        return
|
||||
|
||||
|
||||
def main(argv):
    """Parse every CMap file named on the command line and dump its mapping.

    :param argv: full argument vector; ``argv[0]`` is skipped and each
        remaining entry is opened as a binary file.
    :returns: ``None``.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when parsing raises,
        # so a malformed input no longer leaks the handle.
        with open(fname, 'rb') as fp:
            cmap = FileUnicodeMap()
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: the value returned by main() becomes the exit status.
    raise SystemExit(main(sys.argv))
|
||||
@@ -1,587 +0,0 @@
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from .pdfdevice import PDFTextDevice
|
||||
from .pdffont import PDFUnicodeNotDefined
|
||||
from .layout import LTContainer
|
||||
from .layout import LTPage
|
||||
from .layout import LTText
|
||||
from .layout import LTLine
|
||||
from .layout import LTRect
|
||||
from .layout import LTCurve
|
||||
from .layout import LTFigure
|
||||
from .layout import LTImage
|
||||
from .layout import LTChar
|
||||
from .layout import LTTextLine
|
||||
from .layout import LTTextBox
|
||||
from .layout import LTTextBoxVertical
|
||||
from .layout import LTTextGroup
|
||||
from .utils import apply_matrix_pt
|
||||
from .utils import mult_matrix
|
||||
from .utils import enc
|
||||
from .utils import bbox2str
|
||||
from . import utils
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that gathers rendered content into layout objects.

    Items produced while a page is rendered (characters, lines, rectangles,
    curves, figures, images) are added to ``self.cur_item``.  When the page
    ends, the finished ``LTPage`` is handed to :meth:`receive_layout`, which
    subclasses override to consume it.

    NOTE(review): several methods read ``self.ctm``; it is not assigned in
    this class and is presumably maintained by the base device -- confirm.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """
        :param rsrcmgr: resource manager forwarded to ``PDFTextDevice``.
        :param pageno: number given to the first page; incremented per page.
        :param laparams: layout-analysis parameters, or ``None`` to skip the
            ``analyze`` step in :meth:`end_page`.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack = []
        return

    def begin_page(self, page, ctm):
        """Start a new ``LTPage`` sized from the transformed media box."""
        (x0, y0, x1, y1) = page.mediabox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        # The page box is anchored at the origin; only its extent is kept.
        mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
        self.cur_item = LTPage(self.pageno, mediabox)
        return

    def end_page(self, page):
        """Finish the current page, optionally analyse it, and publish it."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)
        return

    def begin_figure(self, name, bbox, matrix):
        """Suspend the current container and start collecting into a figure."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        return

    def end_figure(self, _):
        """Close the current figure and add it to the container it suspended."""
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return

    def render_image(self, name, stream):
        """Add an ``LTImage`` covering the bounding box of the current figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        item = LTImage(name, stream,
                       (self.cur_item.x0, self.cur_item.y0,
                        self.cur_item.x1, self.cur_item.y1))
        self.cur_item.add(item)
        return

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Paint paths described in section 4.4 of the PDF reference manual.

        The first element of every path entry is its operator letter; the
        concatenated letters select which layout object is created:

        * more than one ``m``: each ``m...`` run is painted separately;
        * ``ml`` with equal x or equal y endpoints: an ``LTLine``;
        * ``mlllh`` forming an axis-aligned rectangle: an ``LTRect``;
        * anything else: an ``LTCurve`` built by :meth:`_create_curve`.
        """
        shape = ''.join(x[0] for x in path)

        if shape.count('m') > 1:
            # recurse if there are multiple m's in this shape
            for m in re.finditer(r'm[^m]+', shape):
                subpath = path[m.start(0):m.end(0)]
                self.paint_path(gstate, stroke, fill, evenodd, subpath)
            return

        if shape == 'ml':
            # single line segment
            (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
            (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
            if x0 == x1 or y0 == y1:
                item = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
                              fill, evenodd, gstate.scolor, gstate.ncolor)
            else:
                # A slanted segment used to be dropped without a trace; keep
                # it as a generic curve, exactly like a non-rectangular
                # 'mlllh' path below.
                item = self._create_curve(gstate, stroke, fill, evenodd,
                                          path)
            self.cur_item.add(item)

        elif shape == 'mlllh':
            (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
            (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
            (x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
            (x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])

            # Four corners visited in either winding order, with every edge
            # horizontal or vertical.
            if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
                    (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
                rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
                              fill, evenodd, gstate.scolor, gstate.ncolor)
                self.cur_item.add(rect)
            else:
                curve = self._create_curve(gstate, stroke, fill, evenodd,
                                           path)
                self.cur_item.add(curve)

        else:
            curve = self._create_curve(gstate, stroke, fill, evenodd, path)
            self.cur_item.add(curve)

    def _create_curve(self, gstate, stroke, fill, evenodd, path):
        """Create a `LTCurve` object for the paint path operator"""
        # Every path entry is (operator, x, y, x, y, ...): pair up the
        # coordinates after the operator and transform each point.
        pts = [
            apply_matrix_pt(self.ctm, point)
            for p in path
            for point in zip(p[1::2], p[2::2])
        ]
        curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
                        gstate.scolor, gstate.ncolor)
        return curve

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Add one ``LTChar`` for ``cid`` and return its advance (``item.adv``).

        When the font cannot map ``cid`` to text, the replacement string
        comes from :meth:`handle_undefined_char`.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
                      textdisp, ncs, graphicstate)
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font, cid):
        """Log the unmapped character and return a ``(cid:N)`` placeholder."""
        log.info('undefined: %r, %r', font, cid)
        return '(cid:%d)' % cid

    def receive_layout(self, ltpage):
        """Hook called with each finished page; does nothing in this class."""
        return
|
||||
|
||||
|
||||
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps hold of the last page it received."""

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Forward all arguments to the analyzer and start with no result."""
        PDFLayoutAnalyzer.__init__(
            self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.result = None

    def receive_layout(self, ltpage):
        """Store ``ltpage``, replacing any previously stored page."""
        self.result = ltpage

    def get_result(self):
        """Return the stored page, or ``None`` if none has been received."""
        return self.result
|
||||
|
||||
|
||||
class PDFConverter(PDFLayoutAnalyzer):
    """Layout analyzer bound to an output stream.

    Besides storing ``outfp`` and ``codec``, the constructor records in
    ``self.outfp_binary`` whether the stream expects bytes (``True``) or
    text (``False``).
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                 laparams=None):
        """
        :param rsrcmgr: resource manager forwarded to the analyzer.
        :param outfp: writable file-like object receiving the output.
        :param codec: encoding name stored for use by subclasses.
        :param pageno: number of the first page.
        :param laparams: layout-analysis parameters or ``None``.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
                                   laparams=laparams)
        self.outfp = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(outfp)
        return

    @staticmethod
    def _is_binary_stream(outfp):
        """Return True when ``outfp`` expects bytes rather than text."""
        import io

        mode = getattr(outfp, 'mode', None)
        if isinstance(mode, str):
            return 'b' in mode
        # A missing or non-string ``mode`` says nothing useful; previously a
        # non-string value made the ``in`` test itself raise TypeError.
        if isinstance(outfp, io.BytesIO):
            return True
        if isinstance(outfp, (io.StringIO, io.TextIOBase)):
            return False
        try:
            # Probe with an empty string: a bytes-only stream still rejects
            # it with TypeError, but nothing is emitted into the caller's
            # output (the old probe wrote a literal "é") and no encoding
            # error can occur.
            outfp.write('')
        except TypeError:
            return True
        return False
|
||||
|
||||
|
||||
class TextConverter(PDFConverter):
    """Converter that writes the text of each page to the output stream."""

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False, imagewriter=None):
        """
        :param showpageno: when true, a ``Page N`` line precedes each page.
        :param imagewriter: optional object whose ``export_image`` is called
            for every image item; ``None`` disables image handling.

        The remaining arguments are forwarded to ``PDFConverter``.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno,
            laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text):
        """Write ``text`` to the output, as bytes if the stream is binary."""
        converted = utils.compatible_encode_method(text, self.codec, 'ignore')
        payload = converted.encode() if self.outfp_binary else converted
        self.outfp.write(payload)

    def receive_layout(self, ltpage):
        """Write one page: optional header, its text, then a form feed."""

        def emit(node):
            # Containers are walked recursively; text leaves are written.
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())
            # Separate check: a text box gets a trailing newline, while an
            # image is exported only when a writer was supplied.
            if isinstance(node, LTTextBox):
                self.write_text('\n')
            elif isinstance(node, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text('Page %s\n' % ltpage.pageid)
        emit(ltpage)
        self.write_text('\f')

    # The two overrides below avoid recording image and drawing items when
    # only text is wanted, which saves memory and CPU.
    def render_image(self, name, stream):
        """Record the image only when an image writer is configured."""
        if self.imagewriter is not None:
            PDFConverter.render_image(self, name, stream)

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Ignore all path painting."""
        return
|
||||
|
||||
|
||||
class HTMLConverter(PDFConverter):
    """Converter that renders pages as absolutely positioned HTML.

    ``rect_colors`` and ``text_colors`` map item kinds (``'page'``,
    ``'curve'``, ``'char'``, ...) to CSS colours; a kind missing from the
    mapping is not drawn by :meth:`place_rect` / :meth:`place_text`.
    """

    # Colours merged into the configured ones when ``debug`` is set.
    RECT_COLORS = {
        'figure': 'yellow',
        'textline': 'magenta',
        'textbox': 'cyan',
        'textgroup': 'red',
        'curve': 'black',
        'page': 'gray',
    }

    TEXT_COLORS = {
        'textbox': 'blue',
        'char': 'black',
    }

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
                 pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
                 text_colors=None):
        """
        :param scale: factor applied to every position and size.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``'exact'`` places every item absolutely;
            ``'loose'`` suppresses the line break after text lines; any
            other value uses flowing text inside positioned boxes.
        :param showpageno: emit a ``Page N`` anchor at the top of each page.
        :param pagemargin: vertical gap before the first page and between
            pages.
        :param imagewriter: optional object whose ``export_image`` returns
            the name used for ``<img src>``; ``None`` skips images.
        :param debug: when truthy, the class-level colour tables are merged
            into the configured colours.
        :param rect_colors: border colours by item kind (default: curve and
            page only).
        :param text_colors: text colours by item kind (default: char only).

        The HTML header is written immediately.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        if text_colors is None:
            text_colors = {'char': 'black'}
        if rect_colors is None:
            rect_colors = {'curve': 'black', 'page': 'gray'}
        if debug:
            # Merge into copies: updating in place would silently modify
            # dictionaries owned by the caller.
            rect_colors = dict(rect_colors)
            rect_colors.update(self.RECT_COLORS)
            text_colors = dict(text_colors)
            text_colors.update(self.TEXT_COLORS)

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        # Running vertical offset of the current page's top edge.
        self._yoffset = self.pagemargin
        # (fontname, fontsize) of the currently open <span>, if any.
        self._font = None
        self._fontstack = []
        self.write_header()
        return

    def write(self, text):
        """Write markup to the output, encoded with ``codec`` when one is set."""
        if self.codec:
            text = text.encode(self.codec)
        if sys.version_info < (3, 0):
            text = str(text)
        self.outfp.write(text)
        return

    def write_header(self):
        """Write the opening HTML tags and the content-type meta element."""
        self.write('<html><head>\n')
        if self.codec:
            s = '<meta http-equiv="Content-Type" content="text/html; ' \
                'charset=%s">\n' % self.codec
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write('</head><body>\n')
        return

    def write_footer(self):
        """Write the page-link index and the closing HTML tags."""
        # ``pageno`` is already one past the last page rendered.
        page_links = ['<a href="#{}">{}</a>'.format(i, i)
                      for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
            ', '.join(page_links)
        self.write(s)
        self.write('</body></html>\n')
        return

    def write_text(self, text):
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))
        return

    def place_rect(self, color, borderwidth, x, y, w, h):
        """Draw a bordered box if ``color`` names a configured rect colour."""
        color = self.rect_colors.get(color)
        if color is not None:
            s = '<span style="position:absolute; border: %s %dpx solid; ' \
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
                (color, borderwidth, x * self.scale,
                 (self._yoffset - y) * self.scale, w * self.scale,
                 h * self.scale)
            self.write(s)
        return

    def place_border(self, color, borderwidth, item):
        """Draw a box around ``item`` using its own position and size."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
                        item.height)
        return

    def place_image(self, item, borderwidth, x, y, w, h):
        """Export ``item`` and emit an ``<img>``; no-op without a writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = '<img src="%s" border="%d" style="position:absolute; ' \
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
                (enc(name), borderwidth, x * self.scale,
                 (self._yoffset - y) * self.scale, w * self.scale,
                 h * self.scale)
            self.write(s)
        return

    def place_text(self, color, text, x, y, size):
        """Emit positioned text if ``color`` names a configured text colour."""
        color = self.text_colors.get(color)
        if color is not None:
            s = '<span style="position:absolute; color:%s; left:%dpx; ' \
                'top:%dpx; font-size:%dpx;">' % \
                (color, x * self.scale, (self._yoffset - y) * self.scale,
                 size * self.scale * self.fontscale)
            self.write(s)
            self.write_text(text)
            self.write('</span>\n')
        return

    def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
        """Open a positioned ``<div>`` and start a fresh font context."""
        # The enclosing font span state is restored by end_div().
        self._fontstack.append(self._font)
        self._font = None
        s = '<div style="position:absolute; border: %s %dpx solid; ' \
            'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \
            'height:%dpx;">' % \
            (color, borderwidth, writing_mode, x * self.scale,
             (self._yoffset - y) * self.scale, w * self.scale, h * self.scale)
        self.write(s)
        return

    def end_div(self, color):
        """Close any open font span, restore the outer font, close the div."""
        if self._font is not None:
            self.write('</span>')
        self._font = self._fontstack.pop()
        self.write('</div>')
        return

    def put_text(self, text, fontname, fontsize):
        """Write flowing text, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write('</span>')
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split('+')[-1]
            self.write('<span style="font-family: %s; font-size:%dpx">' %
                       (fontname_without_subset_tag,
                        fontsize * self.scale * self.fontscale))
            self._font = font
        self.write_text(text)
        return

    def put_newline(self):
        """Write a line break."""
        self.write('<br>')
        return

    def receive_layout(self, ltpage):
        """Render one analysed page and advance the vertical offset."""

        def show_group(item):
            # Outline text groups recursively; other items are ignored.
            if isinstance(item, LTTextGroup):
                self.place_border('textgroup', 1, item)
                for child in item:
                    show_group(child)
            return

        def render(item):
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border('page', 1, item)
                if self.showpageno:
                    self.write('<div style="position:absolute; top:%dpx;">' %
                               ((self._yoffset-item.y1)*self.scale))
                    self.write('<a name="{}">Page {}</a></div>\n'
                               .format(item.pageid, item.pageid))
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border('curve', 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div('figure', 1, item.x0, item.y1, item.width,
                               item.height)
                for child in item:
                    render(child)
                self.end_div('figure')
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width,
                                 item.height)
            else:
                if self.layoutmode == 'exact':
                    # Every line, box and character is placed absolutely.
                    if isinstance(item, LTTextLine):
                        self.place_border('textline', 1, item)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTTextBox):
                        self.place_border('textbox', 1, item)
                        self.place_text('textbox', str(item.index+1), item.x0,
                                        item.y1, 20)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTChar):
                        self.place_border('char', 1, item)
                        self.place_text('char', item.get_text(), item.x0,
                                        item.y1, item.size)
                else:
                    # Text flows inside one positioned div per text box.
                    if isinstance(item, LTTextLine):
                        for child in item:
                            render(child)
                        if self.layoutmode != 'loose':
                            self.put_newline()
                    elif isinstance(item, LTTextBox):
                        self.begin_div('textbox', 1, item.x0, item.y1,
                                       item.width, item.height,
                                       item.get_writing_mode())
                        for child in item:
                            render(child)
                        self.end_div('textbox')
                    elif isinstance(item, LTChar):
                        self.put_text(item.get_text(), item.fontname,
                                      item.size)
                    elif isinstance(item, LTText):
                        self.write_text(item.get_text())
            return

        render(ltpage)
        self._yoffset += self.pagemargin
        return

    def close(self):
        """Finish the document by writing the footer."""
        self.write_footer()
        return
|
||||
|
||||
|
||||
class XMLConverter(PDFConverter):
    """Converter that serialises every analysed page as XML elements."""

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """
        :param imagewriter: optional object whose ``export_image`` returns
            the value written to ``<image src>``.
        :param stripcontrol: remove characters matched by ``CONTROL`` from
            character text before writing it.

        The remaining arguments are forwarded to ``PDFConverter``.  The XML
        header is written immediately.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno,
            laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text):
        """Write markup to the output, encoded with ``codec`` when one is set."""
        payload = text.encode(self.codec) if self.codec else text
        self.outfp.write(payload)

    def write_header(self):
        """Write the XML declaration and open the ``<pages>`` element."""
        if self.codec:
            declaration = \
                '<?xml version="1.0" encoding="%s" ?>\n' % self.codec
        else:
            declaration = '<?xml version="1.0" ?>\n'
        self.write(declaration)
        self.write('<pages>\n')

    def write_footer(self):
        """Close the ``<pages>`` element."""
        self.write('</pages>\n')

    def write_text(self, text):
        """Write character text, optionally stripped of control characters."""
        if self.stripcontrol:
            text = self.CONTROL.sub('', text)
        self.write(enc(text))

    def receive_layout(self, ltpage):
        """Serialise one analysed page, including its layout groups."""

        def emit_group(node):
            # Text boxes become empty reference elements; groups nest.
            if isinstance(node, LTTextBox):
                self.write('<textbox id="%d" bbox="%s" />\n' %
                           (node.index, bbox2str(node.bbox)))
            elif isinstance(node, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(node.bbox))
                for member in node:
                    emit_group(member)
                self.write('</textgroup>\n')

        def emit(node):
            # LTLine and LTRect are tested before the more general LTCurve.
            if isinstance(node, LTPage):
                opening = '<page id="%s" bbox="%s" rotate="%d">\n' % \
                    (node.pageid, bbox2str(node.bbox), node.rotate)
                self.write(opening)
                for member in node:
                    emit(member)
                if node.groups is not None:
                    self.write('<layout>\n')
                    for group in node.groups:
                        emit_group(group)
                    self.write('</layout>\n')
                self.write('</page>\n')
            elif isinstance(node, LTLine):
                self.write('<line linewidth="%d" bbox="%s" />\n' %
                           (node.linewidth, bbox2str(node.bbox)))
            elif isinstance(node, LTRect):
                self.write('<rect linewidth="%d" bbox="%s" />\n' %
                           (node.linewidth, bbox2str(node.bbox)))
            elif isinstance(node, LTCurve):
                self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                           (node.linewidth, bbox2str(node.bbox),
                            node.get_pts()))
            elif isinstance(node, LTFigure):
                self.write('<figure name="%s" bbox="%s">\n' %
                           (node.name, bbox2str(node.bbox)))
                for member in node:
                    emit(member)
                self.write('</figure>\n')
            elif isinstance(node, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(node.bbox))
                for member in node:
                    emit(member)
                self.write('</textline>\n')
            elif isinstance(node, LTTextBox):
                if isinstance(node, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                else:
                    wmode = ''
                self.write('<textbox id="%d" bbox="%s"%s>\n' %
                           (node.index, bbox2str(node.bbox), wmode))
                for member in node:
                    emit(member)
                self.write('</textbox>\n')
            elif isinstance(node, LTChar):
                opening = '<text font="%s" bbox="%s" colourspace="%s" ' \
                    'ncolour="%s" size="%.3f">' % \
                    (enc(node.fontname), bbox2str(node.bbox),
                     node.ncs.name, node.graphicstate.ncolor, node.size)
                self.write(opening)
                self.write_text(node.get_text())
                self.write('</text>\n')
            elif isinstance(node, LTText):
                self.write('<text>%s</text>\n' % node.get_text())
            elif isinstance(node, LTImage):
                if self.imagewriter is None:
                    self.write('<image width="%d" height="%d" />\n' %
                               (node.width, node.height))
                else:
                    name = self.imagewriter.export_image(node)
                    self.write('<image src="%s" width="%d" height="%d" />\n' %
                               (enc(name), node.width, node.height))
            else:
                assert False, str(('Unhandled', node))

        emit(ltpage)

    def close(self):
        """Finish the document by writing the footer."""
        self.write_footer()
|
||||
@@ -1,112 +0,0 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
from .psparser import PSLiteral
|
||||
|
||||
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of
    returning an empty string when the key is unknown.
    This way the caller must explicitly define what to do
    when there is not a match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    Resolution order: everything after the first ``.`` is dropped; a name
    containing ``_`` is split and each component converted recursively; a
    name found in ``glyphname2unicode`` is looked up; ``uni`` followed by
    groups of four hex digits, or ``u`` followed by four to six hex digits,
    is decoded numerically.

    :param name: glyph name to convert.
    :returns: the unicode character(s) the name stands for.
    :raises KeyError: if the name does not match any of the forms above, or
        encodes a value rejected by ``raise_key_error_for_invalid_unicode``.
    """
    name = name.split('.')[0]
    components = name.split('_')

    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode.get(name)

    if name.startswith('uni'):
        # Slice the prefix off.  ``str.strip('uni')`` removes any run of the
        # characters u/n/i from *both* ends, so names such as 'uniuni0041'
        # were wrongly accepted.
        hex_digits = name[len('uni'):]

        # fullmatch: a prefix-only match let trailing garbage through to
        # int(), which raised ValueError instead of the documented KeyError.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [int(hex_digits[i:i + 4], base=16)
                              for i in range(0, len(hex_digits), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return ''.join(map(chr, unicode_digits))

    elif name.startswith('u'):
        hex_digits = name[1:]

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            return chr(unicode_digit)

    raise KeyError('Could not convert unicode name "%s" to character because '
                   'it does not match specification' % name)
|
||||
|
||||
|
||||
def raise_key_error_for_invalid_unicode(unicode_digit):
    """Reject integers that are not usable Unicode code points.

    Unicode values should not be in the range D800 through DFFF because
    that is used for surrogate pairs in UTF-16.  Values outside
    0 through 10FFFF are rejected as well: ``chr`` cannot represent them
    and would otherwise fail with a ValueError.

    :param unicode_digit: integer code point to validate.
    :raises KeyError if unicode digit is invalid
    """
    if 0xD7FF < unicode_digit < 0xE000:
        raise KeyError('Unicode digit %d is invalid because '
                       'it is in the range D800 through DFFF' % unicode_digit)
    if not 0 <= unicode_digit <= 0x10FFFF:
        raise KeyError('Unicode digit %d is invalid because '
                       'it is outside the range 0 through 10FFFF'
                       % unicode_digit)
|
||||
|
||||
|
||||
class EncodingDB:
    """Character-code to unicode tables for the four predefined encodings.

    The tables are filled once, while the class body executes, from the
    rows of ``ENCODING``.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # Each row carries a glyph name plus its code in every encoding; a falsy
    # code means the glyph is not registered for that encoding.
    for (name, std, mac, win, pdf) in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(cls, name, diff=None):
        """Return the code-to-unicode table for ``name``.

        :param name: encoding name; unknown names select the standard table.
        :param diff: optional sequence of overrides.  An ``int`` sets the
            next code; each ``PSLiteral`` maps the current code to the
            character for its glyph name and then advances the code.
        :returns: the shared table itself when ``diff`` is empty or absent,
            otherwise a modified copy.
        """
        table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            return table

        patched = table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(entry.name)
            except (KeyError, ValueError) as e:
                # Unconvertible glyph names are logged and skipped, but the
                # code still advances.
                log.debug(str(e))
            code += 1
        return patched
|
||||
@@ -1,46 +0,0 @@
|
||||
""" Font metrics for the Adobe core 14 fonts.
|
||||
|
||||
Font metrics are used to compute the boundary of each character
|
||||
written with a proportional font.
|
||||
|
||||
The following data were extracted from the AFM files:
|
||||
|
||||
http://www.ctan.org/tex-archive/fonts/adobe/afm/
|
||||
|
||||
"""
|
||||
|
||||
### BEGIN Verbatim copy of the license part
|
||||
|
||||
#
|
||||
# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
|
||||
#
|
||||
# This file and the 35 PostScript(R) AFM files it accompanies may be
|
||||
# used, copied, and distributed for any purpose and without charge,
|
||||
# with or without modification, provided that all copyright notices
|
||||
# are retained; that the AFM files are not distributed without this
|
||||
# file; that all modifications to this file or any of the AFM files
|
||||
# are prominently noted in the modified file(s); and that this
|
||||
# paragraph is not modified. Adobe Systems has no responsibility or
|
||||
# obligation to support the use of the AFM files.
|
||||
#
|
||||
|
||||
### END Verbatim copy of the license part
|
||||
|
||||
# flake8: noqa
|
||||
|
||||
FONT_METRICS = {
|
||||
'Courier': ({'FontName': 'Courier', 'Descent': -194.0, 'FontBBox': (-6.0, -249.0, 639.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, '\xdf': 600, 
'\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 600, 
'\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
|
||||
'Courier-Bold': ({'FontName': 'Courier-Bold', 'Descent': -194.0, 'FontBBox': (-88.0, -249.0, 697.0, 811.0), 'FontWeight': 'Bold', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, '\xdf': 
600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 600, 
'\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
|
||||
'Courier-BoldOblique': ({'FontName': 'Courier-BoldOblique', 'Descent': -194.0, 'FontBBox': (-49.0, -249.0, 758.0, 811.0), 'FontWeight': 'Bold', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': -11.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 
600, '\xdf': 600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, 
'\u2020': 600, '\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
|
||||
'Courier-Oblique': ({'FontName': 'Courier-Oblique', 'Descent': -194.0, 'FontBBox': (-49.0, -249.0, 749.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': -11.0, 'Ascent': 627.0}, {' ': 600, '!': 600, '"': 600, '#': 600, '$': 600, '%': 600, '&': 600, "'": 600, '(': 600, ')': 600, '*': 600, '+': 600, ',': 600, '-': 600, '.': 600, '/': 600, '0': 600, '1': 600, '2': 600, '3': 600, '4': 600, '5': 600, '6': 600, '7': 600, '8': 600, '9': 600, ':': 600, ';': 600, '<': 600, '=': 600, '>': 600, '?': 600, '@': 600, 'A': 600, 'B': 600, 'C': 600, 'D': 600, 'E': 600, 'F': 600, 'G': 600, 'H': 600, 'I': 600, 'J': 600, 'K': 600, 'L': 600, 'M': 600, 'N': 600, 'O': 600, 'P': 600, 'Q': 600, 'R': 600, 'S': 600, 'T': 600, 'U': 600, 'V': 600, 'W': 600, 'X': 600, 'Y': 600, 'Z': 600, '[': 600, '\\': 600, ']': 600, '^': 600, '_': 600, '`': 600, 'a': 600, 'b': 600, 'c': 600, 'd': 600, 'e': 600, 'f': 600, 'g': 600, 'h': 600, 'i': 600, 'j': 600, 'k': 600, 'l': 600, 'm': 600, 'n': 600, 'o': 600, 'p': 600, 'q': 600, 'r': 600, 's': 600, 't': 600, 'u': 600, 'v': 600, 'w': 600, 'x': 600, 'y': 600, 'z': 600, '{': 600, '|': 600, '}': 600, '~': 600, '\xa1': 600, '\xa2': 600, '\xa3': 600, '\xa4': 600, '\xa5': 600, '\xa6': 600, '\xa7': 600, '\xa8': 600, '\xa9': 600, '\xaa': 600, '\xab': 600, '\xac': 600, '\xae': 600, '\xaf': 600, '\xb0': 600, '\xb1': 600, '\xb2': 600, '\xb3': 600, '\xb4': 600, '\xb5': 600, '\xb6': 600, '\xb7': 600, '\xb8': 600, '\xb9': 600, '\xba': 600, '\xbb': 600, '\xbc': 600, '\xbd': 600, '\xbe': 600, '\xbf': 600, '\xc0': 600, '\xc1': 600, '\xc2': 600, '\xc3': 600, '\xc4': 600, '\xc5': 600, '\xc6': 600, '\xc7': 600, '\xc8': 600, '\xc9': 600, '\xca': 600, '\xcb': 600, '\xcc': 600, '\xcd': 600, '\xce': 600, '\xcf': 600, '\xd0': 600, '\xd1': 600, '\xd2': 600, '\xd3': 600, '\xd4': 600, '\xd5': 600, '\xd6': 600, '\xd7': 600, '\xd8': 600, '\xd9': 600, '\xda': 600, '\xdb': 600, '\xdc': 600, '\xdd': 600, '\xde': 600, 
'\xdf': 600, '\xe0': 600, '\xe1': 600, '\xe2': 600, '\xe3': 600, '\xe4': 600, '\xe5': 600, '\xe6': 600, '\xe7': 600, '\xe8': 600, '\xe9': 600, '\xea': 600, '\xeb': 600, '\xec': 600, '\xed': 600, '\xee': 600, '\xef': 600, '\xf0': 600, '\xf1': 600, '\xf2': 600, '\xf3': 600, '\xf4': 600, '\xf5': 600, '\xf6': 600, '\xf7': 600, '\xf8': 600, '\xf9': 600, '\xfa': 600, '\xfb': 600, '\xfc': 600, '\xfd': 600, '\xfe': 600, '\xff': 600, '\u0100': 600, '\u0101': 600, '\u0102': 600, '\u0103': 600, '\u0104': 600, '\u0105': 600, '\u0106': 600, '\u0107': 600, '\u010c': 600, '\u010d': 600, '\u010e': 600, '\u010f': 600, '\u0110': 600, '\u0111': 600, '\u0112': 600, '\u0113': 600, '\u0116': 600, '\u0117': 600, '\u0118': 600, '\u0119': 600, '\u011a': 600, '\u011b': 600, '\u011e': 600, '\u011f': 600, '\u0122': 600, '\u0123': 600, '\u012a': 600, '\u012b': 600, '\u012e': 600, '\u012f': 600, '\u0130': 600, '\u0131': 600, '\u0136': 600, '\u0137': 600, '\u0139': 600, '\u013a': 600, '\u013b': 600, '\u013c': 600, '\u013d': 600, '\u013e': 600, '\u0141': 600, '\u0142': 600, '\u0143': 600, '\u0144': 600, '\u0145': 600, '\u0146': 600, '\u0147': 600, '\u0148': 600, '\u014c': 600, '\u014d': 600, '\u0150': 600, '\u0151': 600, '\u0152': 600, '\u0153': 600, '\u0154': 600, '\u0155': 600, '\u0156': 600, '\u0157': 600, '\u0158': 600, '\u0159': 600, '\u015a': 600, '\u015b': 600, '\u015e': 600, '\u015f': 600, '\u0160': 600, '\u0161': 600, '\u0162': 600, '\u0163': 600, '\u0164': 600, '\u0165': 600, '\u016a': 600, '\u016b': 600, '\u016e': 600, '\u016f': 600, '\u0170': 600, '\u0171': 600, '\u0172': 600, '\u0173': 600, '\u0178': 600, '\u0179': 600, '\u017a': 600, '\u017b': 600, '\u017c': 600, '\u017d': 600, '\u017e': 600, '\u0192': 600, '\u0218': 600, '\u0219': 600, '\u02c6': 600, '\u02c7': 600, '\u02d8': 600, '\u02d9': 600, '\u02da': 600, '\u02db': 600, '\u02dc': 600, '\u02dd': 600, '\u2013': 600, '\u2014': 600, '\u2018': 600, '\u2019': 600, '\u201a': 600, '\u201c': 600, '\u201d': 600, '\u201e': 600, '\u2020': 
600, '\u2021': 600, '\u2022': 600, '\u2026': 600, '\u2030': 600, '\u2039': 600, '\u203a': 600, '\u2044': 600, '\u2122': 600, '\u2202': 600, '\u2206': 600, '\u2211': 600, '\u2212': 600, '\u221a': 600, '\u2260': 600, '\u2264': 600, '\u2265': 600, '\u25ca': 600, '\uf6c3': 600, '\ufb01': 600, '\ufb02': 600}),
|
||||
'Helvetica': ({'FontName': 'Helvetica', 'Descent': -207.0, 'FontBBox': (-166.0, -225.0, 1000.0, 931.0), 'FontWeight': 'Medium', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 523.0, 'ItalicAngle': 0.0, 'Ascent': 718.0}, {' ': 278, '!': 278, '"': 355, '#': 556, '$': 556, '%': 889, '&': 667, "'": 191, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 278, ';': 278, '<': 584, '=': 584, '>': 584, '?': 556, '@': 1015, 'A': 667, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 500, 'K': 667, 'L': 556, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 278, '\\': 278, ']': 278, '^': 469, '_': 556, '`': 333, 'a': 556, 'b': 556, 'c': 500, 'd': 556, 'e': 556, 'f': 278, 'g': 556, 'h': 556, 'i': 222, 'j': 222, 'k': 500, 'l': 222, 'm': 833, 'n': 556, 'o': 556, 'p': 556, 'q': 556, 'r': 333, 's': 500, 't': 278, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 500, '{': 334, '|': 260, '}': 334, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 260, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 556, '\xb6': 537, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, '\xde': 667, '\xdf': 
611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 500, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 556, '\xf1': 556, '\xf2': 556, '\xf3': 556, '\xf4': 556, '\xf5': 556, '\xf6': 556, '\xf7': 584, '\xf8': 611, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 667, '\u0101': 556, '\u0102': 667, '\u0103': 556, '\u0104': 667, '\u0105': 556, '\u0106': 722, '\u0107': 500, '\u010c': 722, '\u010d': 500, '\u010e': 722, '\u010f': 643, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 556, '\u0122': 778, '\u0123': 556, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 222, '\u0130': 278, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 556, '\u013a': 222, '\u013b': 556, '\u013c': 222, '\u013d': 556, '\u013e': 299, '\u0141': 556, '\u0142': 222, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 556, '\u0150': 778, '\u0151': 556, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 333, '\u0156': 722, '\u0157': 333, '\u0158': 722, '\u0159': 333, '\u015a': 667, '\u015b': 500, '\u015e': 667, '\u015f': 500, '\u0160': 667, '\u0161': 500, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 317, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 500, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 222, '\u2019': 222, '\u201a': 222, '\u201c': 333, '\u201d': 333, '\u201e': 333, '\u2020': 556, 
'\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
|
||||
'Helvetica-Bold': ({'FontName': 'Helvetica-Bold', 'Descent': -207.0, 'FontBBox': (-170.0, -228.0, 1003.0, 962.0), 'FontWeight': 'Bold', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 532.0, 'ItalicAngle': 0.0, 'Ascent': 718.0}, {' ': 278, '!': 333, '"': 474, '#': 556, '$': 556, '%': 889, '&': 722, "'": 238, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 333, ';': 333, '<': 584, '=': 584, '>': 584, '?': 611, '@': 975, 'A': 722, 'B': 722, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 556, 'K': 722, 'L': 611, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 584, '_': 556, '`': 333, 'a': 556, 'b': 611, 'c': 556, 'd': 611, 'e': 556, 'f': 333, 'g': 611, 'h': 611, 'i': 278, 'j': 278, 'k': 556, 'l': 278, 'm': 889, 'n': 611, 'o': 611, 'p': 611, 'q': 611, 'r': 389, 's': 556, 't': 333, 'u': 611, 'v': 556, 'w': 778, 'x': 556, 'y': 556, 'z': 500, '{': 389, '|': 280, '}': 389, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 280, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 611, '\xb6': 556, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, '\xde': 667, 
'\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 556, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 611, '\xf1': 611, '\xf2': 611, '\xf3': 611, '\xf4': 611, '\xf5': 611, '\xf6': 611, '\xf7': 584, '\xf8': 611, '\xf9': 611, '\xfa': 611, '\xfb': 611, '\xfc': 611, '\xfd': 556, '\xfe': 611, '\xff': 556, '\u0100': 722, '\u0101': 556, '\u0102': 722, '\u0103': 556, '\u0104': 722, '\u0105': 556, '\u0106': 722, '\u0107': 556, '\u010c': 722, '\u010d': 556, '\u010e': 722, '\u010f': 743, '\u0110': 722, '\u0111': 611, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 611, '\u0122': 778, '\u0123': 611, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 278, '\u0130': 278, '\u0131': 278, '\u0136': 722, '\u0137': 556, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 400, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 611, '\u0145': 722, '\u0146': 611, '\u0147': 722, '\u0148': 611, '\u014c': 778, '\u014d': 611, '\u0150': 778, '\u0151': 611, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 389, '\u0156': 722, '\u0157': 389, '\u0158': 722, '\u0159': 389, '\u015a': 667, '\u015b': 556, '\u015e': 667, '\u015f': 556, '\u0160': 667, '\u0161': 556, '\u0162': 611, '\u0163': 333, '\u0164': 611, '\u0165': 389, '\u016a': 722, '\u016b': 611, '\u016e': 722, '\u016f': 611, '\u0170': 722, '\u0171': 611, '\u0172': 722, '\u0173': 611, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 556, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 278, '\u2019': 278, '\u201a': 278, '\u201c': 500, '\u201d': 500, '\u201e': 500, 
'\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 611, '\ufb02': 611}),
|
||||
'Helvetica-BoldOblique': ({'FontName': 'Helvetica-BoldOblique', 'Descent': -207.0, 'FontBBox': (-175.0, -228.0, 1114.0, 962.0), 'FontWeight': 'Bold', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 532.0, 'ItalicAngle': -12.0, 'Ascent': 718.0}, {' ': 278, '!': 333, '"': 474, '#': 556, '$': 556, '%': 889, '&': 722, "'": 238, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 333, ';': 333, '<': 584, '=': 584, '>': 584, '?': 611, '@': 975, 'A': 722, 'B': 722, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 556, 'K': 722, 'L': 611, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 584, '_': 556, '`': 333, 'a': 556, 'b': 611, 'c': 556, 'd': 611, 'e': 556, 'f': 333, 'g': 611, 'h': 611, 'i': 278, 'j': 278, 'k': 556, 'l': 278, 'm': 889, 'n': 611, 'o': 611, 'p': 611, 'q': 611, 'r': 389, 's': 556, 't': 333, 'u': 611, 'v': 556, 'w': 778, 'x': 556, 'y': 556, 'z': 500, '{': 389, '|': 280, '}': 389, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 280, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 611, '\xb6': 556, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, 
'\xde': 667, '\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 556, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 611, '\xf1': 611, '\xf2': 611, '\xf3': 611, '\xf4': 611, '\xf5': 611, '\xf6': 611, '\xf7': 584, '\xf8': 611, '\xf9': 611, '\xfa': 611, '\xfb': 611, '\xfc': 611, '\xfd': 556, '\xfe': 611, '\xff': 556, '\u0100': 722, '\u0101': 556, '\u0102': 722, '\u0103': 556, '\u0104': 722, '\u0105': 556, '\u0106': 722, '\u0107': 556, '\u010c': 722, '\u010d': 556, '\u010e': 722, '\u010f': 743, '\u0110': 722, '\u0111': 611, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 611, '\u0122': 778, '\u0123': 611, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 278, '\u0130': 278, '\u0131': 278, '\u0136': 722, '\u0137': 556, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 400, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 611, '\u0145': 722, '\u0146': 611, '\u0147': 722, '\u0148': 611, '\u014c': 778, '\u014d': 611, '\u0150': 778, '\u0151': 611, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 389, '\u0156': 722, '\u0157': 389, '\u0158': 722, '\u0159': 389, '\u015a': 667, '\u015b': 556, '\u015e': 667, '\u015f': 556, '\u0160': 667, '\u0161': 556, '\u0162': 611, '\u0163': 333, '\u0164': 611, '\u0165': 389, '\u016a': 722, '\u016b': 611, '\u016e': 722, '\u016f': 611, '\u0170': 722, '\u0171': 611, '\u0172': 722, '\u0173': 611, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 556, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 278, '\u2019': 278, '\u201a': 278, '\u201c': 500, '\u201d': 500, '\u201e': 
500, '\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 611, '\ufb02': 611}),
|
||||
'Helvetica-Oblique': ({'FontName': 'Helvetica-Oblique', 'Descent': -207.0, 'FontBBox': (-171.0, -225.0, 1116.0, 931.0), 'FontWeight': 'Medium', 'CapHeight': 718.0, 'FontFamily': 'Helvetica', 'Flags': 0, 'XHeight': 523.0, 'ItalicAngle': -12.0, 'Ascent': 718.0}, {' ': 278, '!': 278, '"': 355, '#': 556, '$': 556, '%': 889, '&': 667, "'": 191, '(': 333, ')': 333, '*': 389, '+': 584, ',': 278, '-': 333, '.': 278, '/': 278, '0': 556, '1': 556, '2': 556, '3': 556, '4': 556, '5': 556, '6': 556, '7': 556, '8': 556, '9': 556, ':': 278, ';': 278, '<': 584, '=': 584, '>': 584, '?': 556, '@': 1015, 'A': 667, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 722, 'I': 278, 'J': 500, 'K': 667, 'L': 556, 'M': 833, 'N': 722, 'O': 778, 'P': 667, 'Q': 778, 'R': 722, 'S': 667, 'T': 611, 'U': 722, 'V': 667, 'W': 944, 'X': 667, 'Y': 667, 'Z': 611, '[': 278, '\\': 278, ']': 278, '^': 469, '_': 556, '`': 333, 'a': 556, 'b': 556, 'c': 500, 'd': 556, 'e': 556, 'f': 278, 'g': 556, 'h': 556, 'i': 222, 'j': 222, 'k': 500, 'l': 222, 'm': 833, 'n': 556, 'o': 556, 'p': 556, 'q': 556, 'r': 333, 's': 500, 't': 278, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 500, '{': 334, '|': 260, '}': 334, '~': 584, '\xa1': 333, '\xa2': 556, '\xa3': 556, '\xa4': 556, '\xa5': 556, '\xa6': 260, '\xa7': 556, '\xa8': 333, '\xa9': 737, '\xaa': 370, '\xab': 556, '\xac': 584, '\xae': 737, '\xaf': 333, '\xb0': 400, '\xb1': 584, '\xb2': 333, '\xb3': 333, '\xb4': 333, '\xb5': 556, '\xb6': 537, '\xb7': 278, '\xb8': 333, '\xb9': 333, '\xba': 365, '\xbb': 556, '\xbc': 834, '\xbd': 834, '\xbe': 834, '\xbf': 611, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 278, '\xcd': 278, '\xce': 278, '\xcf': 278, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 584, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 667, 
'\xde': 667, '\xdf': 611, '\xe0': 556, '\xe1': 556, '\xe2': 556, '\xe3': 556, '\xe4': 556, '\xe5': 556, '\xe6': 889, '\xe7': 500, '\xe8': 556, '\xe9': 556, '\xea': 556, '\xeb': 556, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 556, '\xf1': 556, '\xf2': 556, '\xf3': 556, '\xf4': 556, '\xf5': 556, '\xf6': 556, '\xf7': 584, '\xf8': 611, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 667, '\u0101': 556, '\u0102': 667, '\u0103': 556, '\u0104': 667, '\u0105': 556, '\u0106': 722, '\u0107': 500, '\u010c': 722, '\u010d': 500, '\u010e': 722, '\u010f': 643, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 556, '\u0116': 667, '\u0117': 556, '\u0118': 667, '\u0119': 556, '\u011a': 667, '\u011b': 556, '\u011e': 778, '\u011f': 556, '\u0122': 778, '\u0123': 556, '\u012a': 278, '\u012b': 278, '\u012e': 278, '\u012f': 222, '\u0130': 278, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 556, '\u013a': 222, '\u013b': 556, '\u013c': 222, '\u013d': 556, '\u013e': 299, '\u0141': 556, '\u0142': 222, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 556, '\u0150': 778, '\u0151': 556, '\u0152': 1000, '\u0153': 944, '\u0154': 722, '\u0155': 333, '\u0156': 722, '\u0157': 333, '\u0158': 722, '\u0159': 333, '\u015a': 667, '\u015b': 500, '\u015e': 667, '\u015f': 500, '\u0160': 667, '\u0161': 500, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 317, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 667, '\u0179': 611, '\u017a': 500, '\u017b': 611, '\u017c': 500, '\u017d': 611, '\u017e': 500, '\u0192': 556, '\u0218': 667, '\u0219': 500, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 556, '\u2014': 1000, '\u2018': 222, '\u2019': 222, '\u201a': 222, '\u201c': 333, '\u201d': 333, '\u201e': 
333, '\u2020': 556, '\u2021': 556, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 584, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
|
||||
'Symbol': ({'FontName': 'Symbol', 'FontBBox': (-180.0, -293.0, 1090.0, 1010.0), 'FontWeight': 'Medium', 'FontFamily': 'Symbol', 'Flags': 0, 'ItalicAngle': 0.0}, {' ': 250, '!': 333, '#': 500, '%': 833, '&': 778, '(': 333, ')': 333, '+': 549, ',': 250, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 278, ';': 278, '<': 549, '=': 549, '>': 549, '?': 444, '[': 333, ']': 333, '_': 500, '{': 480, '|': 200, '}': 480, '\xac': 713, '\xb0': 400, '\xb1': 549, '\xb5': 576, '\xd7': 549, '\xf7': 549, '\u0192': 500, '\u0391': 722, '\u0392': 667, '\u0393': 603, '\u0395': 611, '\u0396': 611, '\u0397': 722, '\u0398': 741, '\u0399': 333, '\u039a': 722, '\u039b': 686, '\u039c': 889, '\u039d': 722, '\u039e': 645, '\u039f': 722, '\u03a0': 768, '\u03a1': 556, '\u03a3': 592, '\u03a4': 611, '\u03a5': 690, '\u03a6': 763, '\u03a7': 722, '\u03a8': 795, '\u03b1': 631, '\u03b2': 549, '\u03b3': 411, '\u03b4': 494, '\u03b5': 439, '\u03b6': 494, '\u03b7': 603, '\u03b8': 521, '\u03b9': 329, '\u03ba': 549, '\u03bb': 549, '\u03bd': 521, '\u03be': 493, '\u03bf': 549, '\u03c0': 549, '\u03c1': 549, '\u03c2': 439, '\u03c3': 603, '\u03c4': 439, '\u03c5': 576, '\u03c6': 521, '\u03c7': 549, '\u03c8': 686, '\u03c9': 686, '\u03d1': 631, '\u03d2': 620, '\u03d5': 603, '\u03d6': 713, '\u2022': 460, '\u2026': 1000, '\u2032': 247, '\u2033': 411, '\u2044': 167, '\u20ac': 750, '\u2111': 686, '\u2118': 987, '\u211c': 795, '\u2126': 768, '\u2135': 823, '\u2190': 987, '\u2191': 603, '\u2192': 987, '\u2193': 603, '\u2194': 1042, '\u21b5': 658, '\u21d0': 987, '\u21d1': 603, '\u21d2': 987, '\u21d3': 603, '\u21d4': 1042, '\u2200': 713, '\u2202': 494, '\u2203': 549, '\u2205': 823, '\u2206': 612, '\u2207': 713, '\u2208': 713, '\u2209': 713, '\u220b': 439, '\u220f': 823, '\u2211': 713, '\u2212': 549, '\u2217': 500, '\u221a': 549, '\u221d': 713, '\u221e': 713, '\u2220': 768, '\u2227': 603, '\u2228': 603, '\u2229': 768, '\u222a': 768, '\u222b': 274, 
'\u2234': 863, '\u223c': 549, '\u2245': 549, '\u2248': 549, '\u2260': 549, '\u2261': 549, '\u2264': 549, '\u2265': 549, '\u2282': 713, '\u2283': 713, '\u2284': 713, '\u2286': 713, '\u2287': 713, '\u2295': 768, '\u2297': 768, '\u22a5': 658, '\u22c5': 250, '\u2320': 686, '\u2321': 686, '\u2329': 329, '\u232a': 329, '\u25ca': 494, '\u2660': 753, '\u2663': 753, '\u2665': 753, '\u2666': 753, '\uf6d9': 790, '\uf6da': 790, '\uf6db': 890, '\uf8e5': 500, '\uf8e6': 603, '\uf8e7': 1000, '\uf8e8': 790, '\uf8e9': 790, '\uf8ea': 786, '\uf8eb': 384, '\uf8ec': 384, '\uf8ed': 384, '\uf8ee': 384, '\uf8ef': 384, '\uf8f0': 384, '\uf8f1': 494, '\uf8f2': 494, '\uf8f3': 494, '\uf8f4': 494, '\uf8f5': 686, '\uf8f6': 384, '\uf8f7': 384, '\uf8f8': 384, '\uf8f9': 384, '\uf8fa': 384, '\uf8fb': 384, '\uf8fc': 494, '\uf8fd': 494, '\uf8fe': 494, '\uf8ff': 790}),
|
||||
'Times-Bold': ({'FontName': 'Times-Bold', 'Descent': -217.0, 'FontBBox': (-168.0, -218.0, 1000.0, 935.0), 'FontWeight': 'Bold', 'CapHeight': 676.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 461.0, 'ItalicAngle': 0.0, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 555, '#': 500, '$': 500, '%': 1000, '&': 833, "'": 278, '(': 333, ')': 333, '*': 500, '+': 570, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 570, '=': 570, '>': 570, '?': 500, '@': 930, 'A': 722, 'B': 667, 'C': 722, 'D': 722, 'E': 667, 'F': 611, 'G': 778, 'H': 778, 'I': 389, 'J': 500, 'K': 778, 'L': 667, 'M': 944, 'N': 722, 'O': 778, 'P': 611, 'Q': 778, 'R': 722, 'S': 556, 'T': 667, 'U': 722, 'V': 722, 'W': 1000, 'X': 722, 'Y': 722, 'Z': 667, '[': 333, '\\': 278, ']': 333, '^': 581, '_': 500, '`': 333, 'a': 500, 'b': 556, 'c': 444, 'd': 556, 'e': 444, 'f': 333, 'g': 500, 'h': 556, 'i': 278, 'j': 333, 'k': 556, 'l': 278, 'm': 833, 'n': 556, 'o': 500, 'p': 556, 'q': 556, 'r': 444, 's': 389, 't': 333, 'u': 556, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 444, '{': 394, '|': 220, '}': 394, '~': 520, '\xa1': 333, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 220, '\xa7': 500, '\xa8': 333, '\xa9': 747, '\xaa': 300, '\xab': 500, '\xac': 570, '\xae': 747, '\xaf': 333, '\xb0': 400, '\xb1': 570, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 556, '\xb6': 540, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 330, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 1000, '\xc7': 722, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 389, '\xcd': 389, '\xce': 389, '\xcf': 389, '\xd0': 722, '\xd1': 722, '\xd2': 778, '\xd3': 778, '\xd4': 778, '\xd5': 778, '\xd6': 778, '\xd7': 570, '\xd8': 778, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 722, '\xde': 611, '\xdf': 556, 
'\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 722, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 556, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 570, '\xf8': 500, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 500, '\xfe': 556, '\xff': 500, '\u0100': 722, '\u0101': 500, '\u0102': 722, '\u0103': 500, '\u0104': 722, '\u0105': 500, '\u0106': 722, '\u0107': 444, '\u010c': 722, '\u010d': 444, '\u010e': 722, '\u010f': 672, '\u0110': 722, '\u0111': 556, '\u0112': 667, '\u0113': 444, '\u0116': 667, '\u0117': 444, '\u0118': 667, '\u0119': 444, '\u011a': 667, '\u011b': 444, '\u011e': 778, '\u011f': 500, '\u0122': 778, '\u0123': 500, '\u012a': 389, '\u012b': 278, '\u012e': 389, '\u012f': 278, '\u0130': 389, '\u0131': 278, '\u0136': 778, '\u0137': 556, '\u0139': 667, '\u013a': 278, '\u013b': 667, '\u013c': 278, '\u013d': 667, '\u013e': 394, '\u0141': 667, '\u0142': 278, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 778, '\u014d': 500, '\u0150': 778, '\u0151': 500, '\u0152': 1000, '\u0153': 722, '\u0154': 722, '\u0155': 444, '\u0156': 722, '\u0157': 444, '\u0158': 722, '\u0159': 444, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 667, '\u0163': 333, '\u0164': 667, '\u0165': 416, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 722, '\u0179': 667, '\u017a': 444, '\u017b': 667, '\u017c': 444, '\u017d': 667, '\u017e': 444, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 500, '\u201d': 500, '\u201e': 500, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 570, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
|
||||
'Times-BoldItalic': ({'FontName': 'Times-BoldItalic', 'Descent': -217.0, 'FontBBox': (-200.0, -218.0, 996.0, 921.0), 'FontWeight': 'Bold', 'CapHeight': 669.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 462.0, 'ItalicAngle': -15.0, 'Ascent': 683.0}, {' ': 250, '!': 389, '"': 555, '#': 500, '$': 500, '%': 833, '&': 778, "'": 278, '(': 333, ')': 333, '*': 500, '+': 570, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 570, '=': 570, '>': 570, '?': 500, '@': 832, 'A': 667, 'B': 667, 'C': 667, 'D': 722, 'E': 667, 'F': 667, 'G': 722, 'H': 778, 'I': 389, 'J': 500, 'K': 667, 'L': 611, 'M': 889, 'N': 722, 'O': 722, 'P': 611, 'Q': 722, 'R': 667, 'S': 556, 'T': 611, 'U': 722, 'V': 667, 'W': 889, 'X': 667, 'Y': 611, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 570, '_': 500, '`': 333, 'a': 500, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 333, 'g': 500, 'h': 556, 'i': 278, 'j': 278, 'k': 500, 'l': 278, 'm': 778, 'n': 556, 'o': 500, 'p': 500, 'q': 500, 'r': 389, 's': 389, 't': 278, 'u': 556, 'v': 444, 'w': 667, 'x': 500, 'y': 444, 'z': 389, '{': 348, '|': 220, '}': 348, '~': 570, '\xa1': 389, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 220, '\xa7': 500, '\xa8': 333, '\xa9': 747, '\xaa': 266, '\xab': 500, '\xac': 606, '\xae': 747, '\xaf': 333, '\xb0': 400, '\xb1': 570, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 576, '\xb6': 500, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 300, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 667, '\xc1': 667, '\xc2': 667, '\xc3': 667, '\xc4': 667, '\xc5': 667, '\xc6': 944, '\xc7': 667, '\xc8': 667, '\xc9': 667, '\xca': 667, '\xcb': 667, '\xcc': 389, '\xcd': 389, '\xce': 389, '\xcf': 389, '\xd0': 722, '\xd1': 722, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 570, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 611, '\xde': 611, 
'\xdf': 500, '\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 722, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 556, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 570, '\xf8': 500, '\xf9': 556, '\xfa': 556, '\xfb': 556, '\xfc': 556, '\xfd': 444, '\xfe': 500, '\xff': 444, '\u0100': 667, '\u0101': 500, '\u0102': 667, '\u0103': 500, '\u0104': 667, '\u0105': 500, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 608, '\u0110': 722, '\u0111': 500, '\u0112': 667, '\u0113': 444, '\u0116': 667, '\u0117': 444, '\u0118': 667, '\u0119': 444, '\u011a': 667, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 389, '\u012b': 278, '\u012e': 389, '\u012f': 278, '\u0130': 389, '\u0131': 278, '\u0136': 667, '\u0137': 500, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 382, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 556, '\u0145': 722, '\u0146': 556, '\u0147': 722, '\u0148': 556, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 944, '\u0153': 722, '\u0154': 667, '\u0155': 389, '\u0156': 667, '\u0157': 389, '\u0158': 667, '\u0159': 389, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 366, '\u016a': 722, '\u016b': 556, '\u016e': 722, '\u016f': 556, '\u0170': 722, '\u0171': 556, '\u0172': 722, '\u0173': 556, '\u0178': 611, '\u0179': 611, '\u017a': 389, '\u017b': 611, '\u017c': 389, '\u017d': 611, '\u017e': 389, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 500, '\u201d': 500, '\u201e': 500, '\u2020': 
500, '\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 1000, '\u2202': 494, '\u2206': 612, '\u2211': 600, '\u2212': 606, '\u221a': 549, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 494, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
|
||||
'Times-Italic': ({'FontName': 'Times-Italic', 'Descent': -217.0, 'FontBBox': (-169.0, -217.0, 1010.0, 883.0), 'FontWeight': 'Medium', 'CapHeight': 653.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 441.0, 'ItalicAngle': -15.5, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 420, '#': 500, '$': 500, '%': 833, '&': 778, "'": 214, '(': 333, ')': 333, '*': 500, '+': 675, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 333, ';': 333, '<': 675, '=': 675, '>': 675, '?': 500, '@': 920, 'A': 611, 'B': 611, 'C': 667, 'D': 722, 'E': 611, 'F': 611, 'G': 722, 'H': 722, 'I': 333, 'J': 444, 'K': 667, 'L': 556, 'M': 833, 'N': 667, 'O': 722, 'P': 611, 'Q': 722, 'R': 611, 'S': 500, 'T': 556, 'U': 722, 'V': 611, 'W': 833, 'X': 611, 'Y': 556, 'Z': 556, '[': 389, '\\': 278, ']': 389, '^': 422, '_': 500, '`': 333, 'a': 500, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 278, 'g': 500, 'h': 500, 'i': 278, 'j': 278, 'k': 444, 'l': 278, 'm': 722, 'n': 500, 'o': 500, 'p': 500, 'q': 500, 'r': 389, 's': 389, 't': 278, 'u': 500, 'v': 444, 'w': 667, 'x': 444, 'y': 444, 'z': 389, '{': 400, '|': 275, '}': 400, '~': 541, '\xa1': 389, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 275, '\xa7': 500, '\xa8': 333, '\xa9': 760, '\xaa': 276, '\xab': 500, '\xac': 675, '\xae': 760, '\xaf': 333, '\xb0': 400, '\xb1': 675, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 500, '\xb6': 523, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 310, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 500, '\xc0': 611, '\xc1': 611, '\xc2': 611, '\xc3': 611, '\xc4': 611, '\xc5': 611, '\xc6': 889, '\xc7': 667, '\xc8': 611, '\xc9': 611, '\xca': 611, '\xcb': 611, '\xcc': 333, '\xcd': 333, '\xce': 333, '\xcf': 333, '\xd0': 722, '\xd1': 667, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 675, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 556, '\xde': 611, '\xdf': 
500, '\xe0': 500, '\xe1': 500, '\xe2': 500, '\xe3': 500, '\xe4': 500, '\xe5': 500, '\xe6': 667, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 500, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 675, '\xf8': 500, '\xf9': 500, '\xfa': 500, '\xfb': 500, '\xfc': 500, '\xfd': 444, '\xfe': 500, '\xff': 444, '\u0100': 611, '\u0101': 500, '\u0102': 611, '\u0103': 500, '\u0104': 611, '\u0105': 500, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 544, '\u0110': 722, '\u0111': 500, '\u0112': 611, '\u0113': 444, '\u0116': 611, '\u0117': 444, '\u0118': 611, '\u0119': 444, '\u011a': 611, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 333, '\u012b': 278, '\u012e': 333, '\u012f': 278, '\u0130': 333, '\u0131': 278, '\u0136': 667, '\u0137': 444, '\u0139': 556, '\u013a': 278, '\u013b': 556, '\u013c': 278, '\u013d': 611, '\u013e': 300, '\u0141': 556, '\u0142': 278, '\u0143': 667, '\u0144': 500, '\u0145': 667, '\u0146': 500, '\u0147': 667, '\u0148': 500, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 944, '\u0153': 667, '\u0154': 611, '\u0155': 389, '\u0156': 611, '\u0157': 389, '\u0158': 611, '\u0159': 389, '\u015a': 500, '\u015b': 389, '\u015e': 500, '\u015f': 389, '\u0160': 500, '\u0161': 389, '\u0162': 556, '\u0163': 278, '\u0164': 556, '\u0165': 300, '\u016a': 722, '\u016b': 500, '\u016e': 722, '\u016f': 500, '\u0170': 722, '\u0171': 500, '\u0172': 722, '\u0173': 500, '\u0178': 556, '\u0179': 556, '\u017a': 389, '\u017b': 556, '\u017c': 389, '\u017d': 556, '\u017e': 389, '\u0192': 500, '\u0218': 500, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 889, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 556, '\u201d': 556, '\u201e': 556, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 889, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 980, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 675, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 500, '\ufb02': 500}),
|
||||
'Times-Roman': ({'FontName': 'Times-Roman', 'Descent': -217.0, 'FontBBox': (-168.0, -218.0, 1000.0, 898.0), 'FontWeight': 'Roman', 'CapHeight': 662.0, 'FontFamily': 'Times', 'Flags': 0, 'XHeight': 450.0, 'ItalicAngle': 0.0, 'Ascent': 683.0}, {' ': 250, '!': 333, '"': 408, '#': 500, '$': 500, '%': 833, '&': 778, "'": 180, '(': 333, ')': 333, '*': 500, '+': 564, ',': 250, '-': 333, '.': 250, '/': 278, '0': 500, '1': 500, '2': 500, '3': 500, '4': 500, '5': 500, '6': 500, '7': 500, '8': 500, '9': 500, ':': 278, ';': 278, '<': 564, '=': 564, '>': 564, '?': 444, '@': 921, 'A': 722, 'B': 667, 'C': 667, 'D': 722, 'E': 611, 'F': 556, 'G': 722, 'H': 722, 'I': 333, 'J': 389, 'K': 722, 'L': 611, 'M': 889, 'N': 722, 'O': 722, 'P': 556, 'Q': 722, 'R': 667, 'S': 556, 'T': 611, 'U': 722, 'V': 722, 'W': 944, 'X': 722, 'Y': 722, 'Z': 611, '[': 333, '\\': 278, ']': 333, '^': 469, '_': 500, '`': 333, 'a': 444, 'b': 500, 'c': 444, 'd': 500, 'e': 444, 'f': 333, 'g': 500, 'h': 500, 'i': 278, 'j': 278, 'k': 500, 'l': 278, 'm': 778, 'n': 500, 'o': 500, 'p': 500, 'q': 500, 'r': 333, 's': 389, 't': 278, 'u': 500, 'v': 500, 'w': 722, 'x': 500, 'y': 500, 'z': 444, '{': 480, '|': 200, '}': 480, '~': 541, '\xa1': 333, '\xa2': 500, '\xa3': 500, '\xa4': 500, '\xa5': 500, '\xa6': 200, '\xa7': 500, '\xa8': 333, '\xa9': 760, '\xaa': 276, '\xab': 500, '\xac': 564, '\xae': 760, '\xaf': 333, '\xb0': 400, '\xb1': 564, '\xb2': 300, '\xb3': 300, '\xb4': 333, '\xb5': 500, '\xb6': 453, '\xb7': 250, '\xb8': 333, '\xb9': 300, '\xba': 310, '\xbb': 500, '\xbc': 750, '\xbd': 750, '\xbe': 750, '\xbf': 444, '\xc0': 722, '\xc1': 722, '\xc2': 722, '\xc3': 722, '\xc4': 722, '\xc5': 722, '\xc6': 889, '\xc7': 667, '\xc8': 611, '\xc9': 611, '\xca': 611, '\xcb': 611, '\xcc': 333, '\xcd': 333, '\xce': 333, '\xcf': 333, '\xd0': 722, '\xd1': 722, '\xd2': 722, '\xd3': 722, '\xd4': 722, '\xd5': 722, '\xd6': 722, '\xd7': 564, '\xd8': 722, '\xd9': 722, '\xda': 722, '\xdb': 722, '\xdc': 722, '\xdd': 722, '\xde': 556, '\xdf': 500, 
'\xe0': 444, '\xe1': 444, '\xe2': 444, '\xe3': 444, '\xe4': 444, '\xe5': 444, '\xe6': 667, '\xe7': 444, '\xe8': 444, '\xe9': 444, '\xea': 444, '\xeb': 444, '\xec': 278, '\xed': 278, '\xee': 278, '\xef': 278, '\xf0': 500, '\xf1': 500, '\xf2': 500, '\xf3': 500, '\xf4': 500, '\xf5': 500, '\xf6': 500, '\xf7': 564, '\xf8': 500, '\xf9': 500, '\xfa': 500, '\xfb': 500, '\xfc': 500, '\xfd': 500, '\xfe': 500, '\xff': 500, '\u0100': 722, '\u0101': 444, '\u0102': 722, '\u0103': 444, '\u0104': 722, '\u0105': 444, '\u0106': 667, '\u0107': 444, '\u010c': 667, '\u010d': 444, '\u010e': 722, '\u010f': 588, '\u0110': 722, '\u0111': 500, '\u0112': 611, '\u0113': 444, '\u0116': 611, '\u0117': 444, '\u0118': 611, '\u0119': 444, '\u011a': 611, '\u011b': 444, '\u011e': 722, '\u011f': 500, '\u0122': 722, '\u0123': 500, '\u012a': 333, '\u012b': 278, '\u012e': 333, '\u012f': 278, '\u0130': 333, '\u0131': 278, '\u0136': 722, '\u0137': 500, '\u0139': 611, '\u013a': 278, '\u013b': 611, '\u013c': 278, '\u013d': 611, '\u013e': 344, '\u0141': 611, '\u0142': 278, '\u0143': 722, '\u0144': 500, '\u0145': 722, '\u0146': 500, '\u0147': 722, '\u0148': 500, '\u014c': 722, '\u014d': 500, '\u0150': 722, '\u0151': 500, '\u0152': 889, '\u0153': 722, '\u0154': 667, '\u0155': 333, '\u0156': 667, '\u0157': 333, '\u0158': 667, '\u0159': 333, '\u015a': 556, '\u015b': 389, '\u015e': 556, '\u015f': 389, '\u0160': 556, '\u0161': 389, '\u0162': 611, '\u0163': 278, '\u0164': 611, '\u0165': 326, '\u016a': 722, '\u016b': 500, '\u016e': 722, '\u016f': 500, '\u0170': 722, '\u0171': 500, '\u0172': 722, '\u0173': 500, '\u0178': 722, '\u0179': 611, '\u017a': 444, '\u017b': 611, '\u017c': 444, '\u017d': 611, '\u017e': 444, '\u0192': 500, '\u0218': 556, '\u0219': 389, '\u02c6': 333, '\u02c7': 333, '\u02d8': 333, '\u02d9': 333, '\u02da': 333, '\u02db': 333, '\u02dc': 333, '\u02dd': 333, '\u2013': 500, '\u2014': 1000, '\u2018': 333, '\u2019': 333, '\u201a': 333, '\u201c': 444, '\u201d': 444, '\u201e': 444, '\u2020': 500, 
'\u2021': 500, '\u2022': 350, '\u2026': 1000, '\u2030': 1000, '\u2039': 333, '\u203a': 333, '\u2044': 167, '\u2122': 980, '\u2202': 476, '\u2206': 612, '\u2211': 600, '\u2212': 564, '\u221a': 453, '\u2260': 549, '\u2264': 549, '\u2265': 549, '\u25ca': 471, '\uf6c3': 250, '\ufb01': 556, '\ufb02': 556}),
|
||||
'ZapfDingbats': ({'FontName': 'ZapfDingbats', 'FontBBox': (-1.0, -143.0, 981.0, 820.0), 'FontWeight': 'Medium', 'FontFamily': 'ITC', 'Flags': 0, 'ItalicAngle': 0.0}, {'\x01': 974, '\x02': 961, '\x03': 980, '\x04': 719, '\x05': 789, '\x06': 494, '\x07': 552, '\x08': 537, '\t': 577, '\n': 692, '\x0b': 960, '\x0c': 939, '\r': 549, '\x0e': 855, '\x0f': 911, '\x10': 933, '\x11': 945, '\x12': 974, '\x13': 755, '\x14': 846, '\x15': 762, '\x16': 761, '\x17': 571, '\x18': 677, '\x19': 763, '\x1a': 760, '\x1b': 759, '\x1c': 754, '\x1d': 786, '\x1e': 788, '\x1f': 788, ' ': 790, '!': 793, '"': 794, '#': 816, '$': 823, '%': 789, '&': 841, "'": 823, '(': 833, ')': 816, '*': 831, '+': 923, ',': 744, '-': 723, '.': 749, '/': 790, '0': 792, '1': 695, '2': 776, '3': 768, '4': 792, '5': 759, '6': 707, '7': 708, '8': 682, '9': 701, ':': 826, ';': 815, '<': 789, '=': 789, '>': 707, '?': 687, '@': 696, 'A': 689, 'B': 786, 'C': 787, 'D': 713, 'E': 791, 'F': 785, 'G': 791, 'H': 873, 'I': 761, 'J': 762, 'K': 759, 'L': 892, 'M': 892, 'N': 788, 'O': 784, 'Q': 438, 'R': 138, 'S': 277, 'T': 415, 'U': 509, 'V': 410, 'W': 234, 'X': 234, 'Y': 390, 'Z': 390, '[': 276, '\\': 276, ']': 317, '^': 317, '_': 334, '`': 334, 'a': 392, 'b': 392, 'c': 668, 'd': 668, 'e': 732, 'f': 544, 'g': 544, 'h': 910, 'i': 911, 'j': 667, 'k': 760, 'l': 760, 'm': 626, 'n': 694, 'o': 595, 'p': 776, 'u': 690, 'v': 791, 'w': 790, 'x': 788, 'y': 788, 'z': 788, '{': 788, '|': 788, '}': 788, '~': 788, '\x7f': 788, '\x80': 788, '\x81': 788, '\x82': 788, '\x83': 788, '\x84': 788, '\x85': 788, '\x86': 788, '\x87': 788, '\x88': 788, '\x89': 788, '\x8a': 788, '\x8b': 788, '\x8c': 788, '\x8d': 788, '\x8e': 788, '\x8f': 788, '\x90': 788, '\x91': 788, '\x92': 788, '\x93': 788, '\x94': 788, '\x95': 788, '\x96': 788, '\x97': 788, '\x98': 788, '\x99': 788, '\x9a': 788, '\x9b': 788, '\x9c': 788, '\x9d': 788, '\x9e': 788, '\x9f': 788, '\xa0': 894, '\xa1': 838, '\xa2': 924, '\xa3': 1016, '\xa4': 458, '\xa5': 924, '\xa6': 918, '\xa7': 927, 
'\xa8': 928, '\xa9': 928, '\xaa': 834, '\xab': 873, '\xac': 828, '\xad': 924, '\xae': 917, '\xaf': 930, '\xb0': 931, '\xb1': 463, '\xb2': 883, '\xb3': 836, '\xb4': 867, '\xb5': 696, '\xb6': 874, '\xb7': 760, '\xb8': 946, '\xb9': 865, '\xba': 967, '\xbb': 831, '\xbc': 873, '\xbd': 927, '\xbe': 970, '\xbf': 918, '\xc0': 748, '\xc1': 836, '\xc2': 771, '\xc3': 888, '\xc4': 748, '\xc5': 771, '\xc6': 888, '\xc7': 867, '\xc8': 696, '\xc9': 874, '\xca': 974, '\xcb': 762, '\xcc': 759, '\xcd': 509, '\xce': 410}),
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,151 +0,0 @@
|
||||
"""Functions that can be used for the most common use-cases for pdfminer.six"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from io import StringIO
|
||||
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
||||
PDFPageAggregator
|
||||
from .image import ImageWriter
|
||||
from .layout import LAParams
|
||||
from .pdfdevice import TagExtractor
|
||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from .pdfpage import PDFPage
|
||||
from .utils import open_filename
|
||||
|
||||
|
||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                       laparams=None, maxpages=0, page_numbers=None,
                       password="", scale=1.0, rotation=0, layoutmode='normal',
                       output_dir=None, strip_control=False, debug=False,
                       disable_caching=False, **kwargs):
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
        properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor (only passed to the HTML converter)
    :param rotation: Rotation in degrees added to every page's own rotation
        (modulo 360)
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Forwarded to the XML converter as `stripcontrol`;
        the other converters do not receive it.
    :param debug: If true, sets the root logger to DEBUG level
    :param disable_caching: If true, resource caching is turned off
    :param kwargs: accepted for call compatibility; not used by this function.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises ValueError: if output_type is not one of the supported values.
    """
    # Validate first: previously an unknown output_type left `device` unbound
    # and failed later with a confusing NameError, after the logger level had
    # been changed and the image directory possibly created.
    if output_type not in ('text', 'xml', 'html', 'tag'):
        raise ValueError(
            "Output type can be text, xml, html or tag but is %s"
            % (output_type,))

    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    # The text converter is handed outfp unchanged; every other converter
    # gets the underlying binary buffer when the target is sys.stdout.
    if output_type != 'text' and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=strip_control)
    elif output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    else:  # 'tag' -- the only value left after the validation above
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(inf,
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=not disable_caching):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()
|
||||
|
||||
|
||||
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Run the text converter over a PDF and return everything it wrote.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. When None, a
        default-constructed LAParams() is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as pdf_stream:
        with StringIO() as text_buffer:
            manager = PDFResourceManager(caching=caching)
            converter = TextConverter(manager, text_buffer, codec=codec,
                                      laparams=layout_params)
            page_interpreter = PDFPageInterpreter(manager, converter)

            pages = PDFPage.get_pages(pdf_stream, page_numbers,
                                      maxpages=maxpages, password=password,
                                      caching=caching)
            for page in pages:
                page_interpreter.process_page(page)

            # Read the buffer before the `with` closes it.
            return text_buffer.getvalue()
|
||||
|
||||
|
||||
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Generator yielding one aggregated layout result per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. When None, a
        default-constructed LAParams() is used.
    :return: yields the value of ``device.get_result()`` after each page.
    """
    layout_params = laparams if laparams is not None else LAParams()

    with open_filename(pdf_file, "rb") as pdf_stream:
        manager = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(manager, aggregator)

        pages = PDFPage.get_pages(pdf_stream, page_numbers, maxpages=maxpages,
                                  password=password, caching=caching)
        for page in pages:
            page_interpreter.process_page(page)
            # The file stays open while the caller consumes the generator.
            yield aggregator.get_result()
|
||||
@@ -1,165 +0,0 @@
|
||||
import os
|
||||
import os.path
|
||||
import struct
|
||||
from io import BytesIO
|
||||
|
||||
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
|
||||
from .pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from .pdfcolor import LITERAL_DEVICE_GRAY
|
||||
from .pdfcolor import LITERAL_DEVICE_RGB
|
||||
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
|
||||
|
||||
|
||||
def align32(x):
    """Round the integer ``x`` up to the next multiple of 4."""
    bumped = x + 3
    return (bumped // 4) * 4
|
||||
|
||||
|
||||
class BMPWriter:
    """Emit an uncompressed BMP file row by row into a seekable binary file.

    The constructor writes the file header, the 40-byte info header and, for
    1-bit and 8-bit images, a grey palette. Pixel rows are then supplied
    through :meth:`write_line`.
    """

    def __init__(self, fp, bits, width, height):
        """
        :param fp: binary file object supporting write(), tell() and seek().
        :param bits: bits per pixel; must be 1, 8 or 24.
        :param width: image width in pixels.
        :param height: image height in pixels.
        :raises ValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height

        # Number of palette entries that follow the info header.
        if bits == 1:
            palette_entries = 2
        elif bits == 8:
            palette_entries = 256
        elif bits == 24:
            palette_entries = 0
        else:
            raise ValueError(bits)

        # Each stored row is padded to a multiple of 4 bytes.
        row_bytes = (self.width * self.bits + 7) // 8
        self.linesize = align32(row_bytes)
        self.datasize = self.linesize * self.height
        pixel_offset = 14 + 40 + palette_entries * 4

        info = struct.pack(
            '<IiiHHIIIIII',
            40, self.width, self.height, 1, self.bits,
            0, self.datasize, 0, 0, palette_entries, 0,
        )
        assert len(info) == 40, str(len(info))

        header = struct.pack(
            '<ccIHHI',
            b'B', b'M', pixel_offset + self.datasize, 0, 0, pixel_offset,
        )
        assert len(header) == 14, str(len(header))

        self.fp.write(header)
        self.fp.write(info)

        if palette_entries == 2:
            # B&W color table
            grey_levels = (0, 255)
        elif palette_entries == 256:
            # grayscale color table
            grey_levels = range(256)
        else:
            grey_levels = ()
        for level in grey_levels:
            # One 4-byte palette entry: three equal channels plus a pad byte.
            self.fp.write(struct.pack('BBBx', level, level, level))

        # pos0 / pos1 delimit the pixel data region inside the file.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y, data):
        """Store one row of pixel bytes.

        Row ``y`` is placed ``(y + 1)`` row-sizes before the end of the pixel
        region, so row 0 ends up last in the file.
        """
        row_start = self.pos1 - (y + 1) * self.linesize
        self.fp.seek(row_start)
        self.fp.write(data)
|
||||
|
||||
|
||||
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir):
        """Remember the output directory, creating it if it does not exist."""
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def export_image(self, image):
        """Save ``image`` into the output directory and return its file name.

        The on-disk format is picked by :meth:`_get_image_extension`:
        ``.jpg`` data is copied as-is (CMYK JPEGs are inverted and converted
        to RGB through PIL), JBIG2 segments are re-wrapped into a JBIG2 file,
        1-bit / 8-bit RGB / 8-bit gray images become BMP files, and anything
        else is dumped as the decoded stream bytes.

        :param image: object exposing ``srcsize``, ``name``, ``bits``,
            ``colorspace`` and ``stream`` (with ``get_filters()``,
            ``get_data()`` and ``get_rawdata()``).
        :return: the file name (not the full path) that was written.
        """
        (width, height) = image.srcsize

        is_jbig2 = self.is_jbig2_image(image)
        ext = self._get_image_extension(image, width, height, is_jbig2)
        name, path = self._create_unique_image_name(self.outdir,
                                                    image.name, ext)

        # `with` guarantees the handle is closed even when a writer below
        # raises; the previous open()/close() pair leaked it on error.
        with open(path, 'wb') as fp:
            if ext == '.jpg':
                raw_data = image.stream.get_rawdata()
                if LITERAL_DEVICE_CMYK in image.colorspace:
                    # PIL is only needed for this branch, so import lazily.
                    from PIL import Image
                    from PIL import ImageChops
                    ifp = BytesIO(raw_data)
                    i = Image.open(ifp)
                    i = ImageChops.invert(i)
                    i = i.convert('RGB')
                    i.save(fp, 'JPEG')
                else:
                    fp.write(raw_data)
            elif is_jbig2:
                input_stream = BytesIO()
                input_stream.write(image.stream.get_data())
                input_stream.seek(0)
                reader = JBIG2StreamReader(input_stream)
                segments = reader.get_segments()

                writer = JBIG2StreamWriter(fp)
                writer.write_file(segments)
            elif image.bits == 1:
                self._write_bitmap(fp, image, 1, width, height,
                                   (width + 7) // 8)
            elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
                self._write_bitmap(fp, image, 24, width, height, width * 3)
            elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
                self._write_bitmap(fp, image, 8, width, height, width)
            else:
                fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _write_bitmap(fp, image, bits, width, height, stride):
        """Write the decoded stream of ``image`` as BMP rows of ``stride`` bytes."""
        bmp = BMPWriter(fp, bits, width, height)
        data = image.stream.get_data()
        for y in range(height):
            start = y * stride
            bmp.write_line(y, data[start:start + stride])

    @staticmethod
    def is_jbig2_image(image):
        """Return True if any stream filter is one of LITERALS_JBIG2_DECODE."""
        filters = image.stream.get_filters()
        return any(filter_name in LITERALS_JBIG2_DECODE
                   for filter_name, params in filters)

    @staticmethod
    def _get_image_extension(image, width, height, is_jbig2):
        """Choose the output file extension for ``image``.

        :return: ``'.jpg'`` for a single DCT filter, ``'.jb2'`` for JBIG2,
            ``'.<w>x<h>.bmp'`` for 1-bit or 8-bit RGB/gray images, otherwise
            ``'.<bits>.<w>x<h>.img'``.
        """
        filters = image.stream.get_filters()
        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
            ext = '.jpg'
        elif is_jbig2:
            ext = '.jb2'
        elif (image.bits == 1 or
              (image.bits == 8 and
               (LITERAL_DEVICE_RGB in image.colorspace or
                LITERAL_DEVICE_GRAY in image.colorspace))):
            ext = '.%dx%d.bmp' % (width, height)
        else:
            ext = '.%d.%dx%d.img' % (image.bits, width, height)
        return ext

    @staticmethod
    def _create_unique_image_name(dirname, image_name, ext):
        """Return ``(name, path)`` for a file in ``dirname`` that does not exist yet.

        Tries ``image_name + ext`` first, then ``image_name.<n><ext>`` with
        n = 0, 1, 2, ... until an unused path is found.
        """
        name = image_name + ext
        path = os.path.join(dirname, name)
        img_index = 0
        while os.path.exists(path):
            name = '%s.%d%s' % (image_name, img_index, ext)
            path = os.path.join(dirname, name)
            img_index += 1
        return name, path
|
||||
@@ -1,321 +0,0 @@
|
||||
import math
|
||||
import os
|
||||
from struct import pack, unpack, calcsize
|
||||
|
||||
# Fixed part of a segment header: (struct format, field name) in stream order.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# Bits of the segment header "flags" byte.
HEADER_FLAG_DEFERRED = 0x80
HEADER_FLAG_PAGE_ASSOC_LONG = 0x40

# Low six bits of the "flags" byte hold the segment type.
SEG_TYPE_MASK = 0x3F

# Referred-to segment count: top three bits of the short form, low 29 bits of
# the long form; a short-form count of 7 signals the long form.
REF_COUNT_SHORT_MASK = 0xE0
REF_COUNT_LONG_MASK = 0x1FFFFFFF
REF_COUNT_LONG = 7

# Sentinel data_length meaning the length is not known in advance.
DATA_LEN_UNKNOWN = 0xFFFFFFFF

# Segment type numbers used by this module.
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50

# File header: magic ID string and bits of the file header flags byte.
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0x01
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0x02
|
||||
|
||||
|
||||
def bit_set(bit_pos, value):
    """Return True when bit number ``bit_pos`` (0 = least significant) of ``value`` is 1."""
    shifted = value >> bit_pos
    return bool(shifted & 1)
|
||||
|
||||
|
||||
def check_flag(flag, value):
    """Return True when ``flag`` and ``value`` share at least one set bit."""
    overlap = flag & value
    return bool(overlap)
|
||||
|
||||
|
||||
def masked_value(mask, value):
    """Extract the field of ``value`` selected by ``mask``.

    The masked bits are shifted right so that the lowest set bit of ``mask``
    lands on bit 0. Only bit positions 0..30 of ``mask`` are examined.

    :raises Exception: if ``mask`` has no set bit in positions 0..30.
    """
    shift = 0
    while shift < 31:
        if bit_set(shift, mask):
            return (value & mask) >> shift
        shift += 1

    raise Exception("Invalid mask or value")
|
||||
|
||||
|
||||
def mask_value(mask, value):
    """Place ``value`` into the field described by ``mask``.

    ``value`` is truncated to the width of the mask and shifted left so it
    starts at the lowest set bit of ``mask``. Only bit positions 0..30 of
    ``mask`` are examined.

    :raises Exception: if ``mask`` has no set bit in positions 0..30.
    """
    shift = 0
    while shift < 31:
        if bit_set(shift, mask):
            field_width_mask = mask >> shift
            return (value & field_width_mask) << shift
        shift += 1

    raise Exception("Invalid mask or value")
|
||||
|
||||
|
||||
class JBIG2StreamReader:
    """Parse a JBIG2 byte stream into a list of segment dictionaries."""

    def __init__(self, stream):
        # Binary stream supporting read() and relative seek().
        self.stream = stream

    def get_segments(self):
        """Read segments until the stream is exhausted.

        :return: list of dicts keyed by the SEG_STRUCT field names (plus
            ``raw_data`` when the segment carries data). A segment whose
            fixed-size header field is truncated is dropped.
        """
        collected = []
        while not self.is_eof():
            segment = self._read_segment()
            if segment is not None:
                collected.append(segment)
        return collected

    def _read_segment(self):
        """Parse one segment; return None if a header field is cut short."""
        segment = {}
        for field_format, name in SEG_STRUCT:
            expected = calcsize(field_format)
            raw = self.stream.read(expected)
            if len(raw) < expected:
                return None
            unpacked = unpack(field_format, raw)
            value = unpacked[0] if len(unpacked) == 1 else unpacked
            # A parse_<field> method, when present, post-processes the raw
            # value and may consume further bytes from the stream.
            parser = getattr(self, "parse_%s" % name, None)
            if callable(parser):
                value = parser(segment, value, raw)
            segment[name] = value
        return segment

    def is_eof(self):
        """Return True at end of stream; otherwise leave the position unchanged."""
        probe = self.stream.read(1)
        if probe == b'':
            return True
        # Un-read the probe byte.
        self.stream.seek(-1, os.SEEK_CUR)
        return False

    def parse_flags(self, segment, flags, field):
        """Split the header flags byte into its three components."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags)
        }

    def parse_retention_flags(self, segment, flags, field):
        """Decode the referred-to segment count, retain bits and references.

        :return: dict with ``ref_count``, ``retain_segments`` (list of bool)
            and ``ref_segments`` (list of int).
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        ref_segments = []

        if ref_count >= REF_COUNT_LONG:
            # Long form: the count occupies a 4-byte word starting at `field`.
            field += self.stream.read(3)
            (packed_count,) = unpack(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, packed_count)
            retain_segments = []
            for _ in range(int(math.ceil((ref_count + 1) / 8))):
                (ret_byte,) = unpack(">B", self.stream.read(1))
                # NOTE(review): only bits 0..6 of each byte are recorded;
                # bit 7 is skipped -- confirm whether range(8) was intended.
                retain_segments.extend(
                    bit_set(bit_pos, ret_byte) for bit_pos in range(7))
        else:
            # Short form: retain bits are the low five bits of the same byte.
            retain_segments = [bit_set(bit_pos, flags) for bit_pos in range(5)]

        # The width of each reference depends on this segment's own number.
        seg_num = segment["number"]
        if seg_num > 65536:
            ref_format = ">L"
        elif seg_num > 256:
            # NOTE(review): ">I" is 4 bytes in struct's standard sizing, the
            # same as ">L"; a 2-byte ">H" may have been intended -- confirm
            # against the JBIG2 spec and the matching writer code.
            ref_format = ">I"
        else:
            ref_format = ">B"

        ref_size = calcsize(ref_format)
        for _ in range(ref_count):
            (ref,) = unpack(ref_format, self.stream.read(ref_size))
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment, page, field):
        """Return the page association, widening to 4 bytes when flagged."""
        if not segment["flags"]["page_assoc_long"]:
            return page
        field += self.stream.read(3)
        (page,) = unpack(">L", field)
        return page

    def parse_data_length(self, segment, length, field):
        """Read the segment data into ``segment["raw_data"]`` and return ``length``.

        :raises NotImplementedError: for an immediate generic region segment
            whose length is DATA_LEN_UNKNOWN.
        """
        if not length:
            return length

        is_unknown_length = (
            (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION)
            and (length == DATA_LEN_UNKNOWN)
        )
        if is_unknown_length:
            raise NotImplementedError(
                "Working with unknown segment length "
                "is not implemented yet"
            )

        segment["raw_data"] = self.stream.read(length)
        return length
|
||||
|
||||
|
||||
class JBIG2StreamWriter:
    """Write JBIG2 segments to a file in JBIG2 format.

    Segments are dicts keyed by the field names listed in ``SEG_STRUCT``
    (``number``, ``flags``, ``retention_flags``, ``page_assoc``,
    ``data_length``) plus ``raw_data``. A field is encoded by the
    ``encode_<name>`` method when one exists, otherwise by packing the
    value with the format given in ``SEG_STRUCT``.
    """

    def __init__(self, stream):
        """:param stream: binary file-like object providing ``write``."""
        self.stream = stream

    def write_segments(self, segments, fix_last_page=True):
        """Write ``segments`` to the stream.

        :param segments: iterable of segment dicts.
        :param fix_last_page: when true, append an end-of-page segment if
            the last page seen was not closed by one.
        :return: number of bytes written.
        """
        data_len, _ = self._write_segments(segments, fix_last_page)
        return data_len

    def _write_segments(self, segments, fix_last_page):
        """Write segments; return ``(bytes_written, last_segment_number)``.

        ``last_segment_number`` is ``None`` when nothing was written and
        accounts for the end-of-page segment appended by this method.
        """
        data_len = 0
        current_page = None
        seg_num = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = segment["number"]

            if fix_last_page:
                seg_page = segment.get("page_assoc")

                if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE:
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            seg_num += 1
            segment = self.get_eop_segment(seg_num, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len, seg_num

    def write_file(self, segments, fix_last_page=True):
        """Write a complete file: header, segments and end-of-file segment.

        :param segments: iterable of segment dicts; it is consumed once.
        :param fix_last_page: forwarded to the segment writer.
        :return: number of bytes written.
        """
        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
        header += pack(">B", header_flags)
        self.stream.write(header)
        data_len = len(header)

        segments_len, last_seg_num = self._write_segments(
            segments, fix_last_page)
        data_len += segments_len

        # Number the end-of-file segment after the last segment actually
        # written (including an appended end-of-page segment) so that no
        # segment number is used twice. With no segments the number is 1.
        eof_segment = self.get_eof_segment((last_seg_num or 0) + 1)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment):
        """Return the bytes of one segment, field by field."""
        data = b''
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            data += field
        return data

    @staticmethod
    def _page_assoc_is_long(flags, segment):
        """Tell whether the page association needs the 4-byte form.

        An explicit ``page_assoc_long`` entry in ``flags`` wins; otherwise
        the long form is used when the page number exceeds 255.
        """
        if "page_assoc_long" in flags:
            return bool(flags["page_assoc_long"])
        return (segment.get("page_assoc") or 0) > 255

    def encode_flags(self, value, segment):
        """Encode the segment header flags byte from the flags dict."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        # The page number lives under "page_assoc"; the previous lookup
        # of "page" never matched, so the long flag was never derived.
        if self._page_assoc_is_long(value, segment):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_page_assoc(self, value, segment):
        """Encode the page association as 1 byte, or 4 in the long form.

        Keeps the field width in step with the long flag written by
        ``encode_flags``.
        """
        flags = segment.get("flags") or {}
        if self._page_assoc_is_long(flags, segment):
            return pack(">L", value)
        return pack(">B", value)

    def encode_retention_flags(self, value, segment):
        """Encode the reference count, retain bits and references.

        :param value: dict with ``ref_count`` and optionally
            ``retain_segments`` and ``ref_segments``.
        :param segment: the segment being encoded; its ``number`` selects
            the byte width of each referred-to segment number.
        """
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        retain_segments = value.get("retain_segments", [])

        if ref_count <= 4:
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            # Only the five low bits hold retain bits, and a bit is set
            # only when the corresponding entry is truthy.
            for ref_index, ref_retain in enumerate(retain_segments[:5]):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            bytes_count = math.ceil((ref_count + 1) / 8)
            flags_format = ">L" + ("B" * bytes_count)
            flags_dword = mask_value(
                REF_COUNT_SHORT_MASK,
                REF_COUNT_LONG
            ) << 24
            # Store the actual count in the long field; the reader
            # extracts it with REF_COUNT_LONG_MASK.
            # NOTE(review): assumes mask_value places the value at the
            # mask's bit position, as its other uses here suggest.
            flags_dword |= mask_value(REF_COUNT_LONG_MASK, ref_count)
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = value.get("ref_segments", [])

        seg_num = segment["number"]
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            # "H" is the 2-byte unsigned format; "I" would emit 4 bytes.
            ref_format = "H"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_data_length(self, value, segment):
        """Encode the 4-byte data length followed by the raw payload.

        A segment without ``raw_data`` is written with an empty payload.
        """
        data = pack(">L", value)
        data += segment.get("raw_data", b'')
        return data

    def get_eop_segment(self, seg_number, page_number):
        """Build an empty end-of-page segment for ``page_number``."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
            'number': seg_number,
            'page_assoc': page_number,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }

    def get_eof_segment(self, seg_number):
        """Build an empty end-of-file segment (page association 0)."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
            'number': seg_number,
            'page_assoc': 0,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }
|
||||
@@ -1,242 +0,0 @@
|
||||
""" Standard encoding tables used in PDF.
|
||||
|
||||
This table is extracted from PDF Reference Manual 1.6, pp.925
|
||||
"D.1 Latin Character Set and Encodings"
|
||||
|
||||
"""
|
||||
|
||||
ENCODING = [
|
||||
# (name, std, mac, win, pdf)
|
||||
('A', 65, 65, 65, 65),
|
||||
('AE', 225, 174, 198, 198),
|
||||
('Aacute', None, 231, 193, 193),
|
||||
('Acircumflex', None, 229, 194, 194),
|
||||
('Adieresis', None, 128, 196, 196),
|
||||
('Agrave', None, 203, 192, 192),
|
||||
('Aring', None, 129, 197, 197),
|
||||
('Atilde', None, 204, 195, 195),
|
||||
('B', 66, 66, 66, 66),
|
||||
('C', 67, 67, 67, 67),
|
||||
('Ccedilla', None, 130, 199, 199),
|
||||
('D', 68, 68, 68, 68),
|
||||
('E', 69, 69, 69, 69),
|
||||
('Eacute', None, 131, 201, 201),
|
||||
('Ecircumflex', None, 230, 202, 202),
|
||||
('Edieresis', None, 232, 203, 203),
|
||||
('Egrave', None, 233, 200, 200),
|
||||
('Eth', None, None, 208, 208),
|
||||
('Euro', None, None, 128, 160),
|
||||
('F', 70, 70, 70, 70),
|
||||
('G', 71, 71, 71, 71),
|
||||
('H', 72, 72, 72, 72),
|
||||
('I', 73, 73, 73, 73),
|
||||
('Iacute', None, 234, 205, 205),
|
||||
('Icircumflex', None, 235, 206, 206),
|
||||
('Idieresis', None, 236, 207, 207),
|
||||
('Igrave', None, 237, 204, 204),
|
||||
('J', 74, 74, 74, 74),
|
||||
('K', 75, 75, 75, 75),
|
||||
('L', 76, 76, 76, 76),
|
||||
('Lslash', 232, None, None, 149),
|
||||
('M', 77, 77, 77, 77),
|
||||
('N', 78, 78, 78, 78),
|
||||
('Ntilde', None, 132, 209, 209),
|
||||
('O', 79, 79, 79, 79),
|
||||
('OE', 234, 206, 140, 150),
|
||||
('Oacute', None, 238, 211, 211),
|
||||
('Ocircumflex', None, 239, 212, 212),
|
||||
('Odieresis', None, 133, 214, 214),
|
||||
('Ograve', None, 241, 210, 210),
|
||||
('Oslash', 233, 175, 216, 216),
|
||||
('Otilde', None, 205, 213, 213),
|
||||
('P', 80, 80, 80, 80),
|
||||
('Q', 81, 81, 81, 81),
|
||||
('R', 82, 82, 82, 82),
|
||||
('S', 83, 83, 83, 83),
|
||||
('Scaron', None, None, 138, 151),
|
||||
('T', 84, 84, 84, 84),
|
||||
('Thorn', None, None, 222, 222),
|
||||
('U', 85, 85, 85, 85),
|
||||
('Uacute', None, 242, 218, 218),
|
||||
('Ucircumflex', None, 243, 219, 219),
|
||||
('Udieresis', None, 134, 220, 220),
|
||||
('Ugrave', None, 244, 217, 217),
|
||||
('V', 86, 86, 86, 86),
|
||||
('W', 87, 87, 87, 87),
|
||||
('X', 88, 88, 88, 88),
|
||||
('Y', 89, 89, 89, 89),
|
||||
('Yacute', None, None, 221, 221),
|
||||
('Ydieresis', None, 217, 159, 152),
|
||||
('Z', 90, 90, 90, 90),
|
||||
('Zcaron', None, None, 142, 153),
|
||||
('a', 97, 97, 97, 97),
|
||||
('aacute', None, 135, 225, 225),
|
||||
('acircumflex', None, 137, 226, 226),
|
||||
('acute', 194, 171, 180, 180),
|
||||
('adieresis', None, 138, 228, 228),
|
||||
('ae', 241, 190, 230, 230),
|
||||
('agrave', None, 136, 224, 224),
|
||||
('ampersand', 38, 38, 38, 38),
|
||||
('aring', None, 140, 229, 229),
|
||||
('asciicircum', 94, 94, 94, 94),
|
||||
('asciitilde', 126, 126, 126, 126),
|
||||
('asterisk', 42, 42, 42, 42),
|
||||
('at', 64, 64, 64, 64),
|
||||
('atilde', None, 139, 227, 227),
|
||||
('b', 98, 98, 98, 98),
|
||||
('backslash', 92, 92, 92, 92),
|
||||
('bar', 124, 124, 124, 124),
|
||||
('braceleft', 123, 123, 123, 123),
|
||||
('braceright', 125, 125, 125, 125),
|
||||
('bracketleft', 91, 91, 91, 91),
|
||||
('bracketright', 93, 93, 93, 93),
|
||||
('breve', 198, 249, None, 24),
|
||||
('brokenbar', None, None, 166, 166),
|
||||
('bullet', 183, 165, 149, 128),
|
||||
('c', 99, 99, 99, 99),
|
||||
('caron', 207, 255, None, 25),
|
||||
('ccedilla', None, 141, 231, 231),
|
||||
('cedilla', 203, 252, 184, 184),
|
||||
('cent', 162, 162, 162, 162),
|
||||
('circumflex', 195, 246, 136, 26),
|
||||
('colon', 58, 58, 58, 58),
|
||||
('comma', 44, 44, 44, 44),
|
||||
('copyright', None, 169, 169, 169),
|
||||
('currency', 168, 219, 164, 164),
|
||||
('d', 100, 100, 100, 100),
|
||||
('dagger', 178, 160, 134, 129),
|
||||
('daggerdbl', 179, 224, 135, 130),
|
||||
('degree', None, 161, 176, 176),
|
||||
('dieresis', 200, 172, 168, 168),
|
||||
('divide', None, 214, 247, 247),
|
||||
('dollar', 36, 36, 36, 36),
|
||||
('dotaccent', 199, 250, None, 27),
|
||||
('dotlessi', 245, 245, None, 154),
|
||||
('e', 101, 101, 101, 101),
|
||||
('eacute', None, 142, 233, 233),
|
||||
('ecircumflex', None, 144, 234, 234),
|
||||
('edieresis', None, 145, 235, 235),
|
||||
('egrave', None, 143, 232, 232),
|
||||
('eight', 56, 56, 56, 56),
|
||||
('ellipsis', 188, 201, 133, 131),
|
||||
('emdash', 208, 209, 151, 132),
|
||||
('endash', 177, 208, 150, 133),
|
||||
('equal', 61, 61, 61, 61),
|
||||
('eth', None, None, 240, 240),
|
||||
('exclam', 33, 33, 33, 33),
|
||||
('exclamdown', 161, 193, 161, 161),
|
||||
('f', 102, 102, 102, 102),
|
||||
('fi', 174, 222, None, 147),
|
||||
('five', 53, 53, 53, 53),
|
||||
('fl', 175, 223, None, 148),
|
||||
('florin', 166, 196, 131, 134),
|
||||
('four', 52, 52, 52, 52),
|
||||
('fraction', 164, 218, None, 135),
|
||||
('g', 103, 103, 103, 103),
|
||||
('germandbls', 251, 167, 223, 223),
|
||||
('grave', 193, 96, 96, 96),
|
||||
('greater', 62, 62, 62, 62),
|
||||
('guillemotleft', 171, 199, 171, 171),
|
||||
('guillemotright', 187, 200, 187, 187),
|
||||
('guilsinglleft', 172, 220, 139, 136),
|
||||
('guilsinglright', 173, 221, 155, 137),
|
||||
('h', 104, 104, 104, 104),
|
||||
('hungarumlaut', 205, 253, None, 28),
|
||||
('hyphen', 45, 45, 45, 45),
|
||||
('i', 105, 105, 105, 105),
|
||||
('iacute', None, 146, 237, 237),
|
||||
('icircumflex', None, 148, 238, 238),
|
||||
('idieresis', None, 149, 239, 239),
|
||||
('igrave', None, 147, 236, 236),
|
||||
('j', 106, 106, 106, 106),
|
||||
('k', 107, 107, 107, 107),
|
||||
('l', 108, 108, 108, 108),
|
||||
('less', 60, 60, 60, 60),
|
||||
('logicalnot', None, 194, 172, 172),
|
||||
('lslash', 248, None, None, 155),
|
||||
('m', 109, 109, 109, 109),
|
||||
('macron', 197, 248, 175, 175),
|
||||
('minus', None, None, None, 138),
|
||||
('mu', None, 181, 181, 181),
|
||||
('multiply', None, None, 215, 215),
|
||||
('n', 110, 110, 110, 110),
|
||||
('nbspace', None, 202, 160, None),
|
||||
('nine', 57, 57, 57, 57),
|
||||
('ntilde', None, 150, 241, 241),
|
||||
('numbersign', 35, 35, 35, 35),
|
||||
('o', 111, 111, 111, 111),
|
||||
('oacute', None, 151, 243, 243),
|
||||
('ocircumflex', None, 153, 244, 244),
|
||||
('odieresis', None, 154, 246, 246),
|
||||
('oe', 250, 207, 156, 156),
|
||||
('ogonek', 206, 254, None, 29),
|
||||
('ograve', None, 152, 242, 242),
|
||||
('one', 49, 49, 49, 49),
|
||||
('onehalf', None, None, 189, 189),
|
||||
('onequarter', None, None, 188, 188),
|
||||
('onesuperior', None, None, 185, 185),
|
||||
('ordfeminine', 227, 187, 170, 170),
|
||||
('ordmasculine', 235, 188, 186, 186),
|
||||
('oslash', 249, 191, 248, 248),
|
||||
('otilde', None, 155, 245, 245),
|
||||
('p', 112, 112, 112, 112),
|
||||
('paragraph', 182, 166, 182, 182),
|
||||
('parenleft', 40, 40, 40, 40),
|
||||
('parenright', 41, 41, 41, 41),
|
||||
('percent', 37, 37, 37, 37),
|
||||
('period', 46, 46, 46, 46),
|
||||
('periodcentered', 180, 225, 183, 183),
|
||||
('perthousand', 189, 228, 137, 139),
|
||||
('plus', 43, 43, 43, 43),
|
||||
('plusminus', None, 177, 177, 177),
|
||||
('q', 113, 113, 113, 113),
|
||||
('question', 63, 63, 63, 63),
|
||||
('questiondown', 191, 192, 191, 191),
|
||||
('quotedbl', 34, 34, 34, 34),
|
||||
('quotedblbase', 185, 227, 132, 140),
|
||||
('quotedblleft', 170, 210, 147, 141),
|
||||
('quotedblright', 186, 211, 148, 142),
|
||||
('quoteleft', 96, 212, 145, 143),
|
||||
('quoteright', 39, 213, 146, 144),
|
||||
('quotesinglbase', 184, 226, 130, 145),
|
||||
('quotesingle', 169, 39, 39, 39),
|
||||
('r', 114, 114, 114, 114),
|
||||
('registered', None, 168, 174, 174),
|
||||
('ring', 202, 251, None, 30),
|
||||
('s', 115, 115, 115, 115),
|
||||
('scaron', None, None, 154, 157),
|
||||
('section', 167, 164, 167, 167),
|
||||
('semicolon', 59, 59, 59, 59),
|
||||
('seven', 55, 55, 55, 55),
|
||||
('six', 54, 54, 54, 54),
|
||||
('slash', 47, 47, 47, 47),
|
||||
('space', 32, 32, 32, 32),
|
||||
('space', None, 202, 160, None),
|
||||
('space', None, 202, 173, None),
|
||||
('sterling', 163, 163, 163, 163),
|
||||
('t', 116, 116, 116, 116),
|
||||
('thorn', None, None, 254, 254),
|
||||
('three', 51, 51, 51, 51),
|
||||
('threequarters', None, None, 190, 190),
|
||||
('threesuperior', None, None, 179, 179),
|
||||
('tilde', 196, 247, 152, 31),
|
||||
('trademark', None, 170, 153, 146),
|
||||
('two', 50, 50, 50, 50),
|
||||
('twosuperior', None, None, 178, 178),
|
||||
('u', 117, 117, 117, 117),
|
||||
('uacute', None, 156, 250, 250),
|
||||
('ucircumflex', None, 158, 251, 251),
|
||||
('udieresis', None, 159, 252, 252),
|
||||
('ugrave', None, 157, 249, 249),
|
||||
('underscore', 95, 95, 95, 95),
|
||||
('v', 118, 118, 118, 118),
|
||||
('w', 119, 119, 119, 119),
|
||||
('x', 120, 120, 120, 120),
|
||||
('y', 121, 121, 121, 121),
|
||||
('yacute', None, None, 253, 253),
|
||||
('ydieresis', None, 216, 255, 255),
|
||||
('yen', 165, 180, 165, 165),
|
||||
('z', 122, 122, 122, 122),
|
||||
('zcaron', None, None, 158, 158),
|
||||
('zero', 48, 48, 48, 48),
|
||||
]
|
||||
@@ -1,866 +0,0 @@
|
||||
import heapq
|
||||
import logging
|
||||
|
||||
from .utils import INF
|
||||
from .utils import Plane
|
||||
from .utils import apply_matrix_pt
|
||||
from .utils import bbox2str
|
||||
from .utils import fsplit
|
||||
from .utils import get_bound
|
||||
from .utils import matrix2str
|
||||
from .utils import uniq
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IndexAssigner:
    """Hand out consecutive ``index`` values to text boxes.

    ``run`` numbers an LTTextBox directly and walks the children of an
    LTTextGroup recursively; any other object is ignored.
    """

    def __init__(self, index=0):
        # Next index value to hand out.
        self.index = index

    def run(self, obj):
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)
|
||||
|
||||
|
||||
class LAParams:
    """Parameters for layout analysis

    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    :param char_margin: If two characters are closer together than this
        margin they are considered part of the same line. The margin is
        specified relative to the width of the character.
    :param word_margin: If two characters on the same line are further apart
        than this margin then they are considered to be two separate words,
        and an intermediate space will be added for readability. The margin
        is specified relative to the width of the character.
    :param line_margin: If two lines are close together they are
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The
        value should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters). You can also pass
        `None` to disable advanced layout analysis, and instead return text
        based on the position of the bottom left corner of the text box.
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    :param all_texts: If layout analysis should be performed on text in
        figures.
    :raises TypeError: if ``boxes_flow`` is neither ``None`` nor an int
        or float.
    :raises ValueError: if ``boxes_flow`` lies outside -1 .. +1.
    """

    def __init__(self,
                 line_overlap=0.5,
                 char_margin=2.0,
                 line_margin=0.5,
                 word_margin=0.1,
                 boxes_flow=0.5,
                 detect_vertical=False,
                 all_texts=False):
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self):
        """Check ``boxes_flow``; ``None`` is accepted without checks."""
        flow = self.boxes_flow
        if flow is None:
            return

        boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
                              "number between -1 and +1")
        if not isinstance(flow, (int, float)):
            raise TypeError(boxes_flow_err_msg)
        if not (-1 <= flow <= 1):
            raise ValueError(boxes_flow_err_msg)

    def __repr__(self):
        template = ('<LAParams: char_margin=%.1f, line_margin=%.1f, '
                    'word_margin=%.1f all_texts=%r>')
        shown = (self.char_margin, self.line_margin, self.word_margin,
                 self.all_texts)
        return template % shown
|
||||
|
||||
|
||||
class LTItem:
    """Interface for things that can be analyzed"""

    def analyze(self, laparams):
        """Perform the layout analysis; this base version does nothing."""
        return None
|
||||
|
||||
|
||||
class LTText:
    """Interface for things that have text"""

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s %r>' % (class_name, self.get_text())

    def get_text(self):
        """Text contained in this object; subclasses must override it."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class LTComponent(LTItem):
    """Object with a bounding box

    The box is stored as ``bbox`` and unpacked into ``x0``, ``y0``,
    ``x1``, ``y1``, ``width`` and ``height``. Ordering comparisons are
    disabled and raise ValueError.
    """

    def __init__(self, bbox):
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s %s>' % (class_name, bbox2str(self.bbox))

    # Disable comparison.
    def __lt__(self, _):
        raise ValueError

    def __le__(self, _):
        raise ValueError

    def __gt__(self, _):
        raise ValueError

    def __ge__(self, _):
        raise ValueError

    def set_bbox(self, bbox):
        """Store ``bbox`` and the coordinates and size derived from it."""
        left, bottom, right, top = bbox
        self.x0 = left
        self.y0 = bottom
        self.x1 = right
        self.y1 = top
        self.width = right-left
        self.height = top-bottom
        self.bbox = bbox

    def is_empty(self):
        """True when the width or the height is not positive."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj):
        """True when the x ranges of ``self`` and ``obj`` intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj):
        """Horizontal gap to ``obj``; 0 when the x ranges intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
        return 0

    def hoverlap(self, obj):
        """Smaller of the two opposite-edge x distances, or 0 when the
        x ranges do not intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def is_voverlap(self, obj):
        """True when the y ranges of ``self`` and ``obj`` intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj):
        """Vertical gap to ``obj``; 0 when the y ranges intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
        return 0

    def voverlap(self, obj):
        """Smaller of the two opposite-edge y distances, or 0 when the
        y ranges do not intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
|
||||
class LTCurve(LTComponent):
    """A generic Bezier curve

    The bounding box is computed from ``pts`` with ``get_bound``; the
    remaining constructor arguments are stored as attributes unchanged.
    """

    def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False,
                 stroking_color=None, non_stroking_color=None):
        LTComponent.__init__(self, get_bound(pts))
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color

    def get_pts(self):
        """Return the points as one comma-separated string, each
        coordinate formatted with three decimals."""
        formatted = ('%.3f,%.3f' % point for point in self.pts)
        return ','.join(formatted)
|
||||
|
||||
|
||||
class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures. It is an LTCurve whose
    points are the two end points ``p0`` and ``p1``.
    """

    def __init__(self, linewidth, p0, p1, stroke=False, fill=False,
                 evenodd=False, stroking_color=None, non_stroking_color=None):
        end_points = [p0, p1]
        LTCurve.__init__(self, linewidth, end_points, stroke, fill, evenodd,
                         stroking_color, non_stroking_color)
|
||||
|
||||
|
||||
class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing another pictures or figures. It is an
    LTCurve through the four corners of ``bbox``.
    """

    def __init__(self, linewidth, bbox, stroke=False, fill=False,
                 evenodd=False, stroking_color=None, non_stroking_color=None):
        left, bottom, right, top = bbox
        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
        LTCurve.__init__(self, linewidth, corners, stroke,
                         fill, evenodd, stroking_color, non_stroking_color)
|
||||
|
||||
|
||||
class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2. Size, mask flag,
    bit depth and colour space are looked up on ``stream`` through its
    ``get_any`` method.
    """

    def __init__(self, name, stream, bbox):
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream

        src_width = stream.get_any(('W', 'Width'))
        src_height = stream.get_any(('H', 'Height'))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(('IM', 'ImageMask'))
        self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)

        colorspace = stream.get_any(('CS', 'ColorSpace'))
        # Always expose the colour space as a list.
        self.colorspace = colorspace
        if not isinstance(colorspace, list):
            self.colorspace = [colorspace]

    def __repr__(self):
        parts = (self.__class__.__name__, self.name,
                 bbox2str(self.bbox), self.srcsize)
        return '<%s(%s) %s %r>' % parts
|
||||
|
||||
|
||||
class LTAnno(LTItem, LTText):
    """Actual letter in the text as a Unicode string.

    Note that, while a LTChar object has actual boundaries, LTAnno objects
    do not, as these are "virtual" characters, inserted by a layout analyzer
    according to the relationship between two characters (e.g. a space).
    """

    def __init__(self, text):
        self._text = text

    def get_text(self):
        """Return the text given to the constructor."""
        return self._text
|
||||
|
||||
|
||||
class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string.

    The constructor computes the advance ``adv`` as
    ``textwidth * fontsize * scaling``, builds the glyph box in text
    space (different for vertical and horizontal fonts), maps its two
    corners through ``matrix`` and stores the normalised result as the
    bounding box. ``size`` is the box width for vertical fonts and the
    box height otherwise.
    """

    def __init__(self, matrix, font, fontsize, scaling, rise,
                 text, textwidth, textdisp, ncs, graphicstate):
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # compute the boundary rectangle.
        if font.is_vertical():
            # vertical
            (vx, vy) = textdisp
            vx = fontsize * 0.5 if vx is None else vx * fontsize * .001
            vy = (1000 - vy) * fontsize * .001
            lower_left = (-vx, vy + rise + self.adv)
            upper_right = (-vx + fontsize, vy + rise)
        else:
            # horizontal
            descent = font.get_descent() * fontsize
            lower_left = (0, descent + rise)
            upper_right = (self.adv, descent + rise + fontsize)

        (a, b, c, d, e, f) = self.matrix
        self.upright = (0 < a*d*scaling and b*c <= 0)

        (x0, y0) = apply_matrix_pt(self.matrix, lower_left)
        (x1, y1) = apply_matrix_pt(self.matrix, upper_right)
        # Normalise so that (x0, y0) is the smaller corner.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self):
        parts = (self.__class__.__name__, bbox2str(self.bbox),
                 matrix2str(self.matrix), self.fontname, self.adv,
                 self.get_text())
        return '<%s %s matrix=%s font=%r adv=%s text=%r>' % parts

    def get_text(self):
        """Return the text given to the constructor."""
        return self._text

    def is_compatible(self, obj):
        """Returns True if two characters can coexist in the same line."""
        return True
|
||||
|
||||
|
||||
class LTContainer(LTComponent):
    """Object that can be extended and analyzed

    Child objects are kept in the list ``_objs`` in insertion order.
    """

    def __init__(self, bbox):
        LTComponent.__init__(self, bbox)
        self._objs = []

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def add(self, obj):
        """Append ``obj`` to the children."""
        self._objs.append(obj)

    def extend(self, objs):
        """Add every object of ``objs`` through ``self.add``."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams):
        """Run ``analyze`` on every child."""
        for child in self._objs:
            child.analyze(laparams)
|
||||
|
||||
|
||||
class LTExpandableContainer(LTContainer):
    """Container whose bounding box grows to enclose each added object.

    It starts from the inverted box (+INF, +INF, -INF, -INF).
    """

    def __init__(self):
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    def add(self, obj):
        LTContainer.add(self, obj)
        left = min(self.x0, obj.x0)
        bottom = min(self.y0, obj.y0)
        right = max(self.x1, obj.x1)
        top = max(self.y1, obj.y1)
        self.set_bbox((left, bottom, right, top))
|
||||
|
||||
|
||||
class LTTextContainer(LTExpandableContainer, LTText):
    """Expandable container whose text is the text of its children."""

    def __init__(self):
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self):
        """Concatenate the text of every child that is an LTText."""
        pieces = (child.get_text() for child in self
                  if isinstance(child, LTText))
        return ''.join(pieces)
|
||||
|
||||
|
||||
class LTTextLine(LTTextContainer):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    """

    def __init__(self, word_margin):
        LTTextContainer.__init__(self)
        self.word_margin = word_margin

    def __repr__(self):
        parts = (self.__class__.__name__, bbox2str(self.bbox),
                 self.get_text())
        return '<%s %s %r>' % parts

    def analyze(self, laparams):
        """Analyze the children, then append a newline annotation.

        The newline is added with ``LTContainer.add``, bypassing the
        ``add`` overrides of the subclasses and the bounding-box update.
        """
        LTTextContainer.analyze(self, laparams)
        LTContainer.add(self, LTAnno('\n'))

    def find_neighbors(self, plane, ratio):
        """Must be provided by the concrete line classes."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
    """Text line that tracks the right edge (``_x1``) of the last added
    object in order to insert word spaces."""

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        self._x1 = +INF

    def add(self, obj):
        """Add ``obj``, preceded by a space annotation when it is an
        LTChar starting more than the word margin after the previous
        object's right edge."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_limit:
                LTContainer.add(self, LTAnno(' '))
        self._x1 = obj.x1
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """
        Finds neighboring LTTextLineHorizontals in the plane.

        Searches ``plane`` in this line's box extended vertically by
        ``ratio * self.height`` and returns the LTTextLineHorizontal
        objects found there whose height is within that tolerance of
        this line's height and which are left-, right- or
        centrally-aligned with it within the same tolerance.
        """
        d = ratio * self.height
        candidates = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
        neighbors = []
        for candidate in candidates:
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (self._is_left_aligned_with(candidate, tolerance=d)
                    or self._is_right_aligned_with(candidate, tolerance=d)
                    or self._is_centrally_aligned_with(candidate,
                                                       tolerance=d)):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other, tolerance=0):
        """
        Whether the left-hand edge of `other` is within `tolerance`.
        """
        offset = abs(other.x0 - self.x0)
        return offset <= tolerance

    def _is_right_aligned_with(self, other, tolerance=0):
        """
        Whether the right-hand edge of `other` is within `tolerance`.
        """
        offset = abs(other.x1 - self.x1)
        return offset <= tolerance

    def _is_centrally_aligned_with(self, other, tolerance=0):
        """
        Whether the horizontal center of `other` is within `tolerance`.
        """
        offset = abs(
            (other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2)
        return offset <= tolerance

    def _is_same_height_as(self, other, tolerance):
        """Whether the heights differ by at most `tolerance`."""
        difference = abs(other.height - self.height)
        return difference <= tolerance
|
||||
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
    """Text line that tracks the lower edge (``_y0``) of the last added
    object in order to insert word spaces."""

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        self._y0 = -INF

    def add(self, obj):
        """Add ``obj``, preceded by a space annotation when it is an
        LTChar ending more than the word margin below the previous
        object's lower edge."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + gap_limit < self._y0:
                LTContainer.add(self, LTAnno(' '))
        self._y0 = obj.y0
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """
        Finds neighboring LTTextLineVerticals in the plane.

        Searches ``plane`` in this line's box extended horizontally by
        ``ratio * self.width`` and returns the LTTextLineVertical objects
        found there whose width is within that tolerance of this line's
        width and which are lower-, upper- or centrally-aligned with it
        within the same tolerance.
        """
        d = ratio * self.width
        candidates = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        neighbors = []
        for candidate in candidates:
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not self._is_same_width_as(candidate, tolerance=d):
                continue
            if (self._is_lower_aligned_with(candidate, tolerance=d)
                    or self._is_upper_aligned_with(candidate, tolerance=d)
                    or self._is_centrally_aligned_with(candidate,
                                                       tolerance=d)):
                neighbors.append(candidate)
        return neighbors

    def _is_lower_aligned_with(self, other, tolerance=0):
        """
        Whether the lower edge of `other` is within `tolerance`.
        """
        offset = abs(other.y0 - self.y0)
        return offset <= tolerance

    def _is_upper_aligned_with(self, other, tolerance=0):
        """
        Whether the upper edge of `other` is within `tolerance`.
        """
        offset = abs(other.y1 - self.y1)
        return offset <= tolerance

    def _is_centrally_aligned_with(self, other, tolerance=0):
        """
        Whether the vertical center of `other` is within `tolerance`.
        """
        offset = abs(
            (other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2)
        return offset <= tolerance

    def _is_same_width_as(self, other, tolerance):
        """Whether the widths differ by at most `tolerance`."""
        difference = abs(other.width - self.width)
        return difference <= tolerance
|
||||
|
||||
|
||||
class LTTextBox(LTTextContainer):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self):
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside.
        self.index = -1

    def __repr__(self):
        parts = (self.__class__.__name__,
                 self.index, bbox2str(self.bbox), self.get_text())
        return '<%s(%s) %s %r>' % parts
|
||||
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
    """Text box whose children are ordered by descending ``y1``."""

    def analyze(self, laparams):
        LTTextBox.analyze(self, laparams)

        def descending_top(obj):
            return -obj.y1

        self._objs.sort(key=descending_top)

    def get_writing_mode(self):
        return 'lr-tb'
|
||||
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
    """Text box whose children are ordered by descending ``x1``."""

    def analyze(self, laparams):
        """Run the base analysis, then order the children by descending
        ``x1``."""
        LTTextBox.analyze(self, laparams)

        def descending_right(child):
            return -child.x1

        self._objs.sort(key=descending_right)

    def get_writing_mode(self):
        """Return the writing-mode tag of this box, ``'tb-rl'``."""
        return 'tb-rl'
|
||||
|
||||
|
||||
class LTTextGroup(LTTextContainer):
    """Text container that is populated with ``objs`` at construction."""

    def __init__(self, objs):
        LTTextContainer.__init__(self)
        # Hand every given object to the container in one call.
        self.extend(objs)
|
||||
|
||||
|
||||
class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams):
        """Run the base analysis, then reorder the members from top-left to
        bottom-right, weighted by ``laparams.boxes_flow``."""
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            horizontal = (1 - laparams.boxes_flow) * member.x0
            vertical = (1 + laparams.boxes_flow) * (member.y0 + member.y1)
            return horizontal - vertical

        self._objs.sort(key=reading_order)
|
||||
|
||||
|
||||
class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams):
        """Run the base analysis, then reorder the members from top-right to
        bottom-left, weighted by ``laparams.boxes_flow``."""
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            horizontal = - (1 + laparams.boxes_flow) * (member.x0 + member.x1)
            vertical = (1 - laparams.boxes_flow) * member.y1
            return horizontal - vertical

        self._objs.sort(key=reading_order)
|
||||
|
||||
|
||||
class LTLayoutContainer(LTContainer):
    """Container that can run layout analysis over its children.

    ``analyze`` groups LTChar children into text lines, lines into text
    boxes and, unless ``laparams.boxes_flow`` is None, boxes into a
    hierarchy stored in ``self.groups``.
    """

    def __init__(self, bbox):
        LTContainer.__init__(self, bbox)
        # Result of group_textboxes(); stays None until analyze() builds it.
        self.groups = None
        return

    # group_objects: group text object to textlines.
    def group_objects(self, laparams, objs):
        """Yield text lines built from consecutive, aligned objects.

        :param laparams: layout parameters (``line_overlap``,
            ``char_margin``, ``word_margin`` and ``detect_vertical`` are
            read here).
        :param objs: iterable of text objects, in the order they should be
            considered.
        :return: generator of LTTextLineHorizontal / LTTextLineVertical
            objects. Nothing is yielded for an empty ``objs``.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = \
                    obj0.is_compatible(obj1) \
                    and obj0.is_voverlap(obj1) \
                    and min(obj0.height, obj1.height) * laparams.line_overlap \
                    < obj0.voverlap(obj1) \
                    and obj0.hdistance(obj1) \
                    < max(obj0.width, obj1.width) * laparams.char_margin

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = \
                    laparams.detect_vertical \
                    and obj0.is_compatible(obj1) \
                    and obj0.is_hoverlap(obj1) \
                    and min(obj0.width, obj1.width) * laparams.line_overlap \
                    < obj0.hoverlap(obj1) \
                    and obj0.vdistance(obj1) \
                    < max(obj0.height, obj1.height) * laparams.char_margin

                if ((halign and isinstance(line, LTTextLineHorizontal)) or
                        (valign and isinstance(line, LTTextLineVertical))):
                    # obj1 continues the line currently being built.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit: close the current line.
                    yield line
                    line = None
                else:
                    if valign and not halign:
                        line = LTTextLineVertical(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    elif halign and not valign:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    else:
                        # Ambiguous or unaligned: obj0 becomes a line of
                        # its own.
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        yield line
                        line = None
            obj0 = obj1
        if obj0 is None:
            # No input objects at all: there is nothing to wrap in a line
            # (previously ``None`` was added to a fresh line).
            return
        if line is None:
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line
        return

    def group_textlines(self, laparams, lines):
        """Group neighboring lines to textboxes.

        :param laparams: layout parameters (``line_margin`` is read here).
        :param lines: text lines; iterated twice, so pass a sequence
            rather than a one-shot iterator.
        :return: generator of non-empty LTTextBoxHorizontal /
            LTTextBoxVertical objects.
        """
        plane = Plane(self.bbox)
        plane.extend(lines)
        # Maps every line to the box that currently contains it.
        boxes = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # The neighbor already sits in a box: absorb that box.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box
        return

    def group_textboxes(self, laparams, boxes):
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. ``skip_isany`` is False for a
        fresh pair and True for a pair that was deferred because another
        object lies between its members; deferred pairs sort after all
        fresh ones and are merged without repeating that check. Note that
        since comparison operators, e.g., __lt__, are disabled for
        LTComponent, id(obj) has to appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped (indexable).
        :return: list of the objects left in the plane once every queued
            pair has been processed.
        """

        def dist(obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (x1 - x0) * (y1 - y0) \
                - obj1.width*obj1.height - obj2.width*obj2.height

        def isany(obj1, obj2):
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists = []
        for i in range(len(boxes)):
            obj1 = boxes[i]
            for j in range(i+1, len(boxes)):
                obj2 = boxes[j]
                dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
                              obj1, obj2))
        heapq.heapify(dists)

        plane = Plane(self.bbox)
        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                # Only a pair that has not been deferred yet is checked.
                # The former test (`skip_isany and ...`) could never fire,
                # because every entry starts out with skip_isany=False.
                if not skip_isany and isany(obj1, obj2):
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
                        isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
                    group = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(dists, (False, dist(group, other),
                                           id(group), id(other), group, other))
                plane.add(group)
        return list(plane)

    def analyze(self, laparams):
        """Run layout analysis and replace ``self._objs`` with the result.

        Non-character children are analyzed individually. Characters are
        grouped into lines and boxes; the boxes are ordered either by a
        simple position key (``boxes_flow`` is None) or by the index
        assigned while walking the hierarchy from ``group_textboxes``.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
                                       self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box):
                # Vertical boxes first (right to left), then horizontal
                # boxes (top to bottom, left to right).
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)
            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = textboxes + otherobjs + empties
        return
|
||||
|
||||
|
||||
class LTFigure(LTLayoutContainer):
    """An area used by a PDF Form object.

    PDF Forms can present figures or pictures by embedding yet another PDF
    document within a page, so LTFigure objects can appear recursively.
    """

    def __init__(self, name, bbox, matrix):
        self.name = name
        self.matrix = matrix
        # ``bbox`` arrives as (x, y, width, height); transform its four
        # corners and take the bounds of the transformed points.
        (left, bottom, width, height) = bbox
        right = left + width
        top = bottom + height
        corners = ((left, bottom), (right, bottom), (left, top), (right, top))
        bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in corners)
        LTLayoutContainer.__init__(self, bbox)

    def __repr__(self):
        fields = (self.__class__.__name__, self.name,
                  bbox2str(self.bbox), matrix2str(self.matrix))
        return '<%s(%s) %s matrix=%s>' % fields

    def analyze(self, laparams):
        """Run layout analysis only when ``laparams.all_texts`` is truthy."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
|
||||
|
||||
|
||||
class LTPage(LTLayoutContainer):
    """An entire page.

    Child objects may include LTTextBox, LTFigure, LTImage, LTRect, LTCurve
    and LTLine.
    """

    def __init__(self, pageid, bbox, rotate=0):
        LTLayoutContainer.__init__(self, bbox)
        self.rotate = rotate
        self.pageid = pageid

    def __repr__(self):
        fields = (self.__class__.__name__, self.pageid,
                  bbox2str(self.bbox), self.rotate)
        return '<%s(%r) %s rotate=%r>' % fields
|
||||
@@ -1,99 +0,0 @@
|
||||
from io import BytesIO
import logging


logger = logging.getLogger(__name__)


class CorruptDataError(Exception):
    """Raised by LZWDecoder.feed when a code cannot be decoded."""


class LZWDecoder:
    """Incremental LZW decoder reading variable-width codes from ``fp``.

    Code 256 resets the string table and code 257 produces no output; every
    other code refers to a table entry. The code width starts at 9 bits and
    grows to 10, 11 and 12 bits as the table fills up.
    """

    def __init__(self, fp):
        self.fp = fp
        self.buff = 0
        # bpos == 8 marks the one-byte buffer as fully consumed, so the
        # first readbits() call fetches a byte from ``fp``.
        self.bpos = 8
        self.nbits = 9
        # Both stay None until the first clear-table code (256) arrives.
        self.table = None
        self.prevbuf = None
        return

    def readbits(self, bits):
        """Return the next ``bits`` bits of the stream as an integer.

        :raises EOFError: when ``fp`` runs out before ``bits`` bits were
            read.
        """
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8-self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r)-1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Decode one code and return the bytes it stands for.

        :param code: integer code as returned by readbits().
        :return: decoded bytes (empty for codes 256 and 257).
        :raises CorruptDataError: when the code cannot be resolved, either
            because no clear-table code has been seen yet or because the
            code lies beyond the current table.
        """
        x = b''
        if code == 256:
            self.table = [bytes((c,)) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            # First data code after a reset. A stream that does not start
            # with code 256 has no table yet, and a code past the table end
            # has no entry; both are corrupt data, not programming errors.
            if self.table is None or code >= len(self.table):
                raise CorruptDataError
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf+x[:1])
            elif code == len(self.table):
                self.table.append(self.prevbuf+self.prevbuf[:1])
                x = self.table[code]
            else:
                raise CorruptDataError
            table_length = len(self.table)
            if table_length == 511:
                self.nbits = 10
            elif table_length == 1023:
                self.nbits = 11
            elif table_length == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Yield decoded chunks until the input ends or turns out corrupt."""
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x
            # Guarded and lazily formatted: rendering the table for every
            # code is costly, and the table may still be None here.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug('nbits=%d, code=%d, output=%r, table=%r',
                             self.nbits, code, x, (self.table or [])[258:])
        return


def lzwdecode(data):
    """Decode LZW-compressed ``data`` (bytes) and return the result as bytes.

    Decoding stops silently at the first corrupt code; whatever was decoded
    up to that point is returned.
    """
    fp = BytesIO(data)
    s = LZWDecoder(fp).run()
    return b''.join(s)
|
||||
@@ -1,35 +0,0 @@
|
||||
import collections
|
||||
from .psparser import LIT
|
||||
|
||||
|
||||
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||
|
||||
|
||||
class PDFColorSpace:
    """A colour space described by its name and its number of components."""

    def __init__(self, name, ncomponents):
        self.ncomponents = ncomponents
        self.name = name

    def __repr__(self):
        fields = (self.name, self.ncomponents)
        return '<PDFColorSpace: %s, ncomponents=%d>' % fields
|
||||
|
||||
|
||||
# Colour spaces available by name, mapped to their PDFColorSpace objects in
# the order listed here.
PREDEFINED_COLORSPACE = collections.OrderedDict()

for (name, n) in (
        ('DeviceGray', 1),  # default value first
        ('CalRGB', 3),
        ('CalGray', 1),
        ('Lab', 3),
        ('DeviceRGB', 3),
        ('DeviceCMYK', 4),
        ('Separation', 1),
        ('Indexed', 1),
        ('Pattern', 1),
):
    PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
|
||||
@@ -1,193 +0,0 @@
|
||||
from . import utils
|
||||
from .pdffont import PDFUnicodeNotDefined
|
||||
|
||||
|
||||
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    Every rendering hook of this base class is a no-op returning None.
    The object is a context manager that calls ``close()`` on exit.
    """

    def __init__(self, rsrcmgr):
        self.rsrcmgr = rsrcmgr
        # Current transformation matrix; unset until set_ctm() is called.
        self.ctm = None

    def __repr__(self):
        return '<PDFDevice>'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the block propagates.
        self.close()

    def close(self):
        """Release the device; nothing to do in the base class."""
        return None

    def set_ctm(self, ctm):
        """Store ``ctm`` as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag, props=None):
        """No-op hook."""
        return None

    def end_tag(self):
        """No-op hook."""
        return None

    def do_tag(self, tag, props=None):
        """No-op hook."""
        return None

    def begin_page(self, page, ctm):
        """No-op hook."""
        return None

    def end_page(self, page):
        """No-op hook."""
        return None

    def begin_figure(self, name, bbox, matrix):
        """No-op hook."""
        return None

    def end_figure(self, name):
        """No-op hook."""
        return None

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        """No-op hook."""
        return None

    def render_image(self, name, stream):
        """No-op hook."""
        return None

    def render_string(self, textstate, seq, ncs, graphicstate):
        """No-op hook."""
        return None
|
||||
|
||||
|
||||
class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence glyph by glyph.

    ``render_string`` derives the spacing values from the text state and
    hands each decoded character id to ``render_char``.
    """

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Render ``seq`` and store the resulting position back into
        ``textstate.linematrix``."""
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            # Word spacing is not applied for multibyte fonts.
            wordspace = 0
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            renderer = self.render_string_vertical
        else:
            renderer = self.render_string_horizontal
        textstate.linematrix = renderer(
            seq, matrix, textstate.linematrix, font, fontsize,
            scaling, charspace, wordspace, rise, dxscale, ncs,
            graphicstate)

    def render_string_horizontal(self, seq, matrix, pos,
                                 font, fontsize, scaling, charspace, wordspace,
                                 rise, dxscale, ncs, graphicstate):
        """Render ``seq`` advancing along x; return the final ``(x, y)``."""
        (x, y) = pos
        needcharspace = False
        for item in seq:
            if utils.isnumber(item):
                # Numeric items shift the position instead of drawing.
                x -= item*dxscale
                needcharspace = True
                continue
            for cid in font.decode(item):
                if needcharspace:
                    x += charspace
                glyph_matrix = utils.translate_matrix(matrix, (x, y))
                x += self.render_char(
                    glyph_matrix, font,
                    fontsize, scaling, rise, cid, ncs, graphicstate)
                if cid == 32 and wordspace:
                    x += wordspace
                needcharspace = True
        return (x, y)

    def render_string_vertical(self, seq, matrix, pos,
                               font, fontsize, scaling, charspace, wordspace,
                               rise, dxscale, ncs, graphicstate):
        """Render ``seq`` advancing along y; return the final ``(x, y)``."""
        (x, y) = pos
        needcharspace = False
        for item in seq:
            if utils.isnumber(item):
                # Numeric items shift the position instead of drawing.
                y -= item*dxscale
                needcharspace = True
                continue
            for cid in font.decode(item):
                if needcharspace:
                    y += charspace
                glyph_matrix = utils.translate_matrix(matrix, (x, y))
                y += self.render_char(
                    glyph_matrix, font, fontsize,
                    scaling, rise, cid, ncs, graphicstate)
                if cid == 32 and wordspace:
                    y += wordspace
                needcharspace = True
        return (x, y)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
                    graphicstate):
        """Return the amount added to the position for one character.

        The base implementation draws nothing and returns 0.
        """
        return 0
|
||||
|
||||
|
||||
class TagExtractor(PDFDevice):
    """Device that writes page, tag and text content to ``outfp``."""

    def __init__(self, rsrcmgr, outfp, codec='utf-8'):
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack = []
        return

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Write the text of ``seq`` to ``outfp``.

        Items that are neither ``str`` nor ``bytes`` are skipped.
        Characters without a Unicode mapping are left out of the output and
        the decoded character ids are reported on stderr.
        """
        # Function-scope import: this module has no top-level ``sys``.
        import sys

        font = textstate.font
        pieces = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            chars = font.decode(obj)
            for cid in chars:
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Diagnostic only. It used to go to stdout, where it
                    # could mix with the extracted output when ``outfp``
                    # is stdout as well.
                    print(chars, file=sys.stderr)
        self.outfp.write(utils.enc(''.join(pieces)))
        return

    def begin_page(self, page, ctm):
        """Write the opening ``<page ...>`` element for ``page``."""
        output = '<page id="%s" bbox="%s" rotate="%d">' %\
            (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
        self.outfp.write(utils.make_compat_bytes(output))
        return

    def end_page(self, page):
        """Write the closing ``</page>`` element and advance the page
        counter."""
        self.outfp.write(utils.make_compat_bytes('</page>\n'))
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening element for ``tag`` and push it on the stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted key order.
        """
        s = ''
        if isinstance(props, dict):
            s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v)))
                        for (k, v) in sorted(props.items()))
        out_s = '<{}{}>'.format(utils.enc(tag.name), s)
        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
        return

    def end_tag(self):
        """Pop the innermost open tag and write its closing element."""
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = '</%s>' % utils.enc(tag.name)
        self.outfp.write(utils.make_compat_bytes(out_s))
        return

    def do_tag(self, tag, props=None):
        """Write an opening element for ``tag`` and drop it from the stack
        again; no closing element is written."""
        self.begin_tag(tag, props)
        self._stack.pop(-1)
        return
|
||||
@@ -1,831 +0,0 @@
|
||||
import logging
|
||||
import re
|
||||
import struct
|
||||
from hashlib import sha256, md5
|
||||
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||
|
||||
from . import settings
|
||||
from .arcfour import Arcfour
|
||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
||||
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||
dict_value, stream_value
|
||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||
from .utils import choplist, nunpack, decode_text
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFNoValidXRef(PDFSyntaxError):
    """Raised by the xref loaders in this module when a cross-reference
    table or stream cannot be parsed."""


class PDFNoValidXRefWarning(SyntaxWarning):
    """Warning category; not issued within this part of the module."""


class PDFNoOutlines(PDFException):
    """Exception type; not raised within this part of the module."""


class PDFDestinationNotFound(PDFException):
    """Exception type; not raised within this part of the module."""


class PDFEncryptionError(PDFException):
    """Raised by the security handlers for unsupported or inconsistent
    encryption parameters."""


class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised by PDFStandardSecurityHandler.init_key when no key can be
    derived from the given password."""


class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Warning category; not issued within this part of the module."""


class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Exception type; not raised within this part of the module."""
|
||||
|
||||
|
||||
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
    """Deprecated alias of PDFTextExtractionNotAllowed.

    Creating an instance emits a DeprecationWarning.
    """

    def __init__(self, *args):
        from warnings import warn
        # stacklevel=2 attributes the warning to the code that creates the
        # exception rather than to this constructor, so the report points at
        # the line that needs changing.
        warn('PDFTextExtractionNotAllowedError will be removed in the future. '
             'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning,
             stacklevel=2)
        super().__init__(*args)
|
||||
|
||||
|
||||
# some predefined literals and keywords.
# Built once through LIT() so the loaders can compare a stream's 'Type'
# entry against them with `is` (presumably LIT returns a shared object per
# name; confirm in psparser).
LITERAL_OBJSTM = LIT('ObjStm')  # checked in PDFXRefFallback.load
LITERAL_XREF = LIT('XRef')  # checked in PDFXRefStream.load
LITERAL_CATALOG = LIT('Catalog')
|
||||
|
||||
|
||||
class PDFBaseXRef:
    """Interface shared by the cross-reference implementations."""

    def get_trailer(self):
        """Return the trailer; not implemented in the base class."""
        raise NotImplementedError

    def get_objids(self):
        """Return the known object ids; the base class knows none."""
        return list()

    # Must return
    #     (strmid, index, genno)
    #  or (None, pos, genno)
    def get_pos(self, objid):
        """Locate ``objid``; the base class always raises ``KeyError``."""
        raise KeyError(objid)
|
||||
|
||||
|
||||
class PDFXRef(PDFBaseXRef):
    """Cross-reference information read from a plain-text xref table."""

    def __init__(self):
        # objid -> (None, file position, generation number)
        self.offsets = {}
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser):
        """Read xref subsections from ``parser`` up to the trailer.

        Each subsection starts with a ``start count`` line followed by
        ``count`` entries of the form ``position generation flag``; only
        entries flagged ``n`` are recorded.

        :raises PDFNoValidXRef: on premature end of input or on any
            malformed subsection header or entry.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            # Blank lines are skipped. This also covers an empty ``line``,
            # so no separate emptiness check is needed afterwards.
            if not line.strip():
                continue
            if line.startswith(b'trailer'):
                parser.seek(pos)
                break
            f = line.strip().split(b' ')
            if len(f) != 2:
                error_msg = 'Trailer not found: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, f)
            except ValueError:
                error_msg = 'Invalid line: {!r}: line={!r}'\
                    .format(parser, line)
                raise PDFNoValidXRef(error_msg)
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(b' ')
                if len(f) != 3:
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                (offset, genno, use) = f
                if use != b'n':
                    continue
                try:
                    entry = (None, int(offset), int(genno))
                except ValueError:
                    # Non-numeric fields are a malformed table too; report
                    # them like every other format error instead of leaking
                    # a bare ValueError.
                    error_msg = 'Invalid XRef format: {!r}, line={!r}'\
                        .format(parser, line)
                    raise PDFNoValidXRef(error_msg)
                self.offsets[objid] = entry
        log.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)
        return

    def load_trailer(self, parser):
        """Read the trailer dictionary and merge it into ``self.trailer``.

        :raises PDFNoValidXRef: when the input ends and the parser holds no
            object to fall back on.
        """
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b'trailer'), str(kwd)
            (_, dic) = parser.nextobject()
        except PSEOF:
            # Input ended early: use the last object the parser still holds.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        log.debug('trailer=%r', self.trailer)
        return

    def get_trailer(self):
        """Return the trailer dictionary."""
        return self.trailer

    def get_objids(self):
        """Return the ids of all recorded objects."""
        return self.offsets.keys()

    def get_pos(self, objid):
        """Return ``(None, position, genno)`` for ``objid``.

        :raises KeyError: when ``objid`` is not recorded.
        """
        return self.offsets[objid]
|
||||
|
||||
|
||||
class PDFXRefFallback(PDFXRef):
    """Cross-reference information rebuilt by scanning the input for
    ``<objid> <genno> obj`` lines."""

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan ``parser`` from position 0 and record every object found.

        Scanning stops at the end of input or after the first line that
        starts with ``trailer`` (whose dictionary is loaded). Objects that
        are ObjStm streams additionally contribute the objects they
        contain.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                log.info('trailer: %r', self.trailer)
                break
            text = line.decode('latin-1')  # default pdf encoding
            cue = self.PDFOBJ_CUE.match(text)
            if not cue:
                continue
            objid = int(cue.group(1))
            genno = int(cue.group(2))
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            is_objstm = isinstance(obj, PDFStream) \
                and obj.get('Type') is LITERAL_OBJSTM
            if not is_objstm:
                continue
            stream = stream_value(obj)
            try:
                n = stream['N']
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            parser1 = PDFStreamParser(stream.get_data())
            contained = []
            try:
                while True:
                    (_, item) = parser1.nextobject()
                    contained.append(item)
            except PSEOF:
                pass
            # Never index past the objects actually parsed; the id of
            # entry ``index`` is read from position ``index*2``.
            n = min(n, len(contained)//2)
            for index in range(n):
                objid1 = contained[index*2]
                self.offsets[objid1] = (objid, index, 0)
|
||||
|
||||
|
||||
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference information read from an XRef stream.

    The stream data is a sequence of fixed-length entries of three
    big-endian fields whose widths come from the ``W`` array. ``ranges``
    holds the ``(start, count)`` pairs of the ``Index`` array; the entries
    of all ranges follow each other in the data without gaps.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Defined up front so get_trailer() works before load(), matching
        # PDFXRef.
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser):
        """Read the XRef stream object at the parser's current position.

        :raises PDFNoValidXRef: when the object is not a stream whose
            ``Type`` is ``XRef``.
        :raises PDFSyntaxError: when ``Index`` has an odd number of items.
        """
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        (_, stream) = parser.nextobject()
        # .get() keeps a stream without a 'Type' entry on the
        # PDFNoValidXRef path instead of raising KeyError.
        if not isinstance(stream, PDFStream) \
                or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        log.info('xref stream: objid=%s, fields=%d,%d,%d',
                 ', '.join(map(repr, self.ranges)),
                 self.fl1, self.fl2, self.fl3)
        return

    def get_trailer(self):
        """Return the stream's attribute dictionary, used as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield the ids of all entries whose type field is 1 or 2."""
        # Entries of successive ranges are stored back to back, so the
        # position in ``data`` has to keep counting across ranges, exactly
        # as get_pos() does.
        base = 0
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (base + i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            base += nobjs
        return

    def get_pos(self, objid):
        """Return the location of ``objid``.

        :return: ``(None, f2, f3)`` for a type-1 entry and ``(f2, f3, 0)``
            for a type-2 entry, where ``f2`` and ``f3`` are the second and
            third fields of the entry.
        :raises KeyError: when ``objid`` is outside every range or its
            entry has another type.
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandler:
    """Security handler for revisions 2 and 3 (MD5 key derivation, RC4).

    The "Algorithm 3.x" comments and the trailing step numbers are the
    references carried by the code itself.
    """

    PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
                        b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
    supported_revisions = (2, 3)

    def __init__(self, docid, param, password=''):
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self):
        """Read the parameters, check the revision and derive the key.

        :raises PDFEncryptionError: when the revision is not supported.
        """
        self.init_params()
        if self.r not in self.supported_revisions:
            error_msg = 'Unsupported revision: param=%r' % self.param
            raise PDFEncryptionError(error_msg)
        self.init_key()

    def init_params(self):
        """Copy the entries of the encryption dictionary onto ``self``."""
        params = self.param
        self.v = int_value(params.get('V', 0))
        self.r = int_value(params['R'])
        self.p = uint_value(params['P'], 32)
        self.o = str_value(params['O'])
        self.u = str_value(params['U'])
        self.length = int_value(params.get('Length', 40))

    def init_key(self):
        """Derive ``self.key`` from ``self.password``.

        :raises PDFPasswordIncorrect: when the password matches neither
            the user nor the owner password.
        """
        self.key = self.authenticate(self.password)
        if self.key is None:
            raise PDFPasswordIncorrect

    def is_printable(self):
        """Return whether bit value 4 of the permission flags is set."""
        return bool(self.p & 4)

    def is_modifiable(self):
        """Return whether bit value 8 of the permission flags is set."""
        return bool(self.p & 8)

    def is_extractable(self):
        """Return whether bit value 16 of the permission flags is set."""
        return bool(self.p & 16)

    def compute_u(self, key):
        """Compute the value that is compared with the ``U`` entry."""
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        # Algorithm 3.5
        digest = md5(self.PASSWORD_PADDING)  # 2
        digest.update(self.docid[0])  # 3
        result = Arcfour(key).encrypt(digest.digest())  # 4
        for i in range(1, 20):  # 5
            # Key for this round: every key byte XORed with the counter.
            round_key = b''.join(bytes((c ^ i,)) for c in iter(key))
            result = Arcfour(round_key).encrypt(result)
        result += result  # 6
        return result

    def compute_encryption_key(self, password):
        """Derive the encryption key from ``password`` (bytes)."""
        # Algorithm 3.2
        password = (password + self.PASSWORD_PADDING)[:32]  # 1
        digest = md5(password)  # 2
        digest.update(self.o)  # 3
        # See https://github.com/pdfminer/pdfminer.six/issues/186
        digest.update(struct.pack('<L', self.p))  # 4
        digest.update(self.docid[0])  # 5
        if self.r >= 4 and not self.encrypt_metadata:
            digest.update(b'\xff\xff\xff\xff')
        result = digest.digest()
        n = 5
        if self.r >= 3:
            n = self.length // 8
            for _ in range(50):
                result = md5(result[:n]).digest()
        return result[:n]

    def authenticate(self, password):
        """Return the key for ``password`` (str), or None when it matches
        neither the user nor the owner password."""
        encoded = password.encode("latin1")
        key = self.authenticate_user_password(encoded)
        if key is None:
            key = self.authenticate_owner_password(encoded)
        return key

    def authenticate_user_password(self, password):
        """Return the key derived from ``password`` if it verifies, else
        None."""
        key = self.compute_encryption_key(password)
        if not self.verify_encryption_key(key):
            return None
        return key

    def verify_encryption_key(self, key):
        """Check ``key`` against the stored ``U`` entry."""
        # Algorithm 3.6
        computed = self.compute_u(key)
        if self.r == 2:
            return computed == self.u
        # Other revisions compare the first 16 bytes only.
        return computed[:16] == self.u[:16]

    def authenticate_owner_password(self, password):
        """Recover the user password from the ``O`` entry using
        ``password`` as owner password, then authenticate with it."""
        # Algorithm 3.7
        password = (password + self.PASSWORD_PADDING)[:32]
        digest = md5(password)
        if self.r >= 3:
            for _ in range(50):
                digest = md5(digest.digest())
        n = 5
        if self.r >= 3:
            n = self.length // 8
        key = digest.digest()[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                round_key = b''.join(bytes((c ^ i,)) for c in iter(key))
                user_password = Arcfour(round_key).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(self, objid, genno, data, attrs=None):
        """Decrypt ``data`` of object ``objid``/``genno`` with RC4."""
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid, genno, data):
        """Decrypt ``data`` with RC4 under the per-object key."""
        # Per-object key material: document key followed by the low 3
        # bytes of the object id and the low 2 bytes of the generation.
        material = self.key + struct.pack('<L', objid)[:3] \
            + struct.pack('<L', genno)[:2]
        object_key = md5(material).digest()[:min(len(material), 16)]
        return Arcfour(object_key).decrypt(data)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Security handler for revision 4: named crypt filters (RC4 or AES-128)."""

    supported_revisions = (4,)

    def init_params(self):
        """Read the crypt-filter parameters and build ``self.cfm``.

        ``self.cfm`` maps each crypt filter name to the bound method
        that decrypts with it; 'Identity' is always present.

        :raises PDFEncryptionError: when the stream and string filters
            differ, a filter uses an unknown method, or the string
            filter is not defined
        """
        super().init_params()
        self.length = 128
        self.cf = dict_value(self.param.get('CF'))
        self.stmf = literal_name(self.param['StmF'])
        self.strf = literal_name(self.param['StrF'])
        self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
        if self.stmf != self.strf:
            raise PDFEncryptionError(
                'Unsupported crypt filter: param=%r' % self.param)
        self.cfm = {}
        for filter_name, filter_spec in self.cf.items():
            method = self.get_cfm(literal_name(filter_spec['CFM']))
            if method is None:
                raise PDFEncryptionError(
                    'Unknown crypt filter method: param=%r' % self.param)
            self.cfm[filter_name] = method
        self.cfm['Identity'] = self.decrypt_identity
        if self.strf not in self.cfm:
            raise PDFEncryptionError(
                'Undefined crypt filter: param=%r' % self.param)
        return

    def get_cfm(self, name):
        """Return the decrypt method for crypt filter method ``name``, or None."""
        if name == 'V2':
            return self.decrypt_rc4
        if name == 'AESV2':
            return self.decrypt_aes128
        return None

    def decrypt(self, objid, genno, data, attrs=None, name=None):
        """Decrypt ``data`` with crypt filter ``name`` (default: string filter).

        When metadata encryption is disabled and ``attrs`` marks the
        object as /Type /Metadata, ``data`` is returned unchanged.
        """
        if attrs is not None and not self.encrypt_metadata:
            obj_type = attrs.get('Type')
            if obj_type is not None and literal_name(obj_type) == 'Metadata':
                return data
        filter_name = self.strf if name is None else name
        return self.cfm[filter_name](objid, genno, data)

    def decrypt_identity(self, objid, genno, data):
        """Identity filter: return ``data`` untouched."""
        return data

    def decrypt_aes128(self, objid, genno, data):
        """AES-128-CBC decrypt ``data`` for object (objid, genno).

        The key is derived like the RC4 per-object key with the extra
        b'sAlT' suffix; the first 16 bytes of ``data`` are the IV.
        """
        seed = self.key + struct.pack('<L', objid)[:3]
        seed = seed + struct.pack('<L', genno)[:2] + b'sAlT'
        aes_key = md5(seed).digest()[:min(len(seed), 16)]
        iv, body = data[:16], data[16:]
        cipher = Cipher(algorithms.AES(aes_key),
                        modes.CBC(iv),
                        backend=default_backend())
        return cipher.decryptor().update(body)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
    """Security handler for revision 5: SHA-256 password check, AES-256."""

    supported_revisions = (5,)

    def init_params(self):
        """Read /OE and /UE and split /O and /U into hash and salts.

        Each of ``self.o`` and ``self.u`` is sliced as: bytes 0-31 hash,
        bytes 32-39 validation salt, bytes 40 onward key salt.
        """
        super().init_params()
        self.length = 256
        self.oe = str_value(self.param['OE'])
        self.ue = str_value(self.param['UE'])
        owner, user = self.o, self.u
        self.o_hash = owner[:32]
        self.o_validation_salt = owner[32:40]
        self.o_key_salt = owner[40:]
        self.u_hash = user[:32]
        self.u_validation_salt = user[32:40]
        self.u_key_salt = user[40:]
        return

    def get_cfm(self, name):
        """Return the AES-256 decrypt method for 'AESV3', otherwise None."""
        return self.decrypt_aes256 if name == 'AESV3' else None

    def authenticate(self, password):
        """Recover the file key from ``password``, owner check first.

        The password is UTF-8 encoded and cut to 127 bytes. The owner
        hashes additionally cover ``self.u``; the user hashes do not.

        :returns: the AES-CBC (zero IV) decryption of /OE or /UE, or
            None when the password matches neither hash
        """
        password = password.encode('utf-8')[:127]

        def salted_digest(salt, include_u):
            # SHA-256 over password + salt (+ the full U string).
            hasher = sha256(password)
            hasher.update(salt)
            if include_u:
                hasher.update(self.u)
            return hasher.digest()

        def unwrap(aes_key, wrapped):
            cipher = Cipher(algorithms.AES(aes_key),
                            modes.CBC(b'\0' * 16),
                            backend=default_backend())
            return cipher.decryptor().update(wrapped)

        if salted_digest(self.o_validation_salt, True) == self.o_hash:
            return unwrap(salted_digest(self.o_key_salt, True), self.oe)
        if salted_digest(self.u_validation_salt, False) == self.u_hash:
            return unwrap(salted_digest(self.u_key_salt, False), self.ue)
        return None

    def decrypt_aes256(self, objid, genno, data):
        """AES-CBC decrypt ``data`` with ``self.key``; first 16 bytes are the IV.

        ``objid`` and ``genno`` are accepted but not used.
        """
        iv, body = data[:16], data[16:]
        cipher = Cipher(algorithms.AES(self.key),
                        modes.CBC(iv),
                        backend=default_backend())
        return cipher.decryptor().update(body)
|
||||
|
||||
|
||||
class PDFDocument:
    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    # Maps the /V entry of the encryption dictionary to its handler class.
    security_handler_registry = {
        1: PDFStandardSecurityHandler,
        2: PDFStandardSecurityHandler,
        4: PDFStandardSecurityHandlerV4,
        5: PDFStandardSecurityHandlerV5,
    }

    def __init__(self, parser, password='', caching=True, fallback=True):
        """Set the document to use a given PDFParser object.

        :param parser: parser positioned on the PDF data; it is bound to
            this document via ``set_document``
        :param password: password used if the trailer has /Encrypt
        :param caching: keep parsed objects and object streams in memory
        :param fallback: also load a PDFXRefFallback table
        :raises PDFSyntaxError: when no trailer contains /Root, or (in
            strict mode) the catalog has the wrong /Type
        """
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._parser = None
        self._cached_objs = {}
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            pass  # fallback = True
        if fallback:
            parser.fallback = True
            xref = PDFXRefFallback()
            xref.load(parser)
            self.xrefs.append(xref)
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                if 'ID' in trailer:
                    docid = list_value(trailer['ID'])
                else:
                    # Some encrypted files carry no /ID; substitute two
                    # empty byte strings instead of failing with KeyError.
                    docid = (b'', b'')
                self.encryption = (docid, dict_value(trailer['Encrypt']))
                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if settings.STRICT:
                raise PDFSyntaxError('Catalog not found!')
        return

    KEYWORD_OBJ = KWD(b'obj')

    # _initialize_password(password=b'')
    #   Perform the initialization with a given password.
    def _initialize_password(self, password=''):
        """Create the security handler for ``self.encryption``.

        Installs ``self.decipher`` and the permission flags, and turns
        off the parser's fallback mode.

        :raises PDFEncryptionError: for a non-'Standard' filter or an
            unregistered /V value
        """
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        v = int_value(param.get('V', 0))
        factory = self.security_handler_registry.get(v)
        if factory is None:
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        handler = factory(docid, param, password)
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable()
        self.is_modifiable = handler.is_modifiable()
        self.is_extractable = handler.is_extractable()
        self._parser.fallback = False  # need to read streams with exact length
        return

    def _getobj_objstm(self, stream, index, objid):
        """Return object number ``index`` stored inside object stream ``stream``.

        :raises PDFSyntaxError: when ``index`` is beyond the parsed objects
        """
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                self._parsed_objs[stream.objid] = (objs, n)
        # The parsed list starts with n (objid, offset) pairs, i.e. 2*n
        # integers, followed by the objects themselves.
        i = n*2+index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError('index too big: %r' % index)
        return obj

    def _get_objects(self, stream):
        """Parse every object in ``stream``; return ``(objects, N)``.

        ``N`` is the stream's /N entry, or 0 when it is missing and
        strict mode is off.
        """
        if stream.get('Type') is not LITERAL_OBJSTM:
            if settings.STRICT:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream['N']
        except KeyError:
            if settings.STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs = []
        try:
            while 1:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            pass
        return (objs, n)

    def _getobj_parse(self, pos, objid):
        """Parse the indirect object expected at file offset ``pos``.

        :raises PDFSyntaxError: on an object-id mismatch or when the
            'obj' keyword is missing
        """
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        # hack around malformed pdf files
        # copied from https://github.com/jaepil/pdfminer3k/blob/master/
        # pdfminer/pdfparser.py#L399
        # to solve https://github.com/pdfminer/pdfminer.six/issues/56
        # assert objid1 == objid, str((objid1, objid))
        if objid1 != objid:
            x = []
            # Skip ahead to the next 'obj' keyword and take the token two
            # positions before it as the object id.
            while kwd is not self.KEYWORD_OBJ:
                (_, kwd) = self._parser.nexttoken()
                x.append(kwd)
            if len(x) >= 2:
                objid1 = x[-2]
        # #### end hack around malformed pdf files
        if objid1 != objid:
            raise PDFSyntaxError('objid mismatch: {!r}={!r}'
                                 .format(objid1, objid))

        if kwd != KWD(b'obj'):
            raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid):
        """Get object from PDF

        :raises PDFException if PDFDocument is not initialized
        :raises PDFObjectNotFound if objid does not exist in PDF
        """
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        log.debug('getobj: objid=%r', objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        # Object lives inside an object stream.
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        obj = self._getobj_parse(index, objid)
                        if self.decipher:
                            obj = decipher_all(self.decipher, objid, genno,
                                               obj)

                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    # Try the next xref table.
                    continue
            else:
                raise PDFObjectNotFound(objid)
            log.debug('register: objid=%r: %r', objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        return obj

    def get_outlines(self):
        """Return a generator of ``(level, title, dest, action, se)`` tuples.

        :raises PDFNoOutlines: when the catalog has no /Outlines entry
        """
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines

        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = decode_text(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            if 'First' in entry and 'Last' in entry:
                yield from search(entry['First'], level+1)
            if 'Next' in entry:
                yield from search(entry['Next'], level)
            return
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up ``key`` in the name tree ``cat`` of the catalog's /Names.

        :raises KeyError: when /Names, the category or the key is missing
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d):
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                if key < k1 or k2 < key:
                    return None
            if 'Names' in d:
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v:
                        return v
            raise KeyError((cat, key))
        return lookup(d0)

    def get_dest(self, name):
        """Resolve the named destination ``name``.

        :raises PDFDestinationNotFound: when neither the name tree nor
            the catalog's /Dests dictionary has it
        """
        try:
            # PDF-1.2 or later
            obj = self.lookup_name('Dests', name)
        except KeyError:
            # PDF-1.1 or prior
            if 'Dests' not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog['Dests'])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser):
        """Internal function used to locate the first XRef.

        :returns: the non-negative offset found on the line after
            'startxref'
        :raises PDFNoValidXRef: when 'startxref' is absent or the offset
            after it is missing, non-numeric or negative
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in parser.revreadlines():
            line = line.strip()
            log.debug('find_xref: %r', line)
            if line == b'startxref':
                break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('xref found: pos=%r', prev)
        # prev is None when nothing follows 'startxref' and may be
        # arbitrary bytes in a damaged file; report both as an invalid
        # xref so callers take the same path as for a missing table.
        try:
            start = int(prev)
        except (TypeError, ValueError):
            raise PDFNoValidXRef('Invalid xref position: %r' % (prev,))
        if start < 0:
            raise PDFNoValidXRef('Invalid xref position: %r' % (prev,))
        return start

    # read xref table
    def read_xref_from(self, parser, start, xrefs):
        """Reads XRefs from the given location.

        Appends the table found at ``start`` to ``xrefs`` and then
        follows its /XRefStm and /Prev trailer entries recursively.

        :raises PDFNoValidXRef: when nothing can be read at ``start``
        """
        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        log.info('read_xref_from: start=%d, token=%r', start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref = PDFXRefStream()
            xref.load(parser)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        log.info('trailer: %r', trailer)
        if 'XRefStm' in trailer:
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(parser, pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)
        return
|
||||
@@ -1,801 +0,0 @@
|
||||
import logging
|
||||
import struct
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
from . import settings
|
||||
from .cmapdb import CMap
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMapParser
|
||||
from .cmapdb import FileUnicodeMap
|
||||
from .encodingdb import EncodingDB
|
||||
from .encodingdb import name2unicode
|
||||
from .fontmetrics import FONT_METRICS
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import num_value
|
||||
from .pdftypes import resolve1, resolve_all
|
||||
from .pdftypes import stream_value
|
||||
from .psparser import KWD
|
||||
from .psparser import LIT
|
||||
from .psparser import PSEOF
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import literal_name
|
||||
from .utils import apply_matrix_norm
|
||||
from .utils import choplist
|
||||
from .utils import isnumber
|
||||
from .utils import nunpack
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_widths(seq):
    """Expand a width array into a ``{char code: width}`` dict.

    Two layouts are recognised in ``seq``:
      * ``c [w1 w2 ...]``   - consecutive widths starting at code ``c``
      * ``cfirst clast w``  - one width for the whole inclusive range

    Items that are neither lists nor numbers are ignored.
    """
    widths = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                # The last number seen is the first character code.
                start = pending[-1]
                for (offset, width) in enumerate(item):
                    widths[start+offset] = width
                pending = []
            continue
        if not isnumber(item):
            continue
        pending.append(item)
        if len(pending) == 3:
            (first, last, width) = pending
            for code in range(first, last+1):
                widths[code] = width
            pending = []
    return widths
|
||||
|
||||
|
||||
def get_widths2(seq):
    """Expand a width array with vectors into ``{code: (w, (vx, vy))}``.

    Two layouts are recognised in ``seq``:
      * ``c [w1 vx1 vy1 w2 vx2 vy2 ...]`` - triples starting at code ``c``
      * ``cfirst clast w vx vy``          - one entry for the whole range

    Items that are neither lists nor numbers are ignored.
    """
    widths = {}
    pending = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                # The last number seen is the first character code.
                start = pending[-1]
                for (offset, (w, vx, vy)) in enumerate(choplist(3, item)):
                    widths[start+offset] = (w, (vx, vy))
                pending = []
            continue
        if not isnumber(item):
            continue
        pending.append(item)
        if len(pending) == 5:
            (first, last, w, vx, vy) = pending
            for code in range(first, last+1):
                widths[code] = (w, (vx, vy))
            pending = []
    return widths
|
||||
|
||||
|
||||
class FontMetricsDB:
    """Lookup facade over the module-level FONT_METRICS table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the FONT_METRICS entry for ``fontname``.

        A missing name propagates the KeyError from the table lookup.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
|
||||
|
||||
|
||||
class Type1FontHeaderParser(PSStackParser):
    """Extracts the encoding vector from the header of a Type 1 font."""

    KEYWORD_BEGIN = KWD(b'begin')
    KEYWORD_END = KWD(b'end')
    KEYWORD_DEF = KWD(b'def')
    KEYWORD_PUT = KWD(b'put')
    KEYWORD_DICT = KWD(b'dict')
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')

    def __init__(self, data):
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(): character id -> unicode text.
        self._cid2unicode = {}
        return

    def get_encoding(self):
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names.
        A name is either a standard Adobe glyph name or the name of a
        custom CharString of this font (a sequence of drawing
        operations). Names that ``name2unicode`` rejects with KeyError
        are logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                return mapping
            try:
                mapping[cid] = name2unicode(name)
            except KeyError as e:
                log.debug(str(e))

    def do_keyword(self, pos, token):
        """On 'put', record an ``(int code, glyph name)`` pair as a result."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
        return
|
||||
|
||||
|
||||
# Text for each 4-bit nibble of a packed real number as read by getdict():
# indexes 0-9 are digits, 10 is '.', 11 is 'e', 12 is 'e-', 13 has no text
# (None) and 14 is '-'. Nibble 15 ends the number and has no entry.
NIBBLES = (
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '.', 'e', 'e-', None, '-',
)

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    'DLIdent-H': 'Identity-H',
    'DLIdent-V': 'Identity-V',
}
|
||||
|
||||
|
||||
def getdict(data):
    """Decode DICT bytes into ``{operator byte: [operands]}``.

    Operands accumulate on a stack; any byte <= 21 is taken as an
    operator that stores the current stack under its value and starts a
    new one. Operands still on the stack at the end are dropped.

    :param data: bytes of the DICT
    :returns: dict mapping operator byte values to operand lists
    """
    result = {}
    stream = BytesIO(data)
    operands = []
    while True:
        lead = stream.read(1)
        if not lead:
            return result
        b0 = ord(lead)
        if b0 <= 21:
            # Operator: flush the collected operands.
            result[b0] = operands
            operands = []
            continue
        if b0 == 30:
            # Real number packed two nibbles per byte; nibble 15 ends it
            # (the rest of the current byte is still consumed).
            text = ''
            more = True
            while more:
                packed = ord(stream.read(1))
                for nibble in (packed >> 4, packed & 15):
                    if nibble == 15:
                        more = False
                    else:
                        text += NIBBLES[nibble]
            value = float(text)
        elif 32 <= b0 <= 246:
            value = b0-139
        else:
            b1 = ord(stream.read(1))
            if 247 <= b0 <= 250:
                value = ((b0-247) << 8)+b1+108
            elif 251 <= b0 <= 254:
                value = -((b0-251) << 8)-b1-108
            else:
                b2 = ord(stream.read(1))
                if 128 <= b1:
                    # Most significant byte carries the sign.
                    b1 -= 256
                if b0 == 28:
                    value = b1 << 8 | b2
                else:
                    low = struct.unpack('>H', stream.read(2))[0]
                    value = b1 << 24 | b2 << 16 | low
        operands.append(value)
|
||||
|
||||
|
||||
class CFFFont:
    """Reader for a CFF font program: INDEX structures, encoding and charset."""

    # Predefined strings; string id ``n`` below len(STANDARD_STRINGS)
    # resolves to STANDARD_STRINGS[n] (see getstr).
    STANDARD_STRINGS = (
        '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
        'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
        'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
        'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
        'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
        'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
        'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
        'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
        'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
        'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
        'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
        'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
        'quotesinglbase', 'quotedblbase', 'quotedblright',
        'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
        'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
        'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
        'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
        'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
        'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
        'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
        'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
        'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
        'multiply', 'threesuperior', 'copyright', 'Aacute',
        'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
        'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
        'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
        'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
        'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
        'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
        'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
        'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
        'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
        'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
        'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
        'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
        'dollarsuperior', 'ampersandsmall', 'Acutesmall',
        'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
        'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
        'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
        'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
        'commasuperior', 'threequartersemdash', 'periodsuperior',
        'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
        'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
        'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
        'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
        'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
        'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
        'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
        'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
        'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
        'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
        'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
        'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
        'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
        'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
        'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
        'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
        'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
        'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
        'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
        'seveninferior', 'eightinferior', 'nineinferior',
        'centinferior', 'dollarinferior', 'periodinferior',
        'commainferior', 'Agravesmall', 'Aacutesmall',
        'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
        'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
        'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
        'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
        'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
        'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
        'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
        'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
        'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
        '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
        'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

    class INDEX:
        """One INDEX structure: an offset table followed by packed items."""

        def __init__(self, fp):
            """Read the offset table at ``fp`` and skip past the item data."""
            self.fp = fp
            (count, offsize) = struct.unpack('>HB', self.fp.read(3))
            self.offsets = [nunpack(self.fp.read(offsize))
                            for _ in range(count+1)]
            # Stored offsets are applied relative to the byte just before
            # the item data, hence the -1.
            self.base = self.fp.tell()-1
            self.fp.seek(self.base+self.offsets[-1])
            return

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            return len(self.offsets)-1

        def __getitem__(self, i):
            begin = self.offsets[i]
            self.fp.seek(self.base+begin)
            return self.fp.read(self.offsets[i+1]-begin)

        def __iter__(self):
            return (self[i] for i in range(len(self)))

    def __init__(self, name, fp):
        """Parse the font program available from ``fp``.

        Reads the header, the four leading INDEX structures and the Top
        DICT, then builds ``code2gid``/``gid2code`` from the encoding
        and ``name2gid``/``gid2name`` from the charset.

        :raises ValueError: for an unsupported encoding or charset format
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
                                                           self.fp.read(4))
        self.fp.read(hdrsize-4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA: operators 15, 16 and 17 give the charset,
        # encoding and CharStrings offsets (0 when absent).
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(charstring_pos)
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(encoding_pos)
        enc_format = self.fp.read(1)
        if enc_format == b'\x00':
            # Format 0
            (n,) = struct.unpack('B', self.fp.read(1))
            gids = struct.unpack('B'*n, self.fp.read(n))
            for (code, gid) in enumerate(gids):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif enc_format == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            code = 0
            for _ in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise ValueError('unsupported encoding format: %r' % enc_format)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(charset_pos)
        cs_format = self.fp.read(1)
        if cs_format == b'\x00':
            # Format 0: one string id per glyph, starting at glyph 1.
            n = self.nglyphs-1
            sids = struct.unpack('>'+'H'*n, self.fp.read(2*n))
            for (gid, sid) in enumerate(sids, 1):
                glyph_name = self.getstr(sid)
                self.name2gid[glyph_name] = gid
                self.gid2name[gid] = glyph_name
        elif cs_format == b'\x01':
            # Format 1
            (n,) = struct.unpack('B', self.fp.read(1))
            sid = 0
            for _ in range(n):
                (first, nleft) = struct.unpack('BB', self.fp.read(2))
                for gid in range(first, first+nleft+1):
                    glyph_name = self.getstr(sid)
                    self.name2gid[glyph_name] = gid
                    self.gid2name[gid] = glyph_name
                    sid += 1
        elif cs_format == b'\x02':
            # Format 2
            assert False, str(('Unhandled', cs_format))
        else:
            raise ValueError('unsupported charset format: %r' % cs_format)
        return

    def getstr(self, sid):
        """Resolve string id ``sid``.

        Ids below len(STANDARD_STRINGS) come from that tuple; larger ids
        index ``self.string_index`` after subtracting the tuple length.
        """
        nstandard = len(self.STANDARD_STRINGS)
        if sid < nstandard:
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid-nstandard]
|
||||
|
||||
|
||||
class TrueTypeFont:
    """Reads the table directory of a TrueType font and decodes its 'cmap'."""

    class CMapNotFound(Exception):
        """Raised when the font offers no usable character-to-glyph mapping."""

    def __init__(self, name, fp):
        """Read the table directory from ``fp``.

        ``self.tables`` maps each 4-byte table tag (bytes, e.g.
        ``b'cmap'``) to ``(offset, length)``. A truncated directory is
        tolerated: whatever was read before the data ran out is kept.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary file object positioned at the font start
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
            for _ in range(ntables):
                (tag, tsum, offset, length) = struct.unpack('>4sLLL',
                                                            fp.read(16))
                self.tables[tag] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass
        return

    def create_unicode_map(self):
        """Build a FileUnicodeMap from the font's 'cmap' table.

        :returns: FileUnicodeMap filled via ``add_cid2unichr(gid, char)``
        :raises TrueTypeFont.CMapNotFound: when there is no 'cmap' table
            or none of its subtables could be decoded
        """
        char2gid = self._read_char2gid()
        if not char2gid:
            # Only unsupported subtables were present; report it the way
            # a missing table is reported so callers can fall back.
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for (char, gid) in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    def _read_char2gid(self):
        """Decode the 'cmap' subtables into a ``{char code: glyph id}`` dict."""
        # Table tags are read with '4s', so the directory is keyed by bytes.
        if b'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables[b'cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = struct.unpack('>HH', fp.read(4))
        subtables = []
        for i in range(nsubtables):
            subtables.append(struct.unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(struct.unpack('>256B',
                                                        fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = struct.unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i, k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = \
                        struct.unpack('>HHhH', fp.read(8))
                    # offset is relative to the position of the offset
                    # field itself, i.e. 2 bytes before the current one.
                    hdrs.append((i, firstcode, entcount, delta,
                                 fp.tell()-2+offset))
                for (i, firstcode, entcount, delta, pos) in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() returns a 1-tuple; take the value so
                        # the zero test and the delta addition work.
                        gid = struct.unpack('>H', fp.read(2))[0]
                        if gid:
                            gid += delta
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        # NOTE(review): the seek is relative to the start
                        # of the idRangeOffset array for every segment;
                        # confirm against the cmap format 4 definition.
                        fp.seek(pos+idr)
                        for c in range(sc, ec+1):
                            b = struct.unpack('>H', fp.read(2))[0]
                            char2gid[c] = (b + idd) & 0xffff
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
            else:
                # Unsupported subtable format: skip it and keep whatever
                # the supported subtables provide.
                continue
        return char2gid
|
||||
|
||||
|
||||
class PDFFontError(PDFException):
    """Base class for font-related errors in this module."""
|
||||
|
||||
|
||||
class PDFUnicodeNotDefined(PDFFontError):
    """Font error subclass; by its name, signals a missing Unicode mapping."""
|
||||
|
||||
|
||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
||||
|
||||
class PDFFont:
    """Metrics and width lookups shared by the PDF font classes.

    The constructor reads the descriptor entries ``FontName``, ``Flags``,
    ``Ascent``, ``Descent``, ``ItalicAngle``, ``MissingWidth``, ``Leading``
    and ``FontBBox``; both scales start at 0.001. ``char_width`` relies on
    a ``to_unichr`` method that this base class does not define.
    """

    def __init__(self, descriptor, widths, default_width=None):
        """Store the width table and pull the metrics out of ``descriptor``.

        :param descriptor: mapping queried with ``.get`` for the metrics
        :param widths: width table; stored after ``resolve_all``
        :param default_width: fallback width; when ``None`` the descriptor's
            ``MissingWidth`` entry (default 0) is used instead
        """
        self.descriptor = descriptor
        self.widths = resolve_all(widths)
        name = resolve1(descriptor.get('FontName', 'unknown'))
        self.fontname = name
        if isinstance(name, PSLiteral):
            self.fontname = literal_name(name)
        self.flags = int_value(descriptor.get('Flags', 0))
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
        if default_width is not None:
            self.default_width = default_width
        else:
            self.default_width = num_value(descriptor.get('MissingWidth', 0))
        self.leading = num_value(descriptor.get('Leading', 0))
        raw_bbox = descriptor.get('FontBBox', (0, 0, 0, 0))
        self.bbox = list_value(resolve_all(raw_bbox))
        self.hscale = self.vscale = .001

        # PDF RM 9.8.1 requires /Descent to be negative. PScript5.dll is
        # known to emit a positive value, which would break text analysis,
        # so the sign is forced here.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        """Always ``False`` for the base class."""
        return False

    def is_multibyte(self):
        """Always ``False`` for the base class."""
        return False

    def decode(self, bytes):
        """Return the input as a ``bytearray``, i.e. one code per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self):
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self):
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self):
        """Scaled bounding-box width; a zero-width box yields the negated
        default width instead."""
        span = self.bbox[2]-self.bbox[0]
        if span == 0:
            span = -self.default_width
        return span * self.hscale

    def get_height(self):
        """Scaled bounding-box height; a zero-height box falls back to
        ``ascent - descent``."""
        span = self.bbox[3]-self.bbox[1]
        if span == 0:
            span = self.ascent - self.descent
        return span * self.vscale

    def char_width(self, cid):
        """Scaled width for ``cid``.

        The table is keyed first by ``cid`` and then by
        ``self.to_unichr(cid)``; when both miss, the default width is used.
        """
        try:
            return self.widths[cid] * self.hscale
        except KeyError:
            try:
                unichr_key = self.to_unichr(cid)
                return self.widths[unichr_key] * self.hscale
            except (KeyError, PDFUnicodeNotDefined):
                return self.default_width * self.hscale

    def char_disp(self, cid):
        """Displacement for ``cid``; always 0 in the base class."""
        return 0

    def string_width(self, s):
        """Sum of ``char_width`` over every code produced by ``decode(s)``."""
        return sum(map(self.char_width, self.decode(s)))
|
||||
|
||||
|
||||
class PDFSimpleFont(PDFFont):
    """Font whose codes map to Unicode through an encoding table and an
    optional ``ToUnicode`` CMap."""

    def __init__(self, descriptor, widths, spec):
        """Build ``cid2unicode`` / ``unicode_map`` and initialise the base.

        The encoding comes from ``spec['Encoding']``: either the name of a
        built-in encoding or a dictionary with ``BaseEncoding`` and
        ``Differences``. Without the entry the standard encoding is used.
        """
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            differences = list_value(encoding.get('Differences', []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        self.unicode_map = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(tounicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid):
        """Map ``cid`` to Unicode, preferring the ToUnicode map.

        :raises PDFUnicodeNotDefined: when neither table knows ``cid``
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # fall through to the encoding table
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
class PDFType1Font(PDFSimpleFont):
    """Type 1 font.

    Metrics come from ``FontMetricsDB`` when it knows the base font name;
    otherwise they are read from the font dictionary itself.
    """

    def __init__(self, rsrcmgr, spec):
        """Create the font from its dictionary ``spec``.

        :raises PDFFontError: in strict mode when ``BaseFont`` is missing
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            first = int_value(spec.get('FirstChar', 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get('Widths', [0]*256))
            # key each width by its character code
            widths = dict(enumerate(width_list, first))
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if 'Encoding' not in spec and 'FontFile' in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get('FontFile'))
            header_len = int_value(self.fontfile['Length1'])
            header = self.fontfile.get_data()[:header_len]
            self.cid2unicode = \
                Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
class PDFTrueTypeFont(PDFType1Font):
    """TrueType font; behaves like ``PDFType1Font`` apart from its repr."""

    def __repr__(self):
        return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
class PDFType3Font(PDFSimpleFont):
    """Type 3 font: metrics are taken from the font dictionary and scaled
    by its ``FontMatrix``."""

    def __init__(self, rsrcmgr, spec):
        """Create the font from its dictionary ``spec``.

        Without a ``FontDescriptor`` a minimal descriptor is synthesised
        from ``spec['FontBBox']``. Ascent and descent are then overwritten
        from the bounding box, and the scales from the font matrix.
        """
        first = int_value(spec.get('FirstChar', 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get('Widths', [0]*256))
        widths = dict(enumerate(width_list, first))
        if 'FontDescriptor' not in spec:
            descriptor = {'Ascent': 0, 'Descent': 0,
                          'FontBBox': spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = tuple(list_value(spec.get('FontMatrix')))
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self):
        return '<PDFType3Font>'
|
||||
|
||||
|
||||
class PDFCIDFont(PDFFont):
    """CID-keyed font: codes are decoded through a CMap and mapped to
    Unicode through a ToUnicode stream, an embedded TrueType cmap, or a
    predefined map chosen by the CID system info."""

    def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
        """Create the font from its dictionary ``spec``.

        :param rsrcmgr: accepted for signature parity; unused here
        :param spec: the font dictionary
        :param strict: raise ``PDFFontError`` for missing ``BaseFont`` /
            ``FontDescriptor`` instead of substituting defaults
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if strict:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        cid_registry = resolve1(
            self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1")
        cid_ordering = resolve1(
            self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
        self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
        self.cmap = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if strict:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               BytesIO(self.fontfile.get_data()))
        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    # best effort: leave the font without a unicode map
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound:
                # best effort: leave the font without a unicode map
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get('W2', [])))
            self.disps = {cid: (vx, vy)
                          for (cid, (_, (vx, vy))) in widths2.items()}
            # DW2 may be stored as an indirect reference; resolve it before
            # unpacking (resolve1 returns plain values unchanged).
            (vy, w) = resolve1(spec.get('DW2', [880, -1000]))
            self.default_disp = (None, vy)
            widths = {cid: w1 for (cid, (w1, _)) in widths2.items()}
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get('W', [])))
            # DW may be an indirect reference too; an unresolved reference
            # would break the width arithmetic in PDFFont.
            default_width = resolve1(spec.get('DW', 1000))
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)
        return

    def get_cmap_from_spec(self, spec, strict):
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :raises PDFFontError: in strict mode when the CMap is not found;
            otherwise an empty ``CMap`` is returned
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec, strict):
        """Get cmap name from font specification

        Returns ``'unknown'`` when no encoding is given (non-strict mode),
        and translates the name through ``IDENTITY_ENCODER`` when listed.
        """
        cmap_name = 'unknown'  # default value

        try:
            spec_encoding = spec['Encoding']
            if hasattr(spec_encoding, 'name'):
                cmap_name = literal_name(spec_encoding)
            else:
                cmap_name = literal_name(spec_encoding['CMapName'])
        except KeyError:
            if strict:
                raise PDFFontError('Encoding is unspecified')

        if isinstance(cmap_name, PDFStream):
            if 'CMapName' in cmap_name:
                cmap_name = cmap_name.get('CMapName').name
            else:
                if strict:
                    raise PDFFontError('CMapName unspecified for encoding')

        cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
        return cmap_name

    def __repr__(self):
        return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
            .format(self.basefont, self.cidcoding)

    def is_vertical(self):
        """Whether the font's CMap reported vertical writing mode."""
        return self.vertical

    def is_multibyte(self):
        """Always ``True`` for CID fonts."""
        return True

    def decode(self, bytes):
        """Decode a byte string into cids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        "Returns an integer for horizontal fonts, a tuple for vertical fonts."
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid):
        """Map ``cid`` to Unicode via ``unicode_map``.

        :raises PDFUnicodeNotDefined: when there is no map or no entry
        """
        try:
            if not self.unicode_map:
                raise KeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
||||
|
||||
def main(argv):
    """Parse every file named in ``argv[1:]`` as a CFF font and print it.

    :param argv: argument vector; ``argv[0]`` is skipped
    :returns: ``None``
    """
    for fname in argv[1:]:
        # The context manager closes the file even when parsing or printing
        # raises, which the previous open()/close() pair did not.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
@@ -1,943 +0,0 @@
|
||||
import re
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMap
|
||||
from .psparser import PSTypeError
|
||||
from .psparser import PSEOF
|
||||
from .psparser import PSKeyword
|
||||
from .psparser import literal_name
|
||||
from .psparser import keyword_name
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from . import settings
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import PDFObjRef
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import stream_value
|
||||
from .pdffont import PDFFontError
|
||||
from .pdffont import PDFType1Font
|
||||
from .pdffont import PDFTrueTypeFont
|
||||
from .pdffont import PDFType3Font
|
||||
from .pdffont import PDFCIDFont
|
||||
from .pdfcolor import PDFColorSpace
|
||||
from .pdfcolor import PREDEFINED_COLORSPACE
|
||||
from .utils import choplist
|
||||
from .utils import mult_matrix
|
||||
from .utils import MATRIX_IDENTITY
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFResourceError(PDFException):
    """Exception type for resource problems.

    NOTE(review): no raise site is visible in this part of the module.
    """
|
||||
|
||||
|
||||
class PDFInterpreterError(PDFException):
    """Raised in strict mode when a content stream refers to something
    undefined (e.g. an unknown colour space or font id)."""
|
||||
|
||||
|
||||
LITERAL_PDF = LIT('PDF')
|
||||
LITERAL_TEXT = LIT('Text')
|
||||
LITERAL_FONT = LIT('Font')
|
||||
LITERAL_FORM = LIT('Form')
|
||||
LITERAL_IMAGE = LIT('Image')
|
||||
|
||||
|
||||
class PDFTextState:
    """Text parameters tracked while interpreting a content stream: font,
    size, spacing, scaling, leading, render mode, rise and the two text
    matrices."""

    def __init__(self):
        self.font = None
        self.fontsize = 0
        self.charspace = 0
        self.wordspace = 0
        self.scaling = 100
        self.leading = 0
        self.render = 0
        self.rise = 0
        # reset() provides self.matrix and self.linematrix
        self.reset()

    def __repr__(self):
        values = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
               'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
               'matrix=%r, linematrix=%r>' \
               % values

    def copy(self):
        """Return a new ``PDFTextState`` carrying the same field values."""
        clone = PDFTextState()
        for field in ('font', 'fontsize', 'charspace', 'wordspace',
                      'scaling', 'leading', 'render', 'rise',
                      'matrix', 'linematrix'):
            setattr(clone, field, getattr(self, field))
        return clone

    def reset(self):
        """Set the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
|
||||
|
||||
|
||||
class PDFGraphicState:
    """Line-drawing parameters plus the stroking / non-stroking colours."""

    def __init__(self):
        self.linewidth = 0
        self.linecap = self.linejoin = self.miterlimit = None
        self.dash = self.intent = self.flatness = None

        # stroking colour, then non-stroking colour
        self.scolor = self.ncolor = None

    def copy(self):
        """Return a new ``PDFGraphicState`` with the same field values."""
        clone = PDFGraphicState()
        for field in ('linewidth', 'linecap', 'linejoin', 'miterlimit',
                      'dash', 'intent', 'flatness', 'scolor', 'ncolor'):
            setattr(clone, field, getattr(self, field))
        return clone

    def __repr__(self):
        values = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness,
                  self.scolor, self.ncolor)
        return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                ' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
                ' stroking color=%r, non stroking color=%r>' % values)
|
||||
|
||||
|
||||
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching=True):
        """:param caching: when true, fonts created by ``get_font`` are
            remembered by object id"""
        self.caching = caching
        self._cached_fonts = {}

    def get_procset(self, procs):
        """Walk the procedure sets; none of them needs any handling."""
        for proc in procs:
            if proc is LITERAL_PDF or proc is LITERAL_TEXT:
                continue

    def get_cmap(self, cmapname, strict=False):
        """Look up a CMap by name.

        An unknown name yields an empty ``CMap`` unless ``strict`` is set,
        in which case ``CMapDB.CMapNotFound`` propagates.
        """
        try:
            cmap = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            cmap = CMap()
        return cmap

    def get_font(self, objid, spec):
        """Return the font for ``spec``, reusing a cached one when possible.

        :param objid: object id used as cache key; falsy disables caching
        :param spec: the font dictionary
        :raises PDFFontError: in strict mode for a wrong ``Type``, a missing
            ``Subtype`` or an unrecognised ``Subtype``
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.info('get_font: create: objid=%r, spec=%r', objid, spec)
        if settings.STRICT:
            if spec['Type'] is not LITERAL_FONT:
                raise PDFFontError('Type is not /Font')
        # Create a Font object.
        if 'Subtype' in spec:
            subtype = literal_name(spec['Subtype'])
        else:
            if settings.STRICT:
                raise PDFFontError('Font Subtype is not specified.')
            subtype = 'Type1'

        if subtype in ('Type1', 'MMType1'):
            font = PDFType1Font(self, spec)
        elif subtype == 'TrueType':
            font = PDFTrueTypeFont(self, spec)
        elif subtype == 'Type3':
            font = PDFType3Font(self, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            font = PDFCIDFont(self, spec)
        elif subtype == 'Type0':
            # Composite font: build the first descendant, letting the
            # parent's Encoding / ToUnicode entries override its own.
            descendants = list_value(spec['DescendantFonts'])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ('Encoding', 'ToUnicode'):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        else:
            if settings.STRICT:
                raise PDFFontError('Invalid Font spec: %r' % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
|
||||
|
||||
|
||||
class PDFContentParser(PSStackParser):
    """Stack parser that reads a list of content streams as one input.

    Streams are opened one after another as data runs out; inline images
    (``BI`` ... ``ID`` ... ``EI``) are turned into ``PDFStream`` objects.
    """

    def __init__(self, streams):
        self.streams = streams
        self.istream = 0  # index of the next stream to open
        PSStackParser.__init__(self, None)

    def fillfp(self):
        """Open the next stream if none is currently open.

        :raises PSEOF: when all streams have been consumed
        """
        if self.fp:
            return
        if len(self.streams) <= self.istream:
            raise PSEOF('Unexpected EOF, file truncated?')
        strm = stream_value(self.streams[self.istream])
        self.istream += 1
        self.fp = BytesIO(strm.get_data())

    def seek(self, pos):
        """Seek after making sure a stream is open."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self):
        """Refill the buffer, moving on to the next stream when the
        current one is exhausted."""
        if self.charpos < len(self.buf):
            return
        while True:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # current stream is empty: drop it so fillfp opens the next
            self.fp = None
        self.charpos = 0

    def get_inline_data(self, pos, target=b'EI'):
        """Collect raw bytes from ``pos`` up to ``target`` followed by a
        whitespace byte.

        :returns: ``(pos, data)`` with the terminator and one trailing
            line break stripped from ``data``
        """
        self.seek(pos)
        matched = 0  # how many terminator bytes (+ whitespace) matched
        data = b''
        goal = len(target)
        while matched <= goal:
            self.fillbuf()
            if not matched:
                # fast path: jump to the next possible terminator start
                try:
                    hit = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos:hit+1]
                    self.charpos = hit+1
                    matched = 1
                except ValueError:
                    data += self.buf[self.charpos:]
                    self.charpos = len(self.buf)
                continue
            ch = bytes((self.buf[self.charpos],))
            data += ch
            self.charpos += 1
            if goal <= matched and ch.isspace():
                matched += 1
            elif matched < goal and ch == (bytes((target[matched],))):
                matched += 1
            else:
                matched = 0
        data = data[:-(goal+1)]  # strip the last part
        data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
        return (pos, data)

    def flush(self):
        """Hand everything on the stack over as results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b'BI')
    KEYWORD_ID = KWD(b'ID')
    KEYWORD_EI = KWD(b'EI')

    def do_keyword(self, pos, token):
        """Handle the inline-image keywords; push any other keyword.

        ``BI`` opens an 'inline' context. ``ID`` closes it, builds the
        image dictionary from the collected key/value pairs, reads the raw
        data and pushes a ``PDFStream`` followed by the ``EI`` keyword.
        ``PSTypeError`` is re-raised only in strict mode.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, 'inline')
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type('inline')
                if len(objs) % 2 != 0:
                    error_msg = 'Invalid dictionary construct: {!r}' \
                        .format(objs)
                    raise PSTypeError(error_msg)
                attrs = {literal_name(k): v for (k, v) in choplist(2, objs)}
                (pos, data) = self.get_inline_data(pos+len(b'ID '))
                self.push((pos, PDFStream(attrs, data)))
                self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
|
||||
|
||||
|
||||
class PDFPageInterpreter:
|
||||
"""Processor for the content of a PDF page
|
||||
|
||||
Reference: PDF Reference, Appendix A, Operator Summary
|
||||
"""
|
||||
|
||||
def __init__(self, rsrcmgr, device):
|
||||
self.rsrcmgr = rsrcmgr
|
||||
self.device = device
|
||||
return
|
||||
|
||||
def dup(self):
    """Create a fresh interpreter of the same class that shares this
    one's resource manager and device."""
    cls = self.__class__
    return cls(self.rsrcmgr, self.device)
|
||||
|
||||
def init_resources(self, resources):
    """Prepare the fonts and XObjects listed in the Resource attribute."""
    self.resources = resources
    self.fontmap = {}
    self.xobjmap = {}
    self.csmap = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec):
        # ICCBased / DeviceN carry their component count in the second
        # array element; everything else must be a predefined space.
        is_list = isinstance(spec, list)
        name = literal_name(spec[0] if is_list else spec)
        has_params = is_list and 2 <= len(spec)
        if name == 'ICCBased' and has_params:
            return PDFColorSpace(name, stream_value(spec[1])['N'])
        if name == 'DeviceN' and has_params:
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE.get(name)

    for (kind, entries) in dict_value(resources).items():
        log.debug('Resource: %r: %r', kind, entries)
        if kind == 'Font':
            for (fontid, spec) in dict_value(entries).items():
                # only indirect references carry an object id for caching
                objid = spec.objid if isinstance(spec, PDFObjRef) else None
                spec = dict_value(spec)
                self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
        elif kind == 'ColorSpace':
            for (csid, spec) in dict_value(entries).items():
                self.csmap[csid] = get_colorspace(resolve1(spec))
        elif kind == 'ProcSet':
            self.rsrcmgr.get_procset(list_value(entries))
        elif kind == 'XObject':
            for (xobjid, xobjstrm) in dict_value(entries).items():
                self.xobjmap[xobjid] = xobjstrm
|
||||
|
||||
def init_state(self, ctm):
    """Initialize the text and graphic states for rendering a page."""
    self.gstack = []  # stack for graphical states.
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate = PDFTextState()
    self.graphicstate = PDFGraphicState()
    self.curpath = []
    # argstack: stack for command arguments.
    self.argstack = []
    # set some global states: both colour spaces start as the first
    # entry of csmap, or None when the map is empty.
    initial_cs = next(iter(self.csmap.values())) if self.csmap else None
    self.scs = self.ncs = initial_cs
|
||||
|
||||
def push(self, obj):
    """Append ``obj`` to the argument stack."""
    self.argstack.append(obj)
|
||||
|
||||
def pop(self, n):
    """Remove and return the last ``n`` arguments as a list."""
    if n == 0:
        # [-0:] would select everything, so handle it separately
        return []
    stack = self.argstack
    self.argstack, taken = stack[:-n], stack[-n:]
    return taken
|
||||
|
||||
def get_current_state(self):
    """Snapshot ``(ctm, textstate copy, graphicstate copy)``."""
    text_copy = self.textstate.copy()
    graphic_copy = self.graphicstate.copy()
    return (self.ctm, text_copy, graphic_copy)
|
||||
|
||||
def set_current_state(self, state):
    """Install a ``(ctm, textstate, graphicstate)`` snapshot and pass the
    matrix on to the device."""
    (ctm, textstate, graphicstate) = state
    self.ctm = ctm
    self.textstate = textstate
    self.graphicstate = graphicstate
    self.device.set_ctm(self.ctm)
|
||||
|
||||
def do_q(self):
    """Save graphics state"""
    snapshot = self.get_current_state()
    self.gstack.append(snapshot)
|
||||
|
||||
def do_Q(self):
    """Restore graphics state"""
    if not self.gstack:
        # nothing saved: ignore the unbalanced restore
        return
    self.set_current_state(self.gstack.pop())
|
||||
|
||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
    """Concatenate matrix to current transformation matrix"""
    operand = (a1, b1, c1, d1, e1, f1)
    self.ctm = mult_matrix(operand, self.ctm)
    self.device.set_ctm(self.ctm)
|
||||
|
||||
def do_w(self, linewidth):
    """Set line width"""
    self.graphicstate.linewidth = linewidth
|
||||
|
||||
def do_J(self, linecap):
    """Set line cap style"""
    self.graphicstate.linecap = linecap
|
||||
|
||||
def do_j(self, linejoin):
    """Set line join style"""
    self.graphicstate.linejoin = linejoin
|
||||
|
||||
def do_M(self, miterlimit):
    """Set miter limit"""
    self.graphicstate.miterlimit = miterlimit
|
||||
|
||||
def do_d(self, dash, phase):
    """Set line dash pattern"""
    pattern = (dash, phase)
    self.graphicstate.dash = pattern
|
||||
|
||||
def do_ri(self, intent):
    """Set color rendering intent"""
    self.graphicstate.intent = intent
|
||||
|
||||
def do_i(self, flatness):
    """Set flatness tolerance"""
    self.graphicstate.flatness = flatness
|
||||
|
||||
def do_gs(self, name):
    """Set parameters from graphics state parameter dictionary"""
    # todo: not implemented, the operand is ignored
    return None
|
||||
|
||||
def do_m(self, x, y):
    """Begin new subpath"""
    segment = ('m', x, y)
    self.curpath.append(segment)
|
||||
|
||||
def do_l(self, x, y):
    """Append straight line segment to path"""
    segment = ('l', x, y)
    self.curpath.append(segment)
|
||||
|
||||
def do_c(self, x1, y1, x2, y2, x3, y3):
    """Append curved segment to path (three control points)"""
    segment = ('c', x1, y1, x2, y2, x3, y3)
    self.curpath.append(segment)
|
||||
|
||||
def do_v(self, x2, y2, x3, y3):
    """Append curved segment to path (initial point replicated)"""
    segment = ('v', x2, y2, x3, y3)
    self.curpath.append(segment)
|
||||
|
||||
def do_y(self, x1, y1, x3, y3):
    """Append curved segment to path (final point replicated)"""
    segment = ('y', x1, y1, x3, y3)
    self.curpath.append(segment)
|
||||
|
||||
def do_h(self):
    """Close subpath"""
    segment = ('h',)
    self.curpath.append(segment)
|
||||
|
||||
def do_re(self, x, y, w, h):
    """Append rectangle to path"""
    # move to the origin corner, trace the other three, then close
    for segment in (('m', x, y),
                    ('l', x+w, y),
                    ('l', x+w, y+h),
                    ('l', x, y+h),
                    ('h',)):
        self.curpath.append(segment)
|
||||
|
||||
def do_S(self):
    """Stroke path"""
    stroke, fill, evenodd = True, False, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd,
                           self.curpath)
    self.curpath = []
|
||||
|
||||
def do_s(self):
    """Close and stroke path"""
    for step in (self.do_h, self.do_S):
        step()
|
||||
|
||||
def do_f(self):
    """Fill path using nonzero winding number rule"""
    stroke, fill, evenodd = False, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd,
                           self.curpath)
    self.curpath = []
|
||||
|
||||
def do_F(self):
    """Fill path using nonzero winding number rule (obsolete)"""
    # obsolete spelling: behaves exactly like ``f``
    result = self.do_f()
    return result
|
||||
|
||||
def do_f_a(self):
    """Fill path using even-odd rule"""
    stroke, fill, evenodd = False, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd,
                           self.curpath)
    self.curpath = []
|
||||
|
||||
def do_B(self):
    """Fill and stroke path using nonzero winding number rule"""
    stroke, fill, evenodd = True, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd,
                           self.curpath)
    self.curpath = []
|
||||
|
||||
def do_B_a(self):
    """Fill and stroke path using even-odd rule"""
    stroke, fill, evenodd = True, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd,
                           self.curpath)
    self.curpath = []
|
||||
|
||||
def do_b(self):
    """Close, fill, and stroke path using nonzero winding number rule"""
    for step in (self.do_h, self.do_B):
        step()
|
||||
|
||||
def do_b_a(self):
    """Close, fill, and stroke path using even-odd rule"""
    for step in (self.do_h, self.do_B_a):
        step()
|
||||
|
||||
def do_n(self):
    """End path without filling or stroking"""
    self.curpath = list()
|
||||
|
||||
def do_W(self):
    """Set clipping path using nonzero winding number rule"""
    # clipping is not tracked: intentionally a no-op
    return None
|
||||
|
||||
def do_W_a(self):
    """Set clipping path using even-odd rule"""
    # clipping is not tracked: intentionally a no-op
    return None
|
||||
|
||||
def do_CS(self, name):
    """Set color space for stroking operations

    Introduced in PDF 1.1
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        # non-strict: keep the current stroking colour space
        return
    self.scs = colorspace
|
||||
|
||||
def do_cs(self, name):
    """Set color space for nonstroking operations"""
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        # non-strict: keep the current non-stroking colour space
        return
    self.ncs = colorspace
|
||||
|
||||
def do_G(self, gray):
    """Set gray level for stroking operations"""
    self.graphicstate.scolor = gray
|
||||
|
||||
def do_g(self, gray):
    """Set gray level for nonstroking operations"""
    self.graphicstate.ncolor = gray
|
||||
|
||||
def do_RG(self, r, g, b):
    """Set RGB color for stroking operations"""
    rgb = (r, g, b)
    self.graphicstate.scolor = rgb
|
||||
|
||||
def do_rg(self, r, g, b):
    """Set RGB color for nonstroking operations"""
    rgb = (r, g, b)
    self.graphicstate.ncolor = rgb
|
||||
|
||||
def do_K(self, c, m, y, k):
    """Set CMYK color for stroking operations"""
    cmyk = (c, m, y, k)
    self.graphicstate.scolor = cmyk
|
||||
|
||||
def do_k(self, c, m, y, k):
    """Set CMYK color for nonstroking operations"""
    cmyk = (c, m, y, k)
    self.graphicstate.ncolor = cmyk
|
||||
|
||||
def do_SCN(self):
    """Set color for stroking operations."""
    colorspace = self.scs
    if colorspace:
        count = colorspace.ncomponents
    elif settings.STRICT:
        raise PDFInterpreterError('No colorspace specified!')
    else:
        # no colour space known: assume a single component
        count = 1
    self.graphicstate.scolor = self.pop(count)
|
||||
|
||||
def do_scn(self):
    """Set color for nonstroking operations"""
    colorspace = self.ncs
    if colorspace:
        count = colorspace.ncomponents
    elif settings.STRICT:
        raise PDFInterpreterError('No colorspace specified!')
    else:
        # no colour space known: assume a single component
        count = 1
    self.graphicstate.ncolor = self.pop(count)
|
||||
|
||||
def do_SC(self):
    """Set color for stroking operations"""
    # handled exactly like SCN; its result is discarded
    self.do_SCN()
    return None
|
||||
|
||||
def do_sc(self):
    """Set color for nonstroking operations"""
    # handled exactly like scn; its result is discarded
    self.do_scn()
    return None
|
||||
|
||||
def do_sh(self, name):
    """Paint area defined by shading pattern"""
    # shadings are not rendered: intentionally a no-op
    return None
|
||||
|
||||
def do_BT(self):
    """Begin text object

    Initializing the text matrix, Tm, and the text line matrix, Tlm, to
    the identity matrix. Text objects cannot be nested; a second BT cannot
    appear before an ET.
    """
    state = self.textstate
    state.reset()
|
||||
|
||||
def do_ET(self):
    """End a text object"""
    # nothing to tear down
    return None
|
||||
|
||||
def do_BX(self):
    """Begin compatibility section"""
    # compatibility sections need no handling
    return None
|
||||
|
||||
def do_EX(self):
    """End compatibility section"""
    # compatibility sections need no handling
    return None
|
||||
|
||||
def do_MP(self, tag):
    """Define marked-content point"""
    device = self.device
    device.do_tag(tag)
|
||||
|
||||
def do_DP(self, tag, props):
    """Define marked-content point with property list"""
    device = self.device
    device.do_tag(tag, props)
|
||||
|
||||
def do_BMC(self, tag):
    """Begin marked-content sequence"""
    device = self.device
    device.begin_tag(tag)
|
||||
|
||||
def do_BDC(self, tag, props):
    """Begin marked-content sequence with property list"""
    device = self.device
    device.begin_tag(tag, props)
|
||||
|
||||
def do_EMC(self):
    """End marked-content sequence"""
    device = self.device
    device.end_tag()
|
||||
|
||||
def do_Tc(self, space):
    """Set character spacing.

    Character spacing is used by the Tj, TJ, and ' operators.

    :param space: a number expressed in unscaled text space units.
    """
    state = self.textstate
    state.charspace = space
|
||||
|
||||
def do_Tw(self, space):
    """Set the word spacing.

    Word spacing is used by the Tj, TJ, and ' operators.

    :param space: a number expressed in unscaled text space units
    """
    state = self.textstate
    state.wordspace = space
|
||||
|
||||
def do_Tz(self, scale):
    """Set the horizontal scaling.

    :param scale: a number specifying the percentage of the normal width.
    """
    state = self.textstate
    state.scaling = scale
|
||||
|
||||
def do_TL(self, leading):
    """Set the text leading.

    Text leading is used only by the T*, ', and " operators.

    :param leading: a number expressed in unscaled text space units.
    """
    # The text state keeps the leading with its sign flipped.
    negated = -leading
    self.textstate.leading = negated
|
||||
|
||||
def do_Tf(self, fontid, fontsize):
    """Set the text font.

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary.
    :param fontsize: a number representing a scale factor.
    :raises PDFInterpreterError: in strict mode, when ``fontid`` is not in
        the font map.
    """
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError('Undefined Font id: %r' % fontid)
        # Lenient mode: ask the resource manager for a font built from an
        # empty spec instead of failing.
        font = self.rsrcmgr.get_font(None, {})
    self.textstate.font = font
    self.textstate.fontsize = fontsize
|
||||
|
||||
def do_Tr(self, render):
    """Set the text rendering mode.

    :param render: the rendering-mode operand, stored on the text state.
    """
    state = self.textstate
    state.render = render
|
||||
|
||||
def do_Ts(self, rise):
    """Set the text rise.

    :param rise: a number expressed in unscaled text space units.
    """
    state = self.textstate
    state.rise = rise
|
||||
|
||||
def do_Td(self, tx, ty):
    """Move text position.

    Translates the text matrix by ``(tx, ty)`` expressed in the matrix's own
    coordinate system and resets the line matrix to ``(0, 0)``.
    """
    state = self.textstate
    (a, b, c, d, e, f) = state.matrix
    new_e = tx*a+ty*c+e
    new_f = tx*b+ty*d+f
    state.matrix = (a, b, c, d, new_e, new_f)
    state.linematrix = (0, 0)
|
||||
|
||||
def do_TD(self, tx, ty):
    """Move text position and set leading.

    Same translation as ``Td``; additionally stores ``ty`` as the text
    state's leading and resets the line matrix to ``(0, 0)``.
    """
    state = self.textstate
    (a, b, c, d, e, f) = state.matrix
    new_e = tx*a+ty*c+e
    new_f = tx*b+ty*d+f
    state.matrix = (a, b, c, d, new_e, new_f)
    state.leading = ty
    state.linematrix = (0, 0)
|
||||
|
||||
def do_Tm(self, a, b, c, d, e, f):
    """Set text matrix and text line matrix.

    The six operands become the new text matrix; the line matrix is reset
    to ``(0, 0)``.
    """
    state = self.textstate
    state.matrix = (a, b, c, d, e, f)
    state.linematrix = (0, 0)
|
||||
|
||||
def do_T_a(self):
    """Move to start of next text line (the ``T*`` operator).

    Offsets the text matrix by the stored leading along the matrix's second
    axis and resets the line matrix to ``(0, 0)``.
    """
    state = self.textstate
    (a, b, c, d, e, f) = state.matrix
    new_e = state.leading*c+e
    new_f = state.leading*d+f
    state.matrix = (a, b, c, d, new_e, new_f)
    state.linematrix = (0, 0)
|
||||
|
||||
def do_TJ(self, seq):
    """Show text, allowing individual glyph positioning.

    :param seq: the operand array handed to the device's ``render_string``.
    :raises PDFInterpreterError: in strict mode, when no font is selected.
    """
    if self.textstate.font is not None:
        # The device receives a copy of the graphics state.
        self.device.render_string(self.textstate, seq, self.ncs,
                                  self.graphicstate.copy())
        return
    if settings.STRICT:
        raise PDFInterpreterError('No font specified!')
|
||||
|
||||
def do_Tj(self, s):
    """Show text.

    Implemented as ``TJ`` with a one-element array.
    """
    single = [s]
    self.do_TJ(single)
|
||||
|
||||
def do__q(self, s):
    """Move to next line and show text.

    The ' (single quote) operator: ``T*`` followed by showing ``s``.
    """
    self.do_T_a()
    single = [s]
    self.do_TJ(single)
|
||||
|
||||
def do__w(self, aw, ac, s):
    """Set word and character spacing, move to next line, and show text.

    The " (double quote) operator. Per the PDF specification it is
    equivalent to ``aw Tw  ac Tc  string '``, i.e. the spacing is set, the
    text position advances to the next line (``T*``) and the string is shown.

    :param aw: word spacing, in unscaled text space units.
    :param ac: character spacing, in unscaled text space units.
    :param s: the string to show.
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # The line advance was previously missing, so text shown with the
    # double-quote operator was drawn on top of the preceding line.
    self.do_T_a()
    self.do_TJ([s])
|
||||
|
||||
def do_BI(self):
    """Begin inline image object (nothing to do here)."""
    return None
|
||||
|
||||
def do_ID(self):
    """Begin inline image data (nothing to do here)."""
    return None
|
||||
|
||||
def do_EI(self, obj):
    """End inline image object.

    When ``obj`` is a PDFStream carrying both 'W' and 'H' entries, it is
    rendered as an image inside a unit-square figure; anything else is
    ignored.
    """
    is_image = isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj
    if not is_image:
        return
    # Inline images have no resource name; the object's identity is used
    # as the figure name.
    figure_name = str(id(obj))
    self.device.begin_figure(figure_name, (0, 0, 1, 1), MATRIX_IDENTITY)
    self.device.render_image(figure_name, obj)
    self.device.end_figure(figure_name)
|
||||
|
||||
def do_Do(self, xobjid):
    """Invoke named XObject.

    Looks ``xobjid`` up in ``self.xobjmap`` and, depending on its Subtype,
    either renders it as a nested form (with a duplicated interpreter) or
    hands it to the device as an image. Other subtypes are ignored.

    :param xobjid: the XObject name operand.
    :raises PDFInterpreterError: in strict mode, when the name is not in
        ``self.xobjmap``.
    """
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if settings.STRICT:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        return
    log.info('Processing xobj: %r', xobj)
    subtype = xobj.get('Subtype')
    if subtype is LITERAL_FORM and 'BBox' in xobj:
        # Form XObject: interpret its content stream with a separate
        # interpreter obtained from self.dup().
        interpreter = self.dup()
        bbox = list_value(xobj['BBox'])
        matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get('Resources')
        if xobjres:
            resources = dict_value(xobjres)
        else:
            resources = self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        # The form's matrix is combined with the current CTM for the
        # nested rendering.
        interpreter.render_contents(resources, [xobj],
                                    ctm=mult_matrix(matrix, self.ctm))
        self.device.end_figure(xobjid)
    elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
        # Image XObject: rendered inside a unit-square figure.
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)
    else:
        # unsupported xobject type.
        pass
    return
|
||||
|
||||
def process_page(self, page):
    """Render one page through the device.

    Builds the initial transformation matrix from the page's media box and
    rotation, then brackets the rendering of the page contents with the
    device's ``begin_page`` / ``end_page`` calls.

    :param page: an object exposing ``mediabox``, ``rotate``, ``resources``
        and ``contents``.
    """
    log.info('Processing page: %r', page)
    (left, bottom, right, top) = page.mediabox
    rotation = page.rotate
    if rotation == 90:
        ctm = (0, -1, 1, 0, -bottom, right)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, right, top)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, top, -left)
    else:
        # No rotation (or an unexpected value): plain translation.
        ctm = (1, 0, 0, 1, -left, -bottom)
    self.device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    self.device.end_page(page)
|
||||
|
||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
    """Render the content streams.

    Initializes resources and interpreter state, then executes the streams.
    This method may be called recursively.

    :param resources: resources passed to ``init_resources``.
    :param streams: content streams; coerced with ``list_value``.
    :param ctm: initial transformation matrix passed to ``init_state``.
    """
    log.info('render_contents: resources=%r, streams=%r, ctm=%r',
             resources, streams, ctm)
    self.init_resources(resources)
    self.init_state(ctm)
    stream_list = list_value(streams)
    self.execute(stream_list)
|
||||
|
||||
def execute(self, streams):
    """Run the operators found in the given content streams.

    Non-keyword objects are pushed on the operand stack. Each keyword is
    mapped to a ``do_<name>`` method (with ``*``, ``"`` and ``'`` rewritten
    to ``_a``, ``_w`` and ``_q``); the method's positional-argument count
    decides how many operands are popped for it.

    :param streams: list of content streams for ``PDFContentParser``.
    :raises PDFInterpreterError: in strict mode, on an unknown operator.
    """
    try:
        parser = PDFContentParser(streams)
    except PSEOF:
        # empty page
        return
    while True:
        try:
            (_, token) = parser.nextobject()
        except PSEOF:
            break
        if not isinstance(token, PSKeyword):
            # Operand: keep it for the next operator.
            self.push(token)
            continue
        name = keyword_name(token)
        method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\
                               .replace("'", '_q')
        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = 'Unknown operator: %r' % name
                raise PDFInterpreterError(error_msg)
            continue
        handler = getattr(self, method)
        # Number of operands = declared parameters minus ``self``.
        nargs = handler.__code__.co_argcount-1
        if not nargs:
            log.debug('exec: %s', name)
            handler()
            continue
        args = self.pop(nargs)
        log.debug('exec: %s %r', name, args)
        # The operator is skipped when too few operands were available.
        if len(args) == nargs:
            handler(*args)
|
||||
@@ -1,148 +0,0 @@
|
||||
import logging
|
||||
import warnings
|
||||
from . import settings
|
||||
from .psparser import LIT
|
||||
from .pdftypes import PDFObjectNotFound
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import dict_value
|
||||
from .pdfparser import PDFParser
|
||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
|
||||
from .pdfdocument import PDFTextExtractionNotAllowedWarning
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
|
||||
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resources used by the page (resolved /Resources entry).
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # Without an explicit crop box the media box is used.
            self.cropbox = self.mediabox
        # Normalize the rotation so that e.g. -90 becomes 270.
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content stream is wrapped so callers always get a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
            .format(self.resources, self.mediabox)

    # Attributes a page inherits from its ancestors in the page tree when it
    # does not define them itself.
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    @classmethod
    def create_pages(cls, document):
        """Yield a PDFPage for every page of ``document``.

        The page tree rooted at the catalog's /Pages entry is walked depth
        first, propagating inheritable attributes from parent to child. If
        that yields nothing, every object listed in the document's xrefs is
        scanned for dictionaries whose Type is /Page.

        A node that refers back to one of its own ancestors is skipped, so
        a malformed, cyclic /Kids tree can no longer recurse without bound.
        """
        def search(obj, parent, ancestors):
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            if objid in ancestors:
                # Cycle in the page tree: this node is its own ancestor.
                log.warning('Ignoring cyclic page tree reference: objid=%r',
                            objid)
                return
            for (k, v) in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                # Only the current path is tracked, so a page referenced
                # from several places is still yielded each time.
                ancestors.add(objid)
                try:
                    for c in list_value(tree['Kids']):
                        yield from search(c, tree, ancestors)
                finally:
                    ancestors.discard(objid)
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog,
                             set())
            for (objid, tree) in objects:
                yield cls(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
                            yield cls(document, objid, obj)
                    except PDFObjectNotFound:
                        pass
        return

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=False):
        """Parse ``fp`` and yield its pages.

        :param fp: file object handed to PDFParser.
        :param pagenos: optional container of zero-based page numbers to
            keep; when empty or None every page is kept.
        :param maxpages: stop once this many pages have been enumerated
            (0 means no limit); pages filtered out by ``pagenos`` do not
            count.
        :param password: password passed to PDFDocument.
        :param caching: caching flag passed to PDFDocument.
        :param check_extractable: when True, raise instead of warn if the
            document forbids text extraction.
        :raises PDFTextExtractionNotAllowed: if extraction is not allowed
            and ``check_extractable`` is set.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = 'Text extraction is not allowed: %r' % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = 'The PDF %r contains a metadata field '\
                            'indicating that it should not allow ' \
                            'text extraction. Ignoring this field ' \
                            'and proceeding.' % fp
                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break
        return
|
||||
@@ -1,170 +0,0 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import PSSyntaxError
|
||||
from .psparser import PSEOF
|
||||
from .psparser import KWD
|
||||
from . import settings
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import PDFObjRef
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import dict_value
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFSyntaxError(PDFException):
    """Raised by the PDF parsers on malformed input (strict mode)."""
|
||||
|
||||
|
||||
class PDFParser(PSStackParser):
    """
    PDFParser fetches PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    def __init__(self, fp):
        """Create a parser over ``fp`` with no document attached yet."""
        PSStackParser.__init__(self, fp)
        self.doc = None
        # When True, stream lengths are not trusted; the data is collected
        # by scanning for 'endstream' instead (see do_keyword).
        self.fallback = False
        return

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc
        return

    KEYWORD_R = KWD(b'R')
    KEYWORD_NULL = KWD(b'null')
    KEYWORD_ENDOBJ = KWD(b'endobj')
    KEYWORD_STREAM = KWD(b'stream')
    KEYWORD_XREF = KWD(b'xref')
    KEYWORD_STARTXREF = KWD(b'startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        :param pos: position of the keyword in the input.
        :param token: the interned keyword object.
        :raises PDFSyntaxError: in strict mode, when a stream has no /Length
            or the input ends inside a stream.
        """

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # The stream data starts right after the 'stream' keyword line.
            pos += len(line)
            self.fp.seek(pos)
            data = bytearray(self.fp.read(objlen))
            self.seek(pos+objlen)
            # Scan forward for 'endstream', extending objlen by whatever
            # lies between the declared length and the keyword. The extra
            # bytes are appended to data only in fallback mode.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if settings.STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if b'endstream' in line:
                    i = line.index(b'endstream')
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            data = bytes(data)
            self.seek(pos+objlen)
            # XXX limit objlen not to exceed object boundary
            log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
                      objlen, dic, data[:10])
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return
|
||||
|
||||
|
||||
class PDFStreamParser(PDFParser):
    """
    PDFStreamParser is used to parse PDF content streams
    that are contained in each page and hold the instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    KEYWORD_OBJ = KWD(b'obj')

    def __init__(self, data):
        """Wrap the raw bytes ``data`` in an in-memory file and parse that."""
        buffer = BytesIO(data)
        PDFParser.__init__(self, buffer)

    def flush(self):
        """Move everything left on the stack into the results."""
        leftovers = self.popall()
        self.add_results(*leftovers)

    def do_keyword(self, pos, token):
        """Handle a keyword found inside a stream.

        ``R`` builds an indirect reference from the two preceding operands;
        ``obj``/``endobj`` are rejected in strict mode and dropped
        otherwise; every other keyword is pushed unchanged.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                ref = PDFObjRef(self.doc, objid, genno)
                self.push((pos, ref))
            except PSSyntaxError:
                pass
            return
        if token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError('Keyword endobj found in stream')
            return
        # others
        self.push((pos, token))
|
||||
@@ -1,323 +0,0 @@
|
||||
import zlib
|
||||
import logging
|
||||
from .lzw import lzwdecode
|
||||
from .ascii85 import ascii85decode
|
||||
from .ascii85 import asciihexdecode
|
||||
from .runlength import rldecode
|
||||
from .ccitt import ccittfaxdecode
|
||||
from .psparser import PSException
|
||||
from .psparser import PSObject
|
||||
from .psparser import LIT
|
||||
from . import settings
|
||||
from .utils import apply_png_predictor
|
||||
from .utils import isnumber
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
LITERAL_CRYPT = LIT('Crypt')
|
||||
|
||||
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
|
||||
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
|
||||
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
|
||||
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
|
||||
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
|
||||
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
|
||||
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
|
||||
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
||||
|
||||
|
||||
class PDFObject(PSObject):
    """Base class for the PDF object types defined in this module."""
|
||||
|
||||
|
||||
class PDFException(PSException):
    """Base class for the PDF exceptions defined in this module."""
|
||||
|
||||
|
||||
class PDFTypeError(PDFException):
    """Raised in strict mode when a value is not of the required type."""
|
||||
|
||||
|
||||
class PDFValueError(PDFException):
    """Raised in strict mode when a value is not acceptable."""
|
||||
|
||||
|
||||
class PDFObjectNotFound(PDFException):
    """Signals that a requested PDF object could not be found."""
|
||||
|
||||
|
||||
class PDFNotImplementedError(PDFException):
    """Raised when a PDF feature (e.g. a filter) is not supported."""
|
||||
|
||||
|
||||
class PDFObjRef(PDFObject):
    """An indirect reference to an object of a PDF document."""

    def __init__(self, doc, objid, _):
        """Remember the owning document and the referenced object id.

        The third argument (the generation number at the call sites) is
        ignored.

        :raises PDFValueError: in strict mode, when ``objid`` is 0.
        """
        if objid == 0 and settings.STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self, default=None):
        """Return the referenced object, or ``default`` if it is not found."""
        try:
            target = self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
        return target
|
||||
|
||||
|
||||
def resolve1(x, default=None):
    """Resolves an object.

    Indirect references are followed until a non-reference is reached.
    If the result is an array or dictionary, it may still contain
    some indirect objects inside.
    """
    current = x
    while isinstance(current, PDFObjRef):
        current = current.resolve(default=default)
    return current
|
||||
|
||||
|
||||
def resolve_all(x, default=None):
    """Recursively resolves the given object and all the internals.

    Make sure there is no indirect reference within the nested object.
    Lists are rebuilt; dictionaries are updated in place.
    This procedure might be slow.
    """
    while isinstance(x, PDFObjRef):
        x = x.resolve(default=default)
    if isinstance(x, list):
        return [resolve_all(item, default=default) for item in x]
    if isinstance(x, dict):
        for (key, value) in x.items():
            x[key] = resolve_all(value, default=default)
    return x
|
||||
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
    """Recursively deciphers the given object.

    Byte strings are passed through ``decipher(objid, genno, data)``; lists
    are rebuilt; dictionaries are updated in place; anything else is
    returned untouched.
    """
    if isinstance(x, bytes):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    if isinstance(x, dict):
        for (key, value) in x.items():
            x[key] = decipher_all(decipher, objid, genno, value)
    return x
|
||||
|
||||
|
||||
def int_value(x):
    """Resolve ``x`` and return it if it is an int.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if settings.STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
|
||||
|
||||
|
||||
def float_value(x):
    """Resolve ``x`` and return it if it is a float.

    Otherwise raise PDFTypeError in strict mode, or return 0.0.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if settings.STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
|
||||
|
||||
|
||||
def num_value(x):
    """Resolve ``x`` and return it if ``isnumber`` accepts it.

    Otherwise raise PDFTypeError in strict mode, or return 0.
    """
    value = resolve1(x)
    if isnumber(value):
        return value
    if settings.STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
|
||||
|
||||
|
||||
def uint_value(x, n_bits):
    """Resolve number and interpret it as a two's-complement unsigned number.

    :param x: an integer (or a reference to one); coerced with ``int_value``.
    :param n_bits: width of the unsigned representation in bits.
    :return: ``x`` itself when it is non-negative, otherwise
        ``x + 2**n_bits``.
    """
    x = int_value(x)
    # Zero is already a valid unsigned value; the previous ``x > 0`` test
    # mapped it to 2**n_bits, which does not fit in n_bits.
    if x >= 0:
        return x
    return x + 2**n_bits
|
||||
|
||||
|
||||
def str_value(x):
    """Resolve ``x`` and return it if it is a byte string.

    :return: the resolved ``bytes`` value; in non-strict mode an empty byte
        string when the value has another type.
    :raises PDFTypeError: in strict mode, when the value is not ``bytes``.
    """
    x = resolve1(x)
    if not isinstance(x, bytes):
        if settings.STRICT:
            raise PDFTypeError('String required: %r' % x)
        # The fallback must be bytes like the success path; the previous
        # text '' made callers that do bytes operations fail.
        return b''
    return x
|
||||
|
||||
|
||||
def list_value(x):
    """Resolve ``x`` and return it if it is a list or tuple.

    Otherwise raise PDFTypeError in strict mode, or return an empty list.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if settings.STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
|
||||
|
||||
|
||||
def dict_value(x):
    """Resolve ``x`` and return it if it is a dict.

    Otherwise, in strict mode, log the problem and raise PDFTypeError; in
    lenient mode return an empty dict.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if settings.STRICT:
        log.error('PDFTypeError : Dict required: %r', value)
        raise PDFTypeError('Dict required: %r' % value)
    return {}
|
||||
|
||||
|
||||
def stream_value(x):
    """Resolve ``x`` and return it if it is a PDFStream.

    Otherwise raise PDFTypeError in strict mode, or return a new empty
    stream.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if settings.STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, b'')
|
||||
|
||||
|
||||
class PDFStream(PDFObject):
    """A PDF stream: an attribute dictionary plus (lazily decoded) data."""

    def __init__(self, attrs, rawdata, decipher=None):
        """Store the stream dictionary and its still-encoded bytes.

        :param attrs: the stream dictionary (must be a dict).
        :param rawdata: the undecoded stream bytes.
        :param decipher: optional callable applied to the raw data before
            the filters are run (see decode).
        """
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        # Filled in by decode(); rawdata is dropped at that point.
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        """Record the object id and generation number of this stream."""
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % \
                   (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % \
                   (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return attribute ``name`` of the stream dictionary, or default."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first of ``names`` present, or default."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the stream's filters as a list of (filter, params) pairs.

        Both the full and the abbreviated dictionary keys are accepted.

        :raises PDFException: in strict mode, when the number of parameter
            entries differs from the number of filters.
        """
        filters = self.get_any(('F', 'Filter'))
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if settings.STRICT and len(params) != len(filters):
            raise PDFException("Parameters len filter mismatch")
        # resolve filter if possible
        _filters = []
        for fltr in filters:
            if hasattr(fltr, 'resolve'):
                # NOTE(review): takes the first element of the resolved
                # value - presumably the reference points at an array;
                # confirm.
                fltr = fltr.resolve()[0]
            _filters.append(fltr)
        # return list solves https://github.com/pdfminer/pdfminer.six/issues/15
        return list(zip(_filters, params))

    def decode(self):
        """Decipher and decode rawdata into data, then drop rawdata.

        Filters are applied in order; after each filter a PNG predictor is
        applied when that filter's parameters request one.

        :raises PDFException: in strict mode, on invalid zlib data.
        :raises PDFNotImplementedError: on an unsupported filter or
            predictor.
        """
        assert self.data is None \
            and self.rawdata is not None, str((self.data, self.rawdata))
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for (f, params) in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
                            .format(e, data)
                        raise PDFException(error_msg)
                    # Lenient mode: undecodable data becomes empty.
                    data = b''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE:
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE:
                # JBIG2 data is likewise passed through undecoded.
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if params and 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    raw_bits_per_component = params.get('BitsPerComponent', 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    data = apply_png_predictor(pred, colors, columns,
                                               bitspercomponent, data)
                else:
                    error_msg = 'Unsupported predictor: %r' % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        """Return the decoded data, decoding on first use."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the raw bytes (None once decode() has run)."""
        return self.rawdata
|
||||
@@ -1,625 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
|
||||
from . import settings
|
||||
from .utils import choplist
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PSException(Exception):
    """Base class for the PostScript parser exceptions in this module."""
|
||||
|
||||
|
||||
class PSEOF(PSException):
    """Raised when the parser runs out of input."""
|
||||
|
||||
|
||||
class PSSyntaxError(PSException):
    """Exception type for PostScript syntax errors."""
|
||||
|
||||
|
||||
class PSTypeError(PSException):
    """Raised in strict mode when an object is not of the required type."""
|
||||
|
||||
|
||||
class PSValueError(PSException):
    """Exception type for unacceptable PostScript values."""
|
||||
|
||||
|
||||
class PSObject:
    """Common base class for every PS or PDF-related data type."""
|
||||
|
||||
|
||||
class PSLiteral(PSObject):
    """A class that represents a PostScript literal.

    Postscript literals are used as identifiers, such as
    variable names, property names and dictionary keys.
    Literals are case sensitive and denoted by a preceding
    slash sign (e.g. "/Name")

    Note: Do not create an instance of PSLiteral directly.
    Always use PSLiteralTable.intern().
    """

    def __init__(self, name):
        """Store the literal's name."""
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
|
||||
|
||||
|
||||
class PSKeyword(PSObject):
    """A class that represents a PostScript keyword.

    PostScript keywords are a dozen of predefined words.
    Commands and directives in PostScript are expressed by keywords.
    They are also used to denote the content boundaries.

    Note: Do not create an instance of PSKeyword directly.
    Always use PSKeywordTable.intern().
    """

    def __init__(self, name):
        """Store the keyword's name."""
        self.name = name

    def __repr__(self):
        return '/%r' % self.name
|
||||
|
||||
|
||||
class PSSymbolTable:
    """A utility class for storing PSLiteral/PSKeyword objects.

    Because each name maps to exactly one object, interned objects can be
    compared by identity with the "is" operator.
    """

    def __init__(self, klass):
        """Create an empty table whose entries are built with ``klass``."""
        self.dict = {}
        self.klass = klass

    def intern(self, name):
        """Return the unique object for ``name``, creating it on first use."""
        try:
            return self.dict[name]
        except KeyError:
            symbol = self.klass(name)
            self.dict[name] = symbol
            return symbol
|
||||
|
||||
|
||||
PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||
LIT = PSLiteralTable.intern
|
||||
KWD = PSKeywordTable.intern
|
||||
KEYWORD_PROC_BEGIN = KWD(b'{')
|
||||
KEYWORD_PROC_END = KWD(b'}')
|
||||
KEYWORD_ARRAY_BEGIN = KWD(b'[')
|
||||
KEYWORD_ARRAY_END = KWD(b']')
|
||||
KEYWORD_DICT_BEGIN = KWD(b'<<')
|
||||
KEYWORD_DICT_END = KWD(b'>>')
|
||||
|
||||
|
||||
def literal_name(x):
    """Return the name of the literal ``x``.

    The name of a PSLiteral is decoded as UTF-8 when possible and returned
    unchanged otherwise. A non-literal raises PSTypeError in strict mode
    and is returned as-is in lenient mode.
    """
    if isinstance(x, PSLiteral):
        name = x.name
        try:
            name = str(name, 'utf-8')
        except Exception:
            # Not decodable (or already text): keep the raw name.
            pass
        return name
    if settings.STRICT:
        raise PSTypeError('Literal required: {!r}'.format(x))
    return x
|
||||
|
||||
|
||||
def keyword_name(x):
    """Return the name of the keyword ``x`` as text.

    The name of a PSKeyword is decoded as UTF-8, ignoring undecodable
    bytes. A non-keyword raises PSTypeError in strict mode and is returned
    as-is in lenient mode.
    """
    if isinstance(x, PSKeyword):
        return str(x.name, 'utf-8', 'ignore')
    if settings.STRICT:
        raise PSTypeError('Keyword required: %r' % x)
    return x
|
||||
|
||||
|
||||
EOL = re.compile(br'[\r\n]')
|
||||
SPC = re.compile(br'\s')
|
||||
NONSPC = re.compile(br'\S')
|
||||
HEX = re.compile(br'[0-9a-fA-F]')
|
||||
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
|
||||
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
|
||||
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
|
||||
END_NUMBER = re.compile(br'[^0-9]')
|
||||
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
|
||||
END_STRING = re.compile(br'[()\134]')
|
||||
OCT_STRING = re.compile(br'[0-7]')
|
||||
ESC_STRING = {
|
||||
b'b': 8,
|
||||
b't': 9,
|
||||
b'n': 10,
|
||||
b'f': 12,
|
||||
b'r': 13,
|
||||
b'(': 40,
|
||||
b')': 41,
|
||||
b'\\': 92
|
||||
}
|
||||
|
||||
|
||||
class PSBaseParser:
|
||||
|
||||
"""Most basic PostScript parser that performs only tokenization.
|
||||
"""
|
||||
BUFSIZ = 4096
|
||||
|
||||
def __init__(self, fp):
|
||||
self.fp = fp
|
||||
self.seek(0)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
|
||||
self.bufpos)
|
||||
|
||||
def flush(self):
    """Hook called by ``close``; does nothing in this class."""
    return None
|
||||
|
||||
def close(self):
    """Finish parsing by flushing; the file itself is not closed here."""
    self.flush()
|
||||
|
||||
def tell(self):
    """Return the current position: buffer offset plus index in buffer."""
    position = self.bufpos+self.charpos
    return position
|
||||
|
||||
def poll(self, pos=None, n=80):
|
||||
pos0 = self.fp.tell()
|
||||
if not pos:
|
||||
pos = self.bufpos+self.charpos
|
||||
self.fp.seek(pos)
|
||||
log.info('poll(%d): %r', pos, self.fp.read(n))
|
||||
self.fp.seek(pos0)
|
||||
return
|
||||
|
||||
def seek(self, pos):
|
||||
"""Seeks the parser to the given position.
|
||||
"""
|
||||
log.debug('seek: %r', pos)
|
||||
self.fp.seek(pos)
|
||||
# reset the status for nextline()
|
||||
self.bufpos = pos
|
||||
self.buf = b''
|
||||
self.charpos = 0
|
||||
# reset the status for nexttoken()
|
||||
self._parse1 = self._parse_main
|
||||
self._curtoken = b''
|
||||
self._curtokenpos = 0
|
||||
self._tokens = []
|
||||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf):
|
||||
return
|
||||
# fetch next chunk.
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if not self.buf:
|
||||
raise PSEOF('Unexpected EOF')
|
||||
self.charpos = 0
|
||||
return
|
||||
|
||||
def nextline(self):
|
||||
"""Fetches a next line that ends either with \\r or \\n.
|
||||
"""
|
||||
linebuf = b''
|
||||
linepos = self.bufpos + self.charpos
|
||||
eol = False
|
||||
while 1:
|
||||
self.fillbuf()
|
||||
if eol:
|
||||
c = self.buf[self.charpos:self.charpos+1]
|
||||
# handle b'\r\n'
|
||||
if c == b'\n':
|
||||
linebuf += c
|
||||
self.charpos += 1
|
||||
break
|
||||
m = EOL.search(self.buf, self.charpos)
|
||||
if m:
|
||||
linebuf += self.buf[self.charpos:m.end(0)]
|
||||
self.charpos = m.end(0)
|
||||
if linebuf[-1:] == b'\r':
|
||||
eol = True
|
||||
else:
|
||||
break
|
||||
else:
|
||||
linebuf += self.buf[self.charpos:]
|
||||
self.charpos = len(self.buf)
|
||||
log.debug('nextline: %r, %r', linepos, linebuf)
|
||||
|
||||
return (linepos, linebuf)
|
||||
|
||||
def revreadlines(self):
|
||||
"""Fetches a next line backword.
|
||||
|
||||
This is used to locate the trailers at the end of a file.
|
||||
"""
|
||||
self.fp.seek(0, 2)
|
||||
pos = self.fp.tell()
|
||||
buf = b''
|
||||
while 0 < pos:
|
||||
prevpos = pos
|
||||
pos = max(0, pos-self.BUFSIZ)
|
||||
self.fp.seek(pos)
|
||||
s = self.fp.read(prevpos-pos)
|
||||
if not s:
|
||||
break
|
||||
while 1:
|
||||
n = max(s.rfind(b'\r'), s.rfind(b'\n'))
|
||||
if n == -1:
|
||||
buf = s + buf
|
||||
break
|
||||
yield s[n:] + buf
|
||||
s = s[:n]
|
||||
buf = b''
|
||||
return
|
||||
|
||||
def _parse_main(self, s, i):
|
||||
m = NONSPC.search(s, i)
|
||||
if not m:
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
c = s[j:j+1]
|
||||
self._curtokenpos = self.bufpos+j
|
||||
if c == b'%':
|
||||
self._curtoken = b'%'
|
||||
self._parse1 = self._parse_comment
|
||||
return j+1
|
||||
elif c == b'/':
|
||||
self._curtoken = b''
|
||||
self._parse1 = self._parse_literal
|
||||
return j+1
|
||||
elif c in b'-+' or c.isdigit():
|
||||
self._curtoken = c
|
||||
self._parse1 = self._parse_number
|
||||
return j+1
|
||||
elif c == b'.':
|
||||
self._curtoken = c
|
||||
self._parse1 = self._parse_float
|
||||
return j+1
|
||||
elif c.isalpha():
|
||||
self._curtoken = c
|
||||
self._parse1 = self._parse_keyword
|
||||
return j+1
|
||||
elif c == b'(':
|
||||
self._curtoken = b''
|
||||
self.paren = 1
|
||||
self._parse1 = self._parse_string
|
||||
return j+1
|
||||
elif c == b'<':
|
||||
self._curtoken = b''
|
||||
self._parse1 = self._parse_wopen
|
||||
return j+1
|
||||
elif c == b'>':
|
||||
self._curtoken = b''
|
||||
self._parse1 = self._parse_wclose
|
||||
return j+1
|
||||
else:
|
||||
self._add_token(KWD(c))
|
||||
return j+1
|
||||
|
||||
def _add_token(self, obj):
|
||||
self._tokens.append((self._curtokenpos, obj))
|
||||
return
|
||||
|
||||
def _parse_comment(self, s, i):
|
||||
m = EOL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
self._parse1 = self._parse_main
|
||||
# We ignore comments.
|
||||
# self._tokens.append(self._curtoken)
|
||||
return j
|
||||
|
||||
def _parse_literal(self, s, i):
|
||||
m = END_LITERAL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j:j+1]
|
||||
if c == b'#':
|
||||
self.hex = b''
|
||||
self._parse1 = self._parse_literal_hex
|
||||
return j+1
|
||||
try:
|
||||
self._curtoken = str(self._curtoken, 'utf-8')
|
||||
except Exception:
|
||||
pass
|
||||
self._add_token(LIT(self._curtoken))
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def _parse_literal_hex(self, s, i):
|
||||
c = s[i:i+1]
|
||||
if HEX.match(c) and len(self.hex) < 2:
|
||||
self.hex += c
|
||||
return i+1
|
||||
if self.hex:
|
||||
self._curtoken += bytes((int(self.hex, 16),))
|
||||
self._parse1 = self._parse_literal
|
||||
return i
|
||||
|
||||
def _parse_number(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j:j+1]
|
||||
if c == b'.':
|
||||
self._curtoken += c
|
||||
self._parse1 = self._parse_float
|
||||
return j+1
|
||||
try:
|
||||
self._add_token(int(self._curtoken))
|
||||
except ValueError:
|
||||
pass
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def _parse_float(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
try:
|
||||
self._add_token(float(self._curtoken))
|
||||
except ValueError:
|
||||
pass
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def _parse_keyword(self, s, i):
|
||||
m = END_KEYWORD.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
if self._curtoken == b'true':
|
||||
token = True
|
||||
elif self._curtoken == b'false':
|
||||
token = False
|
||||
else:
|
||||
token = KWD(self._curtoken)
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def _parse_string(self, s, i):
|
||||
m = END_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j:j+1]
|
||||
if c == b'\\':
|
||||
self.oct = b''
|
||||
self._parse1 = self._parse_string_1
|
||||
return j+1
|
||||
if c == b'(':
|
||||
self.paren += 1
|
||||
self._curtoken += c
|
||||
return j+1
|
||||
if c == b')':
|
||||
self.paren -= 1
|
||||
if self.paren:
|
||||
# WTF, they said balanced parens need no special treatment.
|
||||
self._curtoken += c
|
||||
return j+1
|
||||
self._add_token(self._curtoken)
|
||||
self._parse1 = self._parse_main
|
||||
return j+1
|
||||
|
||||
def _parse_string_1(self, s, i):
|
||||
c = s[i:i+1]
|
||||
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||
self.oct += c
|
||||
return i+1
|
||||
if self.oct:
|
||||
self._curtoken += bytes((int(self.oct, 8),))
|
||||
self._parse1 = self._parse_string
|
||||
return i
|
||||
if c in ESC_STRING:
|
||||
self._curtoken += bytes((ESC_STRING[c],))
|
||||
self._parse1 = self._parse_string
|
||||
return i+1
|
||||
|
||||
def _parse_wopen(self, s, i):
|
||||
c = s[i:i+1]
|
||||
if c == b'<':
|
||||
self._add_token(KEYWORD_DICT_BEGIN)
|
||||
self._parse1 = self._parse_main
|
||||
i += 1
|
||||
else:
|
||||
self._parse1 = self._parse_hexstring
|
||||
return i
|
||||
|
||||
def _parse_wclose(self, s, i):
|
||||
c = s[i:i+1]
|
||||
if c == b'>':
|
||||
self._add_token(KEYWORD_DICT_END)
|
||||
i += 1
|
||||
self._parse1 = self._parse_main
|
||||
return i
|
||||
|
||||
def _parse_hexstring(self, s, i):
|
||||
m = END_HEX_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
|
||||
SPC.sub(b'', self._curtoken))
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def nexttoken(self):
|
||||
while not self._tokens:
|
||||
self.fillbuf()
|
||||
self.charpos = self._parse1(self.buf, self.charpos)
|
||||
token = self._tokens.pop(0)
|
||||
log.debug('nexttoken: %r', token)
|
||||
return token
|
||||
|
||||
|
||||
class PSStackParser(PSBaseParser):
    """Tokenizer plus an operand stack that assembles composite objects.

    Plain tokens are pushed onto ``curstack`` as ``(position, object)``
    pairs.  Array (``[ ]``), dictionary (``<< >>``) and procedure
    (``{ }``) delimiters open and close nested stacks via start_type() /
    end_type().  Every other keyword is handed to do_keyword(), which
    subclasses override; finished objects are delivered through
    add_results().
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        """Drop all pending stack state and results."""
        self.context = []
        self.curtype = None
        self.curstack = []
        self.results = []

    def seek(self, pos):
        """Reposition the tokenizer and clear the stack state."""
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs):
        """Append ``objs`` to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n):
        """Remove and return the top ``n`` entries of the current stack.

        A non-positive ``n`` returns an empty list.  Without this guard
        ``n == 0`` would slice ``[-0:]``, i.e. the whole list, and empty
        the stack.
        """
        if n <= 0:
            return []
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        """Remove and return every entry of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        """Queue finished objects for nextobject()."""
        try:
            log.debug('add_results: %r', objs)
        except Exception:
            log.debug('add_results: (unprintable object)')
        self.results.extend(objs)

    def start_type(self, pos, type):
        """Open a nested container of kind ``type`` starting at ``pos``."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug('start_type: pos=%r, type=%r', pos, type)

    def end_type(self, type):
        """Close the innermost container and return ``(pos, objects)``.

        Raises PSTypeError when the container being closed is not of
        kind ``type``.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: {!r} != {!r}'
                              .format(self.curtype, type))
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        """Hook for subclasses; called for every non-structural keyword."""
        return

    def nextobject(self):
        """Yields a list of objects.

        Arrays and dictionaries are represented as Python lists and
        dictionaries.

        :return: keywords, literals, strings, numbers, arrays and dictionaries.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        error_msg = 'Invalid dictionary construct: %r' % objs
                        raise PSSyntaxError(error_msg)
                    # Entries whose value is None are left out.
                    d = {literal_name(k): v
                         for (k, v) in choplist(2, objs) if v is not None}
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
            else:
                log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
                          token, self.curstack)
                self.do_keyword(pos, token)
                # A bare ``raise`` here has no active exception and would
                # surface as an unrelated RuntimeError; raise the parser's
                # own syntax error instead.
                raise PSSyntaxError('unknown token: pos=%r, token=%r'
                                    % (pos, token))
            if self.context:
                # Still inside an array/dict/proc: keep collecting.
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug('nextobject: %r', obj)
        except Exception:
            log.debug('nextobject: (unprintable object)')
        return obj
|
||||
@@ -1,40 +0,0 @@
|
||||
#
|
||||
# RunLength decoder (Adobe version) implementation based on PDF Reference
|
||||
# version 1.4 section 3.3.4.
|
||||
#
|
||||
# * public domain *
|
||||
#
|
||||
|
||||
|
||||
def rldecode(data):
    """
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    :param data: encoded bytes.
    :return: decoded bytes.  A stream that is cut off in the middle of a
        run yields whatever could be decoded instead of raising IndexError.
    """
    # A bytearray keeps appends linear; the result is converted once.
    decoded = bytearray()
    size = len(data)
    i = 0
    while i < size:
        length = data[i]
        if length == 128:
            # End-of-data marker.
            break
        if length < 128:
            # Literal run: copy the next length + 1 bytes.  Slicing simply
            # returns fewer bytes if the stream is truncated.
            end = i + 1 + length + 1
            decoded += data[i + 1:end]
            i = end
        else:
            # Repeat run: the next byte is copied 257 - length times.
            if i + 1 >= size:
                break
            decoded += data[i + 1:i + 2] * (257 - length)
            i += 2
    return bytes(decoded)
|
||||
@@ -1 +0,0 @@
|
||||
# Strictness switch, off by default.
# NOTE(review): presumably this is the flag the parser reads as
# `settings.STRICT` to decide between raising and tolerating malformed
# input — confirm against the importing modules.
STRICT = False
|
||||
@@ -1,406 +0,0 @@
|
||||
"""
|
||||
Miscellaneous Routines.
|
||||
"""
|
||||
import io
|
||||
import pathlib
|
||||
import struct
|
||||
from html import escape
|
||||
|
||||
import chardet # For str encoding detection
|
||||
|
||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
||||
# still uses 32 bits ints
|
||||
INF = (1 << 31) - 1
|
||||
|
||||
|
||||
class open_filename(object):
    """
    Context manager accepting either a path or an already-open file.

    A ``str`` or ``pathlib.PurePath`` is opened with ``open`` (extra
    arguments are forwarded) and closed again on exit.  An ``io.IOBase``
    instance is passed through and left open.  Anything else raises
    TypeError.
    """

    def __init__(self, filename, *args, **kwargs):
        target = filename
        if isinstance(target, pathlib.PurePath):
            target = str(target)

        owns_handle = isinstance(target, str)
        if owns_handle:
            handle = open(target, *args, **kwargs)
        elif isinstance(target, io.IOBase):
            handle = target
        else:
            raise TypeError('Unsupported input type: %s' % type(target))

        self.file_handler = handle
        # Only files opened here are closed in __exit__.
        self.closing = owns_handle

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Never suppress exceptions raised inside the with-block.
        return False
|
||||
|
||||
|
||||
def make_compat_bytes(in_str):
    """Encode a ``str`` to bytes using UTF-8.

    The argument must be a ``str``; this is checked with an assertion
    whose message is the offending type.
    """
    kind = type(in_str)
    assert isinstance(in_str, str), str(kind)
    return in_str.encode('utf-8')
|
||||
|
||||
|
||||
def make_compat_str(in_str):
    """Converts to string, guessing encoding.

    ``str`` input is returned unchanged.  ``bytes`` input is decoded with
    the encoding guessed by ``chardet``.  When no encoding is guessed
    (``encoding`` is None, e.g. for empty input), or the guess is unknown
    to Python or fails to decode the data, the bytes are decoded as UTF-8
    with undecodable sequences replaced, so the call does not raise.
    """
    assert isinstance(in_str, (bytes, str)), str(type(in_str))
    if isinstance(in_str, bytes):
        guess = chardet.detect(in_str)
        encoding = guess.get('encoding') or 'utf-8'
        try:
            in_str = in_str.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown guess: fall back to a lossy UTF-8 decode.
            in_str = in_str.decode('utf-8', 'replace')
    return in_str
|
||||
|
||||
|
||||
def shorten_str(s, size):
    """Shorten ``s`` to roughly ``size`` characters.

    For ``size`` below 7 the string is simply cut.  Otherwise a string
    longer than ``size`` keeps its head and tail, joined by ``' ... '``;
    shorter strings are returned unchanged.
    """
    if size < 7:
        return s[:size]
    if len(s) <= size:
        return s
    # 5 characters are reserved for the ' ... ' separator.
    keep = (size - 5) // 2
    head, tail = s[:keep], s[-keep:]
    return '{} ... {}'.format(head, tail)
|
||||
|
||||
|
||||
def compatible_encode_method(bytesorstring, encoding='utf-8',
                             erraction='ignore'):
    """Return text for either ``str`` or ``bytes`` input.

    A ``str`` is passed through untouched; ``bytes`` are decoded with
    ``encoding`` using the ``erraction`` error handler.  Any other type
    fails the assertion.
    """
    if not isinstance(bytesorstring, str):
        assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
        return bytesorstring.decode(encoding, erraction)
    return bytesorstring
|
||||
|
||||
|
||||
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
    """Reverse PNG row filtering on ``data``.

    Each row in ``data`` is one filter-type byte followed by
    ``colors * columns * bitspercomponent // 8`` filtered bytes.

    :param pred: unused; kept for interface compatibility.
    :param colors: number of colour components per pixel.
    :param columns: number of pixels per row.
    :param bitspercomponent: bits per component; only 8 is supported.
    :param data: filtered bytes.
    :return: reconstructed bytes with the filter-type bytes removed.
    :raises ValueError: for an unsupported ``bitspercomponent`` or an
        unknown filter type.

    Filter types 0 (None), 1 (Sub), 2 (Up), 3 (Average) and 4 (Paeth) are
    handled.  "Left" neighbours are taken one whole pixel (``colors``
    bytes) back, and the row above the first row is all zeros across the
    full row width.
    """
    if bitspercomponent != 8:
        # unsupported
        raise ValueError("Unsupported `bitspercomponent': %d" %
                         bitspercomponent)
    nbytes = colors * columns * bitspercomponent // 8
    # Distance in bytes between a byte and the same component of the
    # previous pixel.
    bpp = max(1, colors * bitspercomponent // 8)
    out = bytearray()
    prior = bytes(nbytes)
    for start in range(0, len(data), nbytes + 1):
        ft = data[start]
        encoded = data[start + 1:start + 1 + nbytes]
        if ft == 0:
            # PNG none
            row = bytearray(encoded)
        elif ft in (1, 2, 3, 4):
            row = bytearray()
            for j, x in enumerate(encoded):
                left = row[j - bpp] if j >= bpp else 0
                up = prior[j] if j < len(prior) else 0
                if ft == 1:
                    # PNG sub
                    predictor = left
                elif ft == 2:
                    # PNG up
                    predictor = up
                elif ft == 3:
                    # PNG average
                    predictor = (left + up) // 2
                else:
                    # PNG Paeth: choose the neighbour closest to
                    # left + up - upleft, preferring left, then up.
                    upleft = prior[j - bpp] \
                        if bpp <= j < len(prior) + bpp else 0
                    estimate = left + up - upleft
                    d_left = abs(estimate - left)
                    d_up = abs(estimate - up)
                    d_upleft = abs(estimate - upleft)
                    if d_left <= d_up and d_left <= d_upleft:
                        predictor = left
                    elif d_up <= d_upleft:
                        predictor = up
                    else:
                        predictor = upleft
                row.append((x + predictor) & 255)
        else:
            # unsupported
            raise ValueError("Unsupported predictor value: %d" % ft)
        out += row
        prior = row
    return bytes(out)
|
||||
|
||||
|
||||
# Matrix operations
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
|
||||
def mult_matrix(m1, m0):
    """Returns the multiplication of two matrices.

    Both arguments are 6-tuples ``(a, b, c, d, e, f)``; the result is a
    6-tuple in the same layout.
    """
    a1, b1, c1, d1, e1, f1 = m1
    a0, b0, c0, d0, e0, f0 = m0
    a = a0 * a1 + c0 * b1
    b = b0 * a1 + d0 * b1
    c = a0 * c1 + c0 * d1
    d = b0 * c1 + d0 * d1
    e = a0 * e1 + c0 * f1 + e0
    f = b0 * e1 + d0 * f1 + f0
    return (a, b, c, d, e, f)
|
||||
|
||||
|
||||
def translate_matrix(m, v):
    """Translates a matrix by (x, y).

    The four linear components are returned unchanged; only the offset
    pair ``(e, f)`` is updated.
    """
    a, b, c, d, e, f = m
    x, y = v
    shifted_e = x * a + y * c + e
    shifted_f = x * b + y * d + f
    return a, b, c, d, shifted_e, shifted_f
|
||||
|
||||
|
||||
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    ``m`` is a 6-tuple ``(a, b, c, d, e, f)`` and ``v`` a pair
    ``(x, y)``; the transformed pair is returned.
    """
    a, b, c, d, e, f = m
    x, y = v
    new_x = a * x + c * y + e
    new_y = b * x + d * y + f
    return new_x, new_y
|
||||
|
||||
|
||||
def apply_matrix_norm(m, v):
    """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    Only the linear part of the matrix is applied; the offsets ``e`` and
    ``f`` do not contribute.
    """
    a, b, c, d, _e, _f = m
    p, q = v
    dx = a * p + c * q
    dy = b * p + d * q
    return dx, dy
|
||||
|
||||
|
||||
# Utility functions
|
||||
|
||||
def isnumber(x):
    """Tell whether ``x`` is an ``int`` or a ``float`` (bools included)."""
    numeric_types = (int, float)
    return isinstance(x, numeric_types)
|
||||
|
||||
|
||||
def uniq(objs):
    """Eliminates duplicated elements.

    Generator that yields each hashable element once, in order of first
    appearance.
    """
    seen = set()
    for item in objs:
        if item not in seen:
            seen.add(item)
            yield item
|
||||
|
||||
|
||||
def fsplit(pred, objs):
    """Split a list into two classes according to the predicate.

    Returns ``(accepted, rejected)``: two lists that keep the input
    order, holding the items for which ``pred`` is truthy and falsy.
    """
    accepted, rejected = [], []
    for item in objs:
        bucket = accepted if pred(item) else rejected
        bucket.append(item)
    return accepted, rejected
|
||||
|
||||
|
||||
def drange(v0, v1, d):
    """Returns a discrete range.

    The result covers the indices of the ``d``-sized cells touched by
    the interval from ``v0`` to ``v1``.
    """
    first = int(v0) // d
    stop = int(v1 + d) // d
    return range(first, stop)
|
||||
|
||||
|
||||
def get_bound(pts):
    """Compute a minimal rectangle that covers all the points.

    Returns ``(x0, y0, x1, y1)``.  With no points the sentinel rectangle
    ``(INF, INF, -INF, -INF)`` is returned.
    """
    x0 = y0 = INF
    x1 = y1 = -INF
    for (x, y) in pts:
        if x < x0:
            x0 = x
        if y < y0:
            y0 = y
        if x > x1:
            x1 = x
        if y > y1:
            y1 = y
    return x0, y0, x1, y1
|
||||
|
||||
|
||||
def pick(seq, func, maxobj=None):
    """Picks the object obj where func(obj) has the highest value.

    On ties the earliest object wins.  ``maxobj`` is returned when
    ``seq`` is empty.
    """
    best_score = None
    for candidate in seq:
        value = func(candidate)
        if best_score is None or best_score < value:
            best_score = value
            maxobj = candidate
    return maxobj
|
||||
|
||||
|
||||
def choplist(n, seq):
    """Groups every n elements of the list.

    Generator yielding consecutive ``n``-tuples; leftover elements that
    do not fill a complete group are dropped.
    """
    group = []
    for item in seq:
        group.append(item)
        if len(group) == n:
            yield tuple(group)
            group = []
|
||||
|
||||
|
||||
def nunpack(s, default=0):
    """Unpacks 1 to 4 or 8 byte integers (big endian).

    An empty input returns ``default``; any other unsupported length
    raises TypeError.
    """
    size = len(s)
    if size == 0:
        return default
    if size == 1:
        return ord(s)
    if size == 3:
        # No 3-byte struct code exists: left-pad to four bytes.
        return struct.unpack('>L', b'\x00' + s)[0]
    fmt = {2: '>H', 4: '>L', 8: '>Q'}.get(size)
    if fmt is None:
        raise TypeError('invalid length: %d' % size)
    return struct.unpack(fmt, s)[0]
|
||||
|
||||
|
||||
# 256-character lookup string: PDFDocEncoding[byte] is the character that
# the byte decodes to.  Runs that map a code to itself are written as
# ranges; everything else is listed explicitly.
PDFDocEncoding = ''.join(map(chr, (
    # 0x00-0x15: identity
    *range(0x0000, 0x0016),
    # 0x16-0x17: both entries are 0x0017
    0x0017, 0x0017,
    # 0x18-0x1f
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
    # 0x20-0x7e: identity
    *range(0x0020, 0x007f),
    # 0x7f
    0x0000,
    # 0x80-0x9e
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e,
    # 0x9f
    0x0000,
    # 0xa0
    0x20ac,
    # 0xa1-0xac: identity
    *range(0x00a1, 0x00ad),
    # 0xad
    0x0000,
    # 0xae-0xff: identity
    *range(0x00ae, 0x0100),
)))
|
||||
|
||||
|
||||
def decode_text(s):
    """Decodes a PDFDocEncoding string to Unicode.

    Input that starts with the byte-order mark ``FE FF`` is decoded as
    UTF-16BE instead, ignoring undecodable data; all other input is
    mapped byte by byte through the PDFDocEncoding table.
    """
    bom = b'\xfe\xff'
    if not s.startswith(bom):
        return ''.join(map(PDFDocEncoding.__getitem__, s))
    return str(s[2:], 'utf-16be', 'ignore')
|
||||
|
||||
|
||||
def enc(x):
    """Encodes a string for SGML/XML/HTML

    ``bytes`` input produces an empty string; text is passed through
    ``html.escape``.
    """
    return '' if isinstance(x, bytes) else escape(x)
|
||||
|
||||
|
||||
def bbox2str(bbox):
    """Format a 4-tuple bounding box as ``x0,y0,x1,y1``, 3 decimals each."""
    x0, y0, x1, y1 = bbox
    template = '{:.3f},{:.3f},{:.3f},{:.3f}'
    return template.format(x0, y0, x1, y1)
|
||||
|
||||
|
||||
def matrix2str(m):
    """Format a 6-tuple matrix as ``[a,b,c,d, (e,f)]``, 2 decimals each."""
    a, b, c, d, e, f = m
    template = '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'
    return template.format(a, b, c, d, e, f)
|
||||
|
||||
|
||||
def vecBetweenBoxes(obj1, obj2):
    """A distance function between two TextBoxes.

    Consider the bounding rectangle for obj1 and obj2.
    Return the vector between the boundaries of the two boxes if they do
    not overlap, otherwise return the vector between the box centers.

    +------+..........+ (x1, y1)
    | obj1 |          :
    +------+www+------+
    :          | obj2 |
    (x0, y0) +..........+------+

    Both arguments need ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
    ``height`` attributes.
    """
    # Corners of the rectangle that encloses both boxes.
    left = min(obj1.x0, obj2.x0)
    bottom = min(obj1.y0, obj2.y0)
    right = max(obj1.x1, obj2.x1)
    top = max(obj1.y1, obj2.y1)
    outer_w = right - left
    outer_h = top - bottom
    # Gap left once both boxes are subtracted; negative means overlap
    # along that axis.
    gap_w = outer_w - obj1.width - obj2.width
    gap_h = outer_h - obj1.height - obj2.height
    if not (gap_w < 0 and gap_h < 0):
        return max(0, gap_w), max(0, gap_h)
    # Overlapping on both axes: use the vector between the centers.
    cx1 = (obj1.x0 + obj1.x1) / 2
    cy1 = (obj1.y0 + obj1.y1) / 2
    cx2 = (obj2.x0 + obj2.x1) / 2
    cy2 = (obj2.y0 + obj2.y1) / 2
    return cx1 - cx2, cy1 - cy2
|
||||
|
||||
|
||||
class Plane:
    """A set-like data structure for objects placed on a plane.

    Can efficiently find objects in a certain rectangular area.
    Objects are bucketed into square grid cells of ``gridsize`` units
    that cover the plane's bounding box; an insertion-ordered list and a
    membership set are kept alongside the grid.
    """

    def __init__(self, bbox, gridsize=50):
        self._seq = []  # preserve the object order.
        self._objs = set()
        self._grid = {}
        self.gridsize = gridsize
        self.x0, self.y0, self.x1, self.y1 = bbox

    def __repr__(self):
        return '<Plane objs=%r>' % list(self)

    def __iter__(self):
        # _seq is never pruned, so removed objects are filtered out here.
        members = self._objs
        return (obj for obj in self._seq if obj in members)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the ``(grid_x, grid_y)`` cells overlapped by ``bbox``.

        Nothing is yielded when ``bbox`` lies outside the plane.
        """
        (x0, y0, x1, y1) = bbox
        if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
            return
        # Clip the box to the plane before mapping it to cells.
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        step = self.gridsize
        for grid_y in drange(y0, y1, step):
            for grid_x in drange(x0, x1, step):
                yield (grid_x, grid_y)

    def extend(self, objs):
        """Place every object of ``objs``."""
        for obj in objs:
            self.add(obj)

    def add(self, obj):
        """place an object."""
        for cell in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(cell, []).append(obj)
        self._seq.append(obj)
        self._objs.add(obj)

    def remove(self, obj):
        """displace an object."""
        for cell in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            bucket = self._grid.get(cell)
            if bucket is not None and obj in bucket:
                bucket.remove(obj)
        self._objs.remove(obj)

    def find(self, bbox):
        """finds objects that are in a certain area."""
        (x0, y0, x1, y1) = bbox
        visited = set()
        for cell in self._getrange(bbox):
            for obj in self._grid.get(cell, ()):
                if obj in visited:
                    continue
                visited.add(obj)
                outside = (obj.x1 <= x0 or x1 <= obj.x0
                           or obj.y1 <= y0 or y1 <= obj.y0)
                if not outside:
                    yield obj
|
||||
Binary file not shown.
Binary file not shown.
-177
@@ -1,177 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# ANSI colour escapes used when printing results.
RED=$'\x1b[31m'
GREEN=$'\x1b[32m'
GREY=$'\x1b[90m'
RESET=$'\x1b[39m'

# Both the needle and the directory to search are required; everything
# after them is forwarded to find(1).  Usage goes to stderr with a
# non-zero status so callers can tell a usage error from an empty result.
if [[ $# -lt 2 ]]; then
  {
    echo "$0 'needle' where/ [/usr/bin/find options]"
    echo "example: $0 's3cr3t' /mnt/share/ -size -10M ! -iname '*.wav' ! -iname '*.mp3'"
  } >&2
  exit 1
fi
|
||||
|
||||
#######################################
# Re-run this script on an unpacked directory.
# The child is started through its resolved path rather than through a
# symlink placed inside the unpacked tree, so nothing that was extracted
# from an archive is ever executed.
# Globals:   opts (read) - extra find(1) options forwarded to the child
# Arguments: $1 - needle to search for
#            $2 - directory to search; the child runs with it as cwd
# Outputs:   whatever the child search prints
# Returns:   non-zero if the script path cannot be resolved or the
#            directory cannot be entered, else the child's status
#######################################
function fork(){
  local needle="$1"
  local tempdir="$2"
  local self
  self=$(realpath -- "$0") || return
  # Subshell keeps the caller's working directory untouched.
  ( cd -- "$tempdir" && bash -- "$self" "$needle" "." "${opts[@]}" )
}
|
||||
|
||||
needle="$1"
shift
where="$1"
shift
opts=("$@")

#######################################
# Print a coloured "[label] path" header followed by the matching lines
# when the given text contains the needle (case-insensitive; binary data
# is treated as text).
# Globals:   needle, GREEN, RESET (read)
# Arguments: $1 - short type label, $2 - file path, $3 - text to search
# Outputs:   header and matching lines on stdout; nothing without a match
#######################################
report_matches() {
  local label=$1 path=$2 content=$3
  # -e keeps a needle that starts with '-' from being parsed as an option.
  if grep -q -ai -e "$needle" <<<"$content"; then
    printf '%s [%s] %s %s\n' "$GREEN" "$label" "$path" "$RESET"
    grep -ai --color=auto -e "$needle" <<<"$content"
  fi
}

# Walk every regular file below $where, convert it to text with a tool
# chosen by MIME type, and report the lines that contain the needle.
# Archives and mail messages are additionally unpacked into a temporary
# directory and searched recursively through fork().
find "$where" "${opts[@]}" -type f -print 2> /dev/null |
while IFS= read -r path; do
  mime=$(file -bi -- "$path")
  # "type/subtype; charset=x" -> "type/subtype;"
  mime=${mime%' '*}
  case $mime in
    */xml\;)
      report_matches xml "$path" "$(cat -- "$path")"
      ;;
    */*html*)
      codepage=$(uchardet "$path")
      report_matches html "$path" \
        "$(iconv -f "$codepage" < "$path" | lynx -nolist -dump -stdin)"
      ;;
    text/*|*/*script\;)
      report_matches text "$path" "$(cat -- "$path")"
      ;;
    application/msword\;)
      report_matches doc "$path" "$(catdoc "$path")"
      ;;
    application/vnd.openxmlformats-officedocument.wordprocessingml.document\;)
      report_matches docx "$path" \
        "$(unzip -p "$path" | grep -a '<w:r' | sed 's/<w:p[^<\/]*>/ /g' | sed 's/<[^<]*>//g' | grep -a -v '^[[:space:]]*$' | sed G)"
      ;;
    application/vnd.ms-excel\;)
      report_matches xls "$path" "$(xls2csv -x "$path")"
      ;;
    application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\;)
      report_matches xlsx "$path" \
        "$(unzip -p "$path" | grep -a -e '<si><t>' -e '<vt:lpstr>' | sed 's/<[^<\/]*>/ /g' | sed 's/<[^<]*>//g')"
      ;;
    application/pdf\;)
      report_matches pdf "$path" "$(pdf2txt -t text "$path" 2> /dev/null)"
      ;;
    application/x-executable\;|application/x*dos*)
      report_matches exe "$path" "$(rabin2 -z "$path" 2> /dev/null)"
      ;;
    # The MIME string always ends in ';' here, so the shared-library
    # pattern needs it too; x-executable is already handled above.
    application/x-object\;|application/x-sharedlib\;)
      report_matches elf "$path" "$(rabin2 -z "$path" 2> /dev/null)"
      ;;
    application/*compressed*|application/*zip*|application/*rar*|application/*tar*|application/*gzip*)
      # stdin is redirected so 7z can never consume the path list that
      # the loop is reading (e.g. when it prompts for a password).
      report_matches archive "$path" \
        "$(7z l "$path" < /dev/null | tail -n +13)"
      temp=$(mktemp -d) || continue
      mkdir -p -- "$temp/$path"
      7z x "$path" -o"$temp/$path" < /dev/null > /dev/null 2>&1
      fork "$needle" "$temp" < /dev/null
      rm -r -- "$temp"
      ;;
    image/*)
      #content=$(tesseract "$path" stdout -l eng; tesseract "$path" stdout -l rus)
      report_matches img "$path" "$(identify -verbose "$path" 2> /dev/null)"
      ;;
    message/*)
      report_matches message "$path" "$(mu view "$path")"
      temp=$(mktemp -d) || continue
      mkdir -p -- "$temp/$path"
      cp -- "$path" "$temp/$path/"
      # Unpack the attachments next to the copy, then drop the copy so
      # only the extracted parts are searched.
      munpack -t -f -C "$(realpath -- "$temp/$path")" "$(basename -- "$path")" > /dev/null
      rm -- "$temp/$path/$(basename -- "$path")"
      fork "$needle" "$temp" < /dev/null
      rm -r -- "$temp"
      ;;
    application/octet-stream\;)
      # Raw binary data is deliberately skipped.
      #report_matches raw "$path" "$(strings "$path")"
      :
      ;;
    application/x-raw-disk-image\;)
      report_matches disk "$path" "$(binwalk "$path")"
      ;;
    *)
      # -b omits the file name, so a path containing "text" cannot
      # influence the decision.
      if file -b -- "$path" | grep -q text; then
        report_matches unknown "$path" "$(cat -- "$path")"
      else
        report_matches unknown "$path" "$(strings "$path")"
      fi
      ;;
  esac
done
|
||||
@@ -1,32 +0,0 @@
|
||||
#!/bin/bash
# Search the "words" table of a SQLite database: print the uri of every row
# whose text contains QUERY, then print the matching text fragments.
#
# Usage: search.sh [opts] words.db QUERY
#   -c count    maximum number of rows (default 10)
#   -o offset   number of rows to skip (default 0)

GREEN=$'\x1b[32m'
RESET=$'\x1b[39m'

LIMIT=10
# Start at the first row; a default of 1 would silently drop the best hit.
OFFSET=0

while getopts "c:o:" opt
do
  case $opt in
    c) LIMIT=$OPTARG;;
    o) OFFSET=$OPTARG;;
  esac
done

# At least the database and one query word must follow the options.
[[ $(($#-$OPTIND)) -lt 1 ]] && {
  # Quoted so that "[opts]" is never treated as a glob pattern.
  echo "$0 [opts] words.db QUERY"
  echo "opts:"
  echo " -c count"
  echo " -o offset"
  exit
}

# LIMIT and OFFSET are interpolated into SQL, so accept plain integers only.
for number in "$LIMIT" "$OFFSET"
do
  [[ $number =~ ^[0-9]+$ ]] || {
    echo "count and offset must be non-negative integers" >&2
    exit 1
  }
done

DB="${@:$OPTIND:1}"
# Drop the options and the database name; what remains is the query.
shift $OPTIND
QUERY="$*"

# Double every single quote so the query cannot terminate the SQL literal.
quote="'"
SQL_QUERY=${QUERY//$quote/$quote$quote}

printf '%s\n' "$GREEN"
#echo "SELECT uri FROM words WHERE text MATCH '$*' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB"
printf '%s\n' "SELECT uri FROM words WHERE text LIKE '%$SQL_QUERY%' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB"
printf '%s\n' "$RESET"
#echo "SELECT text FROM words WHERE text MATCH '$*' limit $LIMIT offset $OFFSET;" | sqlite3 "$DB" | grep -i -o -P ".{0,100}$*..{0,100}" | grep -i --color=auto "$*"
# The first grep cuts out up to 100 characters of context around each hit
# (the extra "." requires at least one character after it); the second grep
# only colours the hit. "-e" keeps a query starting with "-" from being
# parsed as a grep option.
printf '%s\n' "SELECT text FROM words WHERE text LIKE '%$SQL_QUERY%' limit $LIMIT offset $OFFSET;" \
  | sqlite3 "$DB" \
  | grep -i -o -P ".{0,100}$QUERY..{0,100}" \
  | grep -i --color=auto -e "$QUERY"
|
||||
@@ -1,21 +0,0 @@
|
||||
#!/bin/bash
# Thin wrapper around wget: mirrors a site recursively (save) or only lists
# its URLs (crawl), printing every URL wget reports.
# All command-line arguments are handed to wget unchanged.

USERAGENT="Mozilla"
# Extensions that save() refuses to download.
IGNORE_EXT="gif,GIF,jpg,JPG,png,PNG,ico,ICO,svg,SVG,woff,ttf,eot"

if [ $# -lt 1 ]; then
  echo "$0 url [/usr/bin/wget options]"
  echo "example: $0 --level 5 --wait 2 --domains www.site.com --quota=10000000 -A html,php -R pdf,jpg -X uploads --no-parent http://site.com/path/to"
  exit
fi

# Walk the site in spider mode and print each URL found.
# Arguments: wget options and the start URL.
function crawl(){
  # "$@" keeps every argument intact; an unquoted $* would re-split on
  # whitespace and expand glob characters inside URLs or options.
  wget --no-check-certificate --recursive --spider -e robots=off -U "$USERAGENT" -O "/tmp/spider" --no-verbose "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

# Download the site recursively (skipping IGNORE_EXT) and print each URL.
# Arguments: wget options and the start URL.
function save(){
  wget --no-check-certificate --recursive -N -e robots=off -U "$USERAGENT" --no-verbose -R "$IGNORE_EXT" "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

#crawl "$@"
save "$@"
|
||||
Executable
+194
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/python3
|
||||
import csv
|
||||
import json
|
||||
from hashlib import md5
|
||||
from opensearchpy import OpenSearch
|
||||
from os import path
|
||||
from datetime import datetime
|
||||
from colorama import Fore
|
||||
import argparse
|
||||
|
||||
|
||||
# Hard-coded HTTP basic-auth credentials handed to the client below.
CREDS = ('admin', 'admin')

parser = argparse.ArgumentParser( description='search machine control tool' )
# nargs="?" makes the address optional; without it argparse never applies
# the declared default to a positional argument.
parser.add_argument("opensearch", nargs="?", type=str, default="localhost:9200", help="opensearch address (localhost:9200)")
parser.add_argument("-i", "--index", type=str, metavar="index", default="", help="index where to search")
parser.add_argument("-o", "--offset", type=int, metavar="offset", default=0, help="offset results in query")
parser.add_argument("-c", "--count", type=int, metavar="count", default=10, help="count results in query")
parser.add_argument("-init", action="store_true", help="init index")
parser.add_argument("-drop", action="store_true", help="drop index")
parser.add_argument("-import", dest="file_import", metavar="input.csv", help="import data")
parser.add_argument("-delete", dest="file_delete", metavar="input.csv", help="delete data")
parser.add_argument("-query", metavar="query", help="search query")
parser.add_argument("-cache", metavar="cache", help="get cache of a document")
args = parser.parse_args()

# Accept "host:port" as well as a bare "host"; a missing port falls back to
# the port used in the default address.
host, _, port = args.opensearch.partition(":")
port = int(port) if port else 9200

# TLS is on, but certificate and hostname verification are disabled and the
# related warnings are silenced.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True,
    http_auth = CREDS,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)
|
||||
|
||||
def indexes():
    """Print each index matched by "*" next to what `cat.count` reports for it."""
    names = client.indices.get("*")
    for name in names:
        counted = client.cat.count(name)
        print(name, counted)
|
||||
|
||||
def info(index):
    """Print the settings of `index` as JSON indented by four spaces."""
    settings = client.indices.get_settings(index=index)
    rendered = json.dumps(settings, indent=4)
    print(rendered)
    # The mapping could be rendered the same way:
    #json.dumps(client.indices.get_mapping(index=index))
|
||||
|
||||
def init(index):
    """Create `index` and print the server response.

    Every document field is mapped as "text". The index settings define two
    custom analyzers ("russian" and "autocomplete") and the two token
    filters they rely on.
    """
    field_names = ("timestamp", "inurl", "site", "ext", "intitle", "intext", "filetype")
    mappings = {
        "properties": {field: {"type": "text"} for field in field_names}
    }

    analyzers = {
        "russian": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "russian_stop"],
        },
        "autocomplete": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "russian_stop", "autocomplete_filter"],
        },
    }
    token_filters = {
        "russian_stop": {"type": "stop", "stopwords": "_russian_"},
        "autocomplete_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
    }

    body = {
        "mappings": mappings,
        "settings": {
            "analysis": {
                "analyzer": analyzers,
                "filter": token_filters,
            }
        },
    }

    created = client.indices.create(index, body=body)
    print(created)
|
||||
|
||||
def add(index, source):
    """Index every row of the CSV file `source` into `index`.

    Each row must hold at least five columns: timestamp (Unix seconds),
    file path, extension, file type and content; extra columns are ignored.
    The document id is the MD5 hex digest of the file path, so importing the
    same path again overwrites the earlier document. The "site" field is the
    CSV file name without its extension.

    A row that cannot be parsed or indexed is reported on stdout and skipped.
    """
    csv.field_size_limit(2**32)
    site = path.splitext(path.basename(source))[0]
    # The context manager closes the file even when iteration stops early.
    with open(source, errors="surrogateescape") as stream:
        reader = csv.reader(stream, delimiter=',', quotechar='"')
        for row in reader:
            try:
                timestamp,filepath,ext,filetype,content,*_ = row

                document = {
                    "timestamp": datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S'),
                    "inurl": filepath,
                    "site": site,
                    "ext": ext,
                    "intitle": "",
                    "intext": content,
                    "filetype": filetype
                }

                client.index(
                    index = index,
                    id = md5(filepath.encode()).hexdigest(),
                    body = document,
                    refresh = True
                )
            except Exception as e:
                # Best effort: one bad row must not abort the whole import.
                print(str(e))
|
||||
|
||||
def query(index, text):
    """Search `index` for `text` and print the hits.

    The search is a `query_string` query over inurl, intitle and intext
    (boosted 100/50/5) with AND as default operator. Paging comes from the
    command line: `args.count` results starting at `args.offset`.

    For every hit one line shows the (highlighted, when available) inurl
    value and the document id, followed by a line with the highlighted
    intext fragments joined by " ... ".
    """
    body = {
        "size": args.count,
        "from": args.offset,
        "query": {
            "query_string": {
                "query": text,
                "fields": ["inurl^100","intitle^50","intext^5"],
                "default_operator": "AND",
                "fuzziness": "AUTO",
                "analyzer": "russian"
            }
        },
        "highlight": {
            "order": "score",
            "fields": {
                "*": {
                    # Terminal colour codes act as highlight markers.
                    "pre_tags" : [ Fore.RED ],
                    "post_tags" : [ Fore.RESET ],
                    "fragment_size": 50,
                    "number_of_fragments": 3
                }
            }
        }
    }

    response = client.search(
        index = index,
        body = body
    )
    for result in response['hits']['hits']:
        # A hit may come back without a 'highlight' section; fall back to
        # the stored values instead of raising KeyError.
        highlight = result.get('highlight', {})
        inurl = highlight.get('inurl')
        print("{G}{uri} {B}{cache}{R}".format(
            uri=inurl[0] if inurl else result['_source']['inurl'],
            cache=result['_id'],
            G=Fore.GREEN, B=Fore.LIGHTBLACK_EX, R=Fore.RESET))
        print(" ... ".join(highlight.get('intext',[])))
|
||||
|
||||
def cache(index, _id):
    """Print the stored "intext" of document `_id` from `index`.

    The lookup used to ignore `index` and always read from "test"; that
    name remains the fallback when no index is given, so invocations
    without an index keep working.
    """
    result = client.get(index=index or 'test', id=_id)
    print(result["_source"]["intext"])
|
||||
|
||||
def delete(index, source):
    """Delete from `index` every document listed in the CSV file `source`.

    Rows use the same layout as for `add` (at least five columns); only the
    second column, the file path, is used: its MD5 hex digest is the
    document id. Each server response is printed. A row that cannot be
    parsed or deleted is reported on stdout and skipped.
    """
    csv.field_size_limit(2**32)
    # The context manager closes the file even when iteration stops early.
    with open(source, errors="surrogateescape") as stream:
        reader = csv.reader(stream, delimiter=',', quotechar='"')
        for row in reader:
            try:
                # Still insist on five columns, but bind only the path.
                _, filepath, _, _, _, *_ = row
                response = client.delete(
                    index = index,
                    id = md5(filepath.encode()).hexdigest(),
                )
                print(response)
            except Exception as e:
                # Best effort: one bad row must not abort the whole run.
                print(str(e))
|
||||
|
||||
def drop(index):
    """Delete the whole `index` and print the server response."""
    outcome = client.indices.delete(index=index)
    print(outcome)
|
||||
|
||||
# Run the first requested action, checked in this fixed order. With no
# action given, show the settings of the selected index or list all indexes.
_ACTIONS = (
    (args.init,        lambda: init(index=args.index)),
    (args.drop,        lambda: drop(index=args.index)),
    (args.file_import, lambda: add(index=args.index, source=args.file_import)),
    (args.file_delete, lambda: delete(index=args.index, source=args.file_delete)),
    (args.query,       lambda: query(index=args.index, text=args.query)),
    (args.cache,       lambda: cache(index=args.index, _id=args.cache)),
)

for _requested, _run in _ACTIONS:
    if _requested:
        _run()
        break
else:
    if not args.index:
        indexes()
    else:
        info(index=args.index)
|
||||
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
# Search the "words" table of a SQLite database and print, for every matching
# row, its uri followed by the text fragments surrounding the query.
#
# Usage: search.sh [opts] words.db QUERY
#   -m match     characters of context around each hit (default 50)
#   -c count     maximum number of rows (default 10)
#   -o offset    number of rows to skip (default 0)
#   -u fragment  SQL LIKE pattern the uri must match (default '%')

GREEN=$'\x1b[32m'
RESET=$'\x1b[39m'

MATCH=50
LIMIT=10
OFFSET=0
URI='%'

while getopts "m:c:o:u:" opt
do
  case $opt in
    m) MATCH=$OPTARG;;
    c) LIMIT=$OPTARG;;
    o) OFFSET=$OPTARG;;
    u) URI=$OPTARG;;
  esac
done

# A query is mandatory unless the search is narrowed by a uri pattern.
[[ $(($#-$OPTIND)) -lt 1 ]] && [[ $URI = '%' ]] && {
  # Quoted so that "[opts]" is never treated as a glob pattern.
  echo "$0 [opts] words.db QUERY"
  echo "opts:"
  echo " -m match"
  echo " -c count"
  echo " -o offset"
  echo " -u fragment"
  exit
}

# These values are interpolated into SQL and into a regex quantifier, so
# accept plain integers only.
for number in "$MATCH" "$LIMIT" "$OFFSET"
do
  [[ $number =~ ^[0-9]+$ ]] || {
    echo "match, count and offset must be non-negative integers" >&2
    exit 1
  }
done

DB="${@:$OPTIND:1}"
# Drop the options and the database name; what remains is the query.
shift $OPTIND
# Join the words while IFS is still the default, i.e. with single spaces.
QUERY="$*"

# Double every single quote so user input cannot terminate a SQL literal.
quote="'"
SQL_QUERY=${QUERY//$quote/$quote$quote}
SQL_URI=${URI//$quote/$quote$quote}

# IFS is a set of characters, not a string: a separator such as '=%=' makes
# read split on any '=' or '%' inside the uri. The single ASCII unit
# separator avoids that, and is set for read only so nothing else changes.
SEP=$'\x1f'

printf '%s\n' "SELECT uri,text FROM words WHERE uri LIKE '$SQL_URI' and text LIKE '%$SQL_QUERY%' limit $LIMIT offset $OFFSET;" \
  | sqlite3 -separator "$SEP" "$DB" \
  | while IFS=$SEP read -r uri text
do
  printf '%s\n' "$GREEN$uri$RESET"
  # The first grep cuts out up to MATCH characters of context around each
  # hit (the extra "." requires at least one character after it); the second
  # grep only colours the hit. "-e" protects a query starting with "-".
  printf '%s\n' "$text" \
    | grep -i -o -P ".{0,$MATCH}$QUERY..{0,$MATCH}" \
    | grep -i --color=auto -e "$QUERY"
done
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
# Thin wrapper around the wget binary shipped in ./bin next to this script:
# mirrors a site recursively (save) or only lists its URLs (crawl), printing
# every URL wget reports.
# All command-line arguments are handed to wget unchanged.

USERAGENT="Mozilla"
# Extensions that save() refuses to download.
IGNORE_EXT="gif,GIF,jpg,JPG,png,PNG,ico,ICO,svg,SVG,woff,ttf,eot"

if [ $# -lt 1 ]; then
  echo "$0 url [/usr/bin/wget options]"
  echo "example: $0 --level 5 --wait 2 --domains www.site.com --limit-size=10000000 -A html,php -R pdf,jpg -X uploads --no-parent http://site.com/path/to"
  echo "example: $0 --level 2 --wait 1 --limit-size=500k ftp://target.com/"
  exit
fi

# Quoted so the script also works from a directory whose path has spaces.
WGET="$(dirname "$0")/bin/wget"

# Walk the site in spider mode and print each URL found.
# Arguments: wget options and the start URL.
function crawl(){
  # "$@" keeps every argument intact; an unquoted $* would re-split on
  # whitespace and expand glob characters inside URLs or options.
  "$WGET" --no-check-certificate --recursive --spider -e robots=off -U "$USERAGENT" -O "/tmp/spider" --no-verbose "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

# Download the site recursively (skipping IGNORE_EXT) and print each URL.
# Arguments: wget options and the start URL.
function save(){
  "$WGET" --no-check-certificate --recursive -N -e robots=off -U "$USERAGENT" --no-verbose -R "$IGNORE_EXT" "$@" 2>&1 \
    | sed -rn 's|.*URL:[ ]*([^ ]+).*|\1|p'
}

#crawl "$@"
save "$@"

#https://yurichev.com/wget.html
|
||||
@@ -1,7 +0,0 @@
|
||||
cd c:\path\to\crawl\windows
|
||||
.\crawl.ps1 ..\path\to > out.log
|
||||
.\grep.ps1 ..\path\to s3cr3t
|
||||
|
||||
cme smb -d dom -u adm -p pas -X '.\grep.ps1 c:\users s3cr3t > c:\grep.log' targets.txt
|
||||
sleep 3600
|
||||
cme smb -d dom -u adm -p pas -x 'type c:\grep.log' targets.txt
|
||||
@@ -1,66 +0,0 @@
|
||||
# Recursively search the files below a directory for a pattern.
# Usage: .\grep.ps1 <haystack-path> <needle>
#
# Each file's text is extracted by a helper script from .\lib, chosen by
# extension and run as a background job that is given $TIMEOUT seconds.
# Output per file:
#   [+] the extracted text matches the needle (matching lines follow)
#   [!] the extracted content compares equal to 0
#   [*] progress line for any other file
echo "begin $PID"
$ErrorActionPreference = 'SilentlyContinue'
$TIMEOUT=15
$haystack = $args[0]
$needle = $args[1]
$exts = @()
$exts += @("*.doc","*.docx")
$exts += @("*.xls","*.xlsx")
$exts += @("*.pdf")
$exts += @("*.zip")
$exts += @("*.txt","*.bat","*.vbs","*.ps1","*.reg","*.cfg","*.conf","*.xml","*.log")
#$exts += @("*.exe","*.dll")
$opts = @{
    "Path" = $haystack
    "Recurse" = $true
    "Include" = $exts
}

# Walk the tree once and keep files only; the same list provides both the
# total for the progress counter and the items to scan.
$items = @(Get-ChildItem @opts 2> $null | ? { -not $_.PSIsContainer })
$files = $items.Count

$i = 1
foreach($item in $items) {
    $file = @{}
    $file.name = $item.Name
    $file.path = $item.FullName
    $file.ext = $item.Extension
    $file.content = ""
    #echo "[*] $($file.path)"
    $job = $null
    # Patterns are anchored and the dot is escaped so that each arm matches
    # real extensions only. ".log" is handled as plain text: it is in $exts
    # above but previously had no arm, so log files were never read.
    switch -regex ($file.ext) {
        '^\.(txt|bat|vbs|ps1|reg|cfg|conf|xml|log)$' { $job = Start-Job -FilePath .\lib\plaintext.ps1 -argumentlist $file.path }
        '^\.doc' { $job = Start-Job -FilePath .\lib\word.ps1 -argumentlist $file.path }
        '^\.xls' { $job = Start-Job -FilePath .\lib\excel.ps1 -argumentlist $file.path }
        '^\.pdf$' { $job = Start-Job -FilePath .\lib\pdf.ps1 -argumentlist $file.path -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
        '^\.(zip|7z|tar|gz|gzip)$' { $job = Start-Job -FilePath .\lib\archive.ps1 -argumentlist $file.path,"grep.ps1",$needle -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
        '^\.(exe|dll)$' { $job = Start-Job -FilePath .\lib\executable.ps1 -argumentlist $file.path -Init ([ScriptBlock]::Create("Set-Location '$pwd'")) }
    }
    if($job)
    {
        # Take whatever the job produced within the timeout, then make sure
        # it is stopped and removed.
        Wait-Job -timeout $TIMEOUT $job > $null
        $file.content = Receive-Job $job
        #echo $file.content
        Stop-Job $job
        Remove-Job $job
    }
    if(echo $file.content | select-string $needle) {
        Write-Output "[+] [$i/$files] $($file.path)"
        echo $file.content | select-string -Pattern $needle
        #Write-Host -ForegroundColor green (echo $file.content | select-string -Pattern $needle)
        #highlight(echo $file.content | select-string $needle)
    }
    elseif($file.content -eq 0) {
        echo "[!] [$i/$files] $($file.path)"
    }
    elseif($i % 1 -eq 0) {
        # "$i % 1" is always 0, so every remaining file gets a progress line.
        echo "[*] [$i/$files] $($file.path)"
    }
    $i += 1
}
echo 'done'
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user