Diff
checker
Text
Text
Images
Documents
Excel
Folders
Legal
Enterprise
Desktop
Pricing
Sign in
Download Diffchecker Desktop
Compare text
Find the difference between two text files
Tools
History
Real-time editor
Hide unchanged lines
Disable line wrap
Layout
Split
Unified
Diff precision
Smart
Word
Char
Syntax highlighting
Choose syntax
Ignore
Transform text
Go to first change
Edit input
Diffchecker Desktop
The most secure way to run Diffchecker. Get the Diffchecker Desktop app: your diffs never leave your computer!
Get Desktop
camelot/handlers.py
Created
5 years ago
Diff never expires
Clear
Export
Share
Explain
42 removals
Lines
Total
Removed
Characters
Total
Removed
To continue using this feature, upgrade to
Diff
checker
Pro
View Pricing
169 lines
Copy
77 additions
Lines
Total
Added
Characters
Total
Added
To continue using this feature, upgrade to
Diff
checker
Pro
View Pricing
181 lines
Copy
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import os
import sys
import sys
Copy
Copied
Copy
Copied
from PyPDF2
import
PdfFileReader, PdfFileWriter
import
fitz
from .core import TableList
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice
from .utils import (
from .utils import (
TemporaryDirectory,
TemporaryDirectory,
get_page_layout,
get_page_layout,
get_text_objects,
get_text_objects,
get_rotation,
get_rotation,
is_url,
is_url,
download_url,
download_url,
)
)
class PDFHandler(object):
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
file into single page PDFs, parsing each PDF and then removing the
temp directory.
temp directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
password : str, optional (default: None)
Password for decryption.
Password for decryption.
"""
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None):
if is_url(filepath):
if is_url(filepath):
filepath = download_url(filepath)
filepath = download_url(filepath)
self.filepath = filepath
self.filepath = filepath
if not filepath.lower().endswith(".pdf"):
if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")
raise NotImplementedError("File format not supported")
if password is None:
if password is None:
self.password = ""
self.password = ""
else:
else:
self.password = password
self.password = password
if sys.version_info[0] < 3:
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages)
self.pages = self._get_pages(self.filepath, pages)
def _get_pages(self, filepath, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
"""Converts pages string to list of ints.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
pages : str, optional (default: '1')
Comma-separated page numbers.
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Example: '1,3,4' or '1,4-end' or 'all'.
Returns
Returns
-------
-------
P : list
P : list
List of int page numbers.
List of int page numbers.
"""
"""
page_numbers = []
page_numbers = []
if pages == "1":
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
page_numbers.append({"start": 1, "end": 1})
else:
else:
Copy
Copied
Copy
Copied
instream =
open(filepath
, "rb")
with fitz.
open(filepath
) as
infile
:
infile
= PdfFileReader(instream, strict=False)
if infile.
needsPass:
if infile.
isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
if pages == "all":
if pages == "all":
page_numbers.append({"start": 1, "end": infile.
pageCount
})
page_numbers.append({"start": 1, "end": infile.
getNumPages()
})
else:
else:
for r in pages.split(","):
for r in pages.split(","):
if "-" in r:
if "-" in r:
a, b = r.split("-")
a, b = r.split("-")
if b == "end":
if b == "end":
b = infile.
pageCount
b = infile.
getNumPages()
page_numbers.append(
page_numbers.append(
{"start": int(a), "end": int(b)})
{"start": int(a), "end": int(b)})
else:
else:
page_numbers.append(
{"start": int(r), "end": int(r)})
page_numbers.append(
instream.close()
{"start": int(r), "end": int(r)})
P = []
P = []
for p in page_numbers:
for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1))
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
"""Saves specified page from PDF into a temporary directory.
Parameters
Parameters
----------
----------
filepath : str
filepath : str
Filepath or URL of the PDF file.
Filepath or URL of the PDF file.
page : int
page : int
Page number.
Page number.
temp : str
temp : str
Tmp directory.
Tmp directory.
"""
"""
Copy
Copied
Copy
Copied
with
open(filepath, "rb") as fileobj:
with
fitz.open(filepath) as infile:
infile = PdfFileReader(fileobj, strict=False)
if infile.needsPass:
if infile.isEncrypted:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
froot, fext = os.path.splitext(fpath)
Copy
Copied
Copy
Copied
p = infile
.getPage(
page - 1
)
p = infile
[
page - 1
]
outfile =
PdfFileWriter
()
p.setRotation(0
)
outfile.
addPage(p)
outfile =
fitz.open
()
with open(fpath, "wb") as f:
outpage =
outfile.
newPage(-1, width=p.rect.width,
outfile.
write(f)
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=page-1)
outfile.
save(fpath)
layout, dim = get_page_layout(fpath)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
if rotation != "":
Copy
Copied
Copy
Copied
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
os.rename(fpath, fpath_new)
Copy
Copied
Copy
Copied
instream = open(fpath_new, "rb")
infile =
fitz.open(fpath_new
)
infile =
PdfFileReader(instream, strict=False
)
if infile.
needsPass
:
if infile.
isEncrypted
:
infile.
authenticate
(self.password)
infile.
decrypt
(self.password)
outfile =
fitz.open
()
outfile =
PdfFileWriter
()
p = infile
[0]
p = infile
.getPage(0)
outpage = outfile.newPage(-1, width=p.rect.width,
height=p.rect.height)
outpage.showPDFpage(outpage.rect, infile, pno=0)
if rotation == "anticlockwise":
if rotation == "anticlockwise":
Copy
Copied
Copy
Copied
p.rotateClockwise(
90)
outpage.setRotation((p.rotation +
90)
% 360)
elif rotation == "clockwise":
elif rotation == "clockwise":
Copy
Copied
Copy
Copied
p.rotateCounterClockwise(90)
outpage.setRotation((p.rotation + 270) % 360)
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.save(fpath)
outfile.write(f)
instream.close()
def parse(
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
):
"""Extracts tables by calling parser.get_tables on all single
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
page PDFs.
Parameters
Parameters
----------
----------
flavor : str (default: 'lattice')
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Lattice is used by default.
suppress_stdout : str (default: False)
suppress_stdout : str (default: False)
Suppress logs and warnings.
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
kwargs : dict
See camelot.read_pdf kwargs.
See camelot.read_pdf kwargs.
Returns
Returns
-------
-------
tables : camelot.core.TableList
tables : camelot.core.TableList
List of tables found in PDF.
List of tables found in PDF.
"""
"""
tables = []
tables = []
with TemporaryDirectory() as tempdir:
with TemporaryDirectory() as tempdir:
Copy
Copied
Copy
Copied
for p in self.pages:
try:
self._save_page(self.filepath, p, tempdir)
for p in self.pages:
pages = [
self._save_page(self.filepath, p, tempdir)
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
pages = [
]
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
parser = Lattice(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
]
for p in pages:
parser = Lattice(
t = parser.extract_tables(
**kwargs) if flavor == "lattice" else Stream(**kwargs)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
for p in pages:
)
t = parser.extract_tables(
tables.extend(t)
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
except ValueError as err:
if str(err) == "document closed or encrypted":
raise ValueError("file has not been decrypted") from err
raise
return TableList(sorted(tables))
return TableList(sorted(tables))
Saved diffs
Original text
Open file
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end in ".pdf".

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default page spec needs no file access.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # The context manager guarantees the handle is released even
            # when a malformed page spec makes int() raise below.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.getNumPages()}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.getNumPages()
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If
        ``get_rotation`` reports a rotation, the file is moved to
        ``<temp>/p-<page>_rotated.pdf`` and a rotated copy is written
        back to the original ``page-<page>.pdf`` path.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            fpath = os.path.join(temp, f"page-{page}.pdf")
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
            layout, dim = get_page_layout(fpath)
            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            # An empty string means no rotation fix is applied.
            if rotation != "":
                # Build the name from its parts instead of doing a string
                # replace on the full path, which would also rewrite any
                # "page" substring inside the temp directory name.
                fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
                os.rename(fpath, fpath_new)
                # The reader pulls page data lazily from the stream, so the
                # rotated copy must be written while the stream is open.
                with open(fpath_new, "rb") as instream:
                    rotated_src = PdfFileReader(instream, strict=False)
                    if rotated_src.isEncrypted:
                        rotated_src.decrypt(self.password)
                    rotated_out = PdfFileWriter()
                    rotated_page = rotated_src.getPage(0)
                    if rotation == "anticlockwise":
                        rotated_page.rotateClockwise(90)
                    elif rotation == "clockwise":
                        rotated_page.rotateCounterClockwise(90)
                    rotated_out.addPage(rotated_page)
                    with open(fpath, "wb") as f:
                        rotated_out.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # Use a fresh dict per call rather than a shared mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
Changed text
Open file
# -*- coding: utf-8 -*-

import os
import sys

import fitz

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)


class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) file path does not end in ".pdf".

    """

    # NOTE(review): the camelCase PyMuPDF names used in this class
    # (needsPass, pageCount, newPage, showPDFpage, setRotation) appear to
    # have snake_case replacements in later PyMuPDF releases -- confirm
    # against the PyMuPDF version this project pins.

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default page spec needs no file access.
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append(
                        {"start": 1, "end": infile.pageCount}
                    )
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append(
                                {"start": int(a), "end": int(b)}
                            )
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)}
                            )
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If
        ``get_rotation`` reports a rotation, the file is moved to
        ``<temp>/p-<page>_rotated.pdf`` and a rotated copy is written
        back to the original ``page-<page>.pdf`` path.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        with fitz.open(filepath) as infile:
            if infile.needsPass:
                infile.authenticate(self.password)
            fpath = os.path.join(temp, f"page-{page}.pdf")
            p = infile[page - 1]
            # Reset the rotation before reading the page rectangle below.
            p.setRotation(0)
            # Close the single-page output document once it is saved
            # instead of leaving it to garbage collection.
            with fitz.open() as outfile:
                outpage = outfile.newPage(
                    -1, width=p.rect.width, height=p.rect.height
                )
                outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
                outfile.save(fpath)
            layout, dim = get_page_layout(fpath)
            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            # An empty string means no rotation fix is applied.
            if rotation != "":
                # Build the name from its parts instead of doing a string
                # replace on the full path, which would also rewrite any
                # "page" substring inside the temp directory name.
                fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
                os.rename(fpath, fpath_new)
                # Both the re-opened page and the new output document are
                # closed on exit, including when an exception is raised.
                with fitz.open(fpath_new) as rotated_src:
                    if rotated_src.needsPass:
                        rotated_src.authenticate(self.password)
                    with fitz.open() as rotated_out:
                        src_page = rotated_src[0]
                        outpage = rotated_out.newPage(
                            -1,
                            width=src_page.rect.width,
                            height=src_page.rect.height,
                        )
                        outpage.showPDFpage(outpage.rect, rotated_src, pno=0)
                        if rotation == "anticlockwise":
                            outpage.setRotation((src_page.rotation + 90) % 360)
                        elif rotation == "clockwise":
                            outpage.setRotation((src_page.rotation + 270) % 360)
                        rotated_out.save(fpath)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        ValueError
            With message "file has not been decrypted" when a ValueError
            whose message is "document closed or encrypted" is raised
            while saving or parsing pages; any other ValueError is
            re-raised unchanged.

        """
        # Use a fresh dict per call rather than a shared mutable default.
        if layout_kwargs is None:
            layout_kwargs = {}
        tables = []
        with TemporaryDirectory() as tempdir:
            try:
                for p in self.pages:
                    self._save_page(self.filepath, p, tempdir)
                pages = [
                    os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
                ]
                parser = Lattice(
                    **kwargs) if flavor == "lattice" else Stream(**kwargs)
                for p in pages:
                    t = parser.extract_tables(
                        p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                    )
                    tables.extend(t)
            except ValueError as err:
                # Translate this specific error message into a clearer one
                # for the caller, keeping the original as the cause.
                if str(err) == "document closed or encrypted":
                    raise ValueError("file has not been decrypted") from err
                raise
        return TableList(sorted(tables))
Find difference