Compare text

Find the difference between two text files

Real-time editor

Hide unchanged lines

Disable line wrap

Layout

Diff precision

Syntax highlighting

Diffchecker Desktop: The most secure way to run Diffchecker. Get the Diffchecker Desktop app: your diffs never leave your computer! Get Desktop

camelot/handlers.py

Created 5 years ago. Diff never expires.

42 removals

Lines
Total
Removed

Characters
Total
Removed

To continue using this feature, upgrade to Diffchecker Pro View Pricing

169 lines

77 additions

Lines
Total
Added

Characters
Total
Added

To continue using this feature, upgrade to Diffchecker Pro View Pricing

181 lines

# -*- coding: utf-8 -*-

import os

import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

import fitz

from .core import TableList

from .parsers import Stream, Lattice

from .utils import (

TemporaryDirectory,

get_page_layout,

get_text_objects,

get_rotation,

is_url,

download_url,

)

class PDFHandler(object):

"""Handles all operations like temp directory creation, splitting

file into single page PDFs, parsing each PDF and then removing the

temp directory.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

pages : str, optional (default: '1')

Comma-separated page numbers.

Example: '1,3,4' or '1,4-end' or 'all'.

password : str, optional (default: None)

Password for decryption.

"""

def __init__(self, filepath, pages="1", password=None):

if is_url(filepath):

filepath = download_url(filepath)

self.filepath = filepath

if not filepath.lower().endswith(".pdf"):

raise NotImplementedError("File format not supported")

if password is None:

self.password = ""

else:

self.password = password

if sys.version_info[0] < 3:

self.password = self.password.encode("ascii")

self.pages = self._get_pages(self.filepath, pages)

def _get_pages(self, filepath, pages):

"""Converts pages string to list of ints.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

pages : str, optional (default: '1')

Comma-separated page numbers.

Example: '1,3,4' or '1,4-end' or 'all'.

Returns

-------

P : list

List of int page numbers.

"""

page_numbers = []

if pages == "1":

page_numbers.append({"start": 1, "end": 1})

else:

instream = open(filepath, "rb")

with fitz.open(filepath) as infile:

infile = PdfFileReader(instream, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

if pages == "all":

page_numbers.append({"start": 1, "end": infile.pageCount})

page_numbers.append({"start": 1, "end": infile.getNumPages()})

else:

for r in pages.split(","):

if "-" in r:

a, b = r.split("-")

if b == "end":

b = infile.pageCount

b = infile.getNumPages()

page_numbers.append(

page_numbers.append({"start": int(a), "end": int(b)})

{"start": int(a), "end": int(b)})

else:

page_numbers.append({"start": int(r), "end": int(r)})

page_numbers.append(

instream.close()

{"start": int(r), "end": int(r)})

P = []

for p in page_numbers:

P.extend(range(p["start"], p["end"] + 1))

return sorted(set(P))

def _save_page(self, filepath, page, temp):

"""Saves specified page from PDF into a temporary directory.

Parameters

----------

filepath : str

Filepath or URL of the PDF file.

page : int

Page number.

temp : str

Tmp directory.

"""

with open(filepath, "rb") as fileobj:

with fitz.open(filepath) as infile:

infile = PdfFileReader(fileobj, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

fpath = os.path.join(temp, f"page-{page}.pdf")

froot, fext = os.path.splitext(fpath)

p = infile.getPage(page - 1)

p = infile[page - 1]

outfile = PdfFileWriter()

p.setRotation(0)

outfile.addPage(p)

outfile = fitz.open()

with open(fpath, "wb") as f:

outpage = outfile.newPage(-1, width=p.rect.width,

outfile.write(f)

height=p.rect.height)

outpage.showPDFpage(outpage.rect, infile, pno=page-1)

outfile.save(fpath)

layout, dim = get_page_layout(fpath)

# fix rotated PDF

chars = get_text_objects(layout, ltype="char")

horizontal_text = get_text_objects(layout, ltype="horizontal_text")

vertical_text = get_text_objects(layout, ltype="vertical_text")

rotation = get_rotation(chars, horizontal_text, vertical_text)

if rotation != "":

fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])

fpath_new = "".join(

[froot.replace("page", "p"), "_rotated", fext])

os.rename(fpath, fpath_new)

instream = open(fpath_new, "rb")

infile = fitz.open(fpath_new)

infile = PdfFileReader(instream, strict=False)

if infile.needsPass:

if infile.isEncrypted:

infile.authenticate(self.password)

infile.decrypt(self.password)

outfile = fitz.open()

outfile = PdfFileWriter()

p = infile[0]

p = infile.getPage(0)

outpage = outfile.newPage(-1, width=p.rect.width,

height=p.rect.height)

outpage.showPDFpage(outpage.rect, infile, pno=0)

if rotation == "anticlockwise":

p.rotateClockwise(90)

outpage.setRotation((p.rotation + 90) % 360)

elif rotation == "clockwise":

p.rotateCounterClockwise(90)

outpage.setRotation((p.rotation + 270) % 360)

outfile.addPage(p)

with open(fpath, "wb") as f:

outfile.save(fpath)

outfile.write(f)

instream.close()

def parse(

self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs

"""Extracts tables by calling parser.get_tables on all single

page PDFs.

Parameters

----------

flavor : str (default: 'lattice')

The parsing method to use ('lattice' or 'stream').

Lattice is used by default.

suppress_stdout : str (default: False)

Suppress logs and warnings.

layout_kwargs : dict, optional (default: {})

A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.

kwargs : dict

See camelot.read_pdf kwargs.

Returns

-------

tables : camelot.core.TableList

List of tables found in PDF.

"""

tables = []

with TemporaryDirectory() as tempdir:

for p in self.pages:

try:

self._save_page(self.filepath, p, tempdir)

for p in self.pages:

pages = [

self._save_page(self.filepath, p, tempdir)

os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages

pages = [

]

os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages

parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)

]

for p in pages:

parser = Lattice(

t = parser.extract_tables(

**kwargs) if flavor == "lattice" else Stream(**kwargs)

p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs

for p in pages:

)

t = parser.extract_tables(

tables.extend(t)

p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs

)

tables.extend(t)

except ValueError as err:

if str(err) == "document closed or encrypted":

raise ValueError("file has not been decrypted") from err

raise

return TableList(sorted(tables))

Saved diffs

Original text

Open file

# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
    download_url,
)

class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        # A URL is downloaded first; the extension check below is applied
        # to whatever path download_url returns.
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        The PDF is opened only when the page count is actually needed,
        i.e. for 'all' or for a range ending in 'end'.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a token is not an int, or a range has more than one '-'.

        """
        if pages == "1":
            return [1]

        page_count = None
        if pages == "all" or "end" in pages:
            # The context manager closes the file even if PyPDF2 raises.
            with open(filepath, "rb") as instream:
                infile = PdfFileReader(instream, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                page_count = infile.getNumPages()

        if pages == "all":
            return list(range(1, page_count + 1))

        selected = set()
        for token in pages.split(","):
            if "-" in token:
                first, last = token.split("-")
                if last == "end":
                    last = page_count
                selected.update(range(int(first), int(last) + 1))
            else:
                selected.add(int(token))
        return sorted(selected)

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If
        get_rotation reports a rotation for the extracted page, the file
        is renamed to ``p-<page>_rotated.pdf`` and a counter-rotated copy
        is written back under the original name.

        Parameters
        ----------
        filepath : str
            Filepath of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)

        # The source file must stay open while the writer serialises the page.
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation == "":
            return

        fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
        os.rename(fpath, fpath_new)
        # `with` guarantees the renamed file is closed even when
        # reading or writing fails part-way.
        with open(fpath_new, "rb") as instream:
            infile = PdfFileReader(instream, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            p = infile.getPage(0)
            # Rotate in the opposite direction to the detected rotation.
            if rotation == "anticlockwise":
                p.rotateClockwise(90)
            elif rotation == "clockwise":
                p.rotateCounterClockwise(90)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None, treated as {})
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call avoids sharing one mutable default object.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))

Changed text

Open file

# -*- coding: utf-8 -*-

import os
import sys

import fitz

def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.
        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.
        Returns
        -------
        P : list
            List of int page numbers.
        """
        page_numbers = []
        if pages == "1":
            page_numbers.append({"start": 1, "end": 1})
        else:
            with fitz.open(filepath) as infile:
                if infile.needsPass:
                    infile.authenticate(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": infile.pageCount})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                b = infile.pageCount
                            page_numbers.append(
                                {"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append(
                                {"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

def _save_page(self, filepath, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is drawn onto a new single-page document saved as
    ``<temp>/page-<page>.pdf``. If get_rotation reports a rotation for
    that file, it is renamed to ``p-<page>_rotated.pdf`` and a copy with
    an adjusted page rotation is saved back under the original name.

    Parameters
    ----------
    filepath : str
        Filepath of the PDF file.
    page : int
        Page number (1-based).
    temp : str
        Tmp directory.

    """
    fpath = os.path.join(temp, f"page-{page}.pdf")
    froot, fext = os.path.splitext(fpath)

    with fitz.open(filepath) as infile:
        if infile.needsPass:
            infile.authenticate(self.password)
        p = infile[page - 1]
        p.setRotation(0)
        # Close the output document as soon as it has been saved.
        with fitz.open() as outfile:
            outpage = outfile.newPage(
                -1, width=p.rect.width, height=p.rect.height
            )
            outpage.showPDFpage(outpage.rect, infile, pno=page - 1)
            outfile.save(fpath)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == "":
        return

    fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
    os.rename(fpath, fpath_new)
    # Both documents are context-managed so neither handle outlives
    # this call, even if drawing or saving fails.
    with fitz.open(fpath_new) as rotated_src, fitz.open() as outfile:
        if rotated_src.needsPass:
            rotated_src.authenticate(self.password)
        p = rotated_src[0]
        outpage = outfile.newPage(
            -1, width=p.rect.width, height=p.rect.height
        )
        outpage.showPDFpage(outpage.rect, rotated_src, pno=0)
        if rotation == "anticlockwise":
            outpage.setRotation((p.rotation + 90) % 360)
        elif rotation == "clockwise":
            outpage.setRotation((p.rotation + 270) % 360)
        outfile.save(fpath)

def parse(
    self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
):
    """Extracts tables by calling parser.extract_tables on all single
    page PDFs.

    Parameters
    ----------
    flavor : str (default: 'lattice')
        The parsing method to use ('lattice' or 'stream').
        Lattice is used by default.
    suppress_stdout : bool (default: False)
        Suppress logs and warnings.
    layout_kwargs : dict, optional (default: None, treated as {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    kwargs : dict
        See camelot.read_pdf kwargs.

    Returns
    -------
    tables : camelot.core.TableList
        List of tables found in PDF.

    Raises
    ------
    ValueError
        With the message "file has not been decrypted" when a
        ValueError whose message is "document closed or encrypted" is
        raised during processing; any other ValueError is re-raised
        unchanged.

    """
    # A fresh dict per call avoids sharing one mutable default object.
    if layout_kwargs is None:
        layout_kwargs = {}

    tables = []
    with TemporaryDirectory() as tempdir:
        try:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        except ValueError as err:
            # Replace this specific low-level message with a clearer one.
            if str(err) == "document closed or encrypted":
                raise ValueError("file has not been decrypted") from err
            raise
    return TableList(sorted(tables))