Source code for pdfhandler.pdf_handler

"""High-level utilities for inspecting and modifying PDF files.

This module exposes :class:`PdfHandler`, a convenience wrapper around ``pikepdf``
and ``pdfminer`` for:

* text extraction and word counting
* encryption, decryption, and permission inspection
* moving, deleting, and resizing PDFs
* merging PDFs with optional separator pages
"""

import logging
import re
import shutil
from importlib.resources import as_file
from pathlib import Path
from typing import Literal, cast

import pikepdf
from colorama import Fore, init
from pdfminer.high_level import extract_text

PageNumberType = int | str | list[int | str] | None
PathType = Path | str
PathNoneType = PathType | None



[docs]
class PdfHandler:
    """Helper for common operations on a single PDF file.

    The handler validates the input path on construction and then provides
    methods for:

    * extracting text and counting words
    * checking and changing encryption / permissions
    * moving, deleting, and resizing the file
    * merging PDFs and inserting separator pages
    """

    # Handles mixed strings like "1, 3, 5-9 and 12"
    _page_number_regex = re.compile(r"(?:[\s,]*|(?:\sand\s))(\d+)(?:-(\d+))?")

    def __init__(self, pdf_path: PathType):
        """Create a PdfHandler for the given path.

        Parameters
        ----------
        pdf_path : str | Path
            Path to an existing ``.pdf`` file.

        Raises
        ------
        ValueError
            If the path does not end with ``.pdf`` (case-insensitive).
        FileExistsError
            If no file exists at the resolved path.
        """
        self.pdf_path = Path(str(pdf_path)).resolve()
        if self.pdf_path.suffix.lower() != ".pdf":
            raise ValueError(f"PDF path suffix must be '.pdf', not {self.pdf_path.suffix}")
        if not self.pdf_path.exists():
            raise FileExistsError(f"No file exists at {self.pdf_path}")

    @classmethod
    def _get_page_numbers_from_str(cls, page_numbers: str) -> list[int]:
        """Parse a 1-indexed page-string into a sorted list of page numbers.

        Accepts mixed formatting with commas, whitespace, ``"and"``, and ranges
        using hyphens. For example, the input string ``"1, 3, 5-7 and 10"``
        will return ``[1, 3, 5, 6, 7, 10]``.

        This method does not adjust for zero-based indexing; callers must
        subtract one if 0-indexed page indices are required.

        Parameters
        ----------
        page_numbers : str
            A string representing page numbers to include. Acceptable formats
            include:

            * single numbers (e.g., ``"3"``)
            * comma-separated numbers (e.g., ``"1, 2, 3"``)
            * ranges using hyphens (e.g., ``"4-6"``)
            * ``"and"`` as a delimiter (e.g., ``"2 and 5"``)
            * mixed input (e.g., ``"1, 3-4 and 6"``)

        Returns
        -------
        list[int]
            A sorted list of unique 1-indexed page numbers. Returns an empty
            list if no valid numbers are found.
        """
        page_number_matches = re.findall(cls._page_number_regex, page_numbers)

        page_numbers_set: set[int] = set()
        for match in page_number_matches:
            if match[-1] == "":
                page_numbers_set.add(int(match[0]))
            else:
                for page_number in range(int(match[0]), int(match[-1]) + 1):
                    page_numbers_set.add(page_number)

        page_numbers_list = sorted(page_numbers_set)
        return page_numbers_list

    @classmethod
    def _parse_page_numbers(cls, pages: PageNumberType) -> list[int]:
        """Normalize user-provided page numbers to 0-indexed page indices.

        Page numbers in user input are assumed to be 1-indexed and are converted
        to 0-indexed integers.

        Acceptable input formats
        ------------------------
        * ``None`` is not supported here (the caller should handle this)
        * a single ``int`` or ``str`` (e.g., ``3`` or ``"3"``)
        * a range string using a hyphen (e.g., ``"5-7"``)
        * a comma/space/``"and"``-delimited string
          (e.g., ``"1, 3 and 5-6"``)
        * a list of any combination of ``int`` and ``str``
          (e.g., ``[1, "3-4", "6 and 8"]``)

        Parameters
        ----------
        pages : PageNumberType
            The page numbers to extract, excluding ``None``. See acceptable
            formats above.

        Returns
        -------
        list[int]
            A sorted list of 0-indexed page indices.
        """
        if isinstance(pages, str):
            pages_list = cls._get_page_numbers_from_str(pages)
        elif isinstance(pages, int):
            pages_list = [pages]
        elif isinstance(pages, list):
            new_pages: set[int] = set()
            for page in pages:
                if isinstance(page, str):
                    for p in cls._get_page_numbers_from_str(page):
                        new_pages.add(p)
                else:
                    new_pages.add(int(page))
            pages_list = sorted(new_pages)
        else:
            msg = (
                f"pages must be an int, str, list[int | str], or similar value, not {type(pages)!r}"
            )
            raise TypeError(msg)

        return [int(page) - 1 for page in pages_list]


[docs]
    def get_pdf_text(self, pages: PageNumberType = None) -> str:
        """Extract text from the PDF, optionally from specific pages.

        Parameters
        ----------
        pages : PageNumberType, optional
            Pages to extract text from. If ``None`` (default), all pages are
            included. Acceptable formats include:

            * a single int or str (e.g., ``5`` or ``"5"``)
            * a range as a str (e.g., ``"2-4"``)
            * a comma/space/``"and"``-delimited str
              (e.g., ``"1, 3 and 5-6"``)
            * a list of ints and/or strs (e.g., ``[1, "3", "5-7"]``)

        Returns
        -------
        str
            The extracted text as a single string. Returns an empty string if
            no text is found.
        """
        if pages is None:
            with pikepdf.open(self.pdf_path) as pdf:
                page_indices = list(range(len(pdf.pages)))
        else:
            page_indices = cast(list[int], self._parse_page_numbers(pages))

        pdf_text = extract_text(self.pdf_path, page_numbers=page_indices).strip()
        return pdf_text



[docs]
    def word_count(self, pages: PageNumberType = None) -> int:
        """Count the number of words in the PDF.

        Parameters
        ----------
        pages : PageNumberType, optional
            Pages to include in the word count. If ``None`` (default), all
            pages are included. See :meth:`get_pdf_text` for accepted formats.

        Returns
        -------
        int
            The total number of words found on the specified pages.
        """
        text = self.get_pdf_text(pages)
        words = re.findall(r"\b\w+\b", text)
        return len(words)



[docs]
    def pdf_is_encrypted(self) -> bool:
        """Return whether the PDF is encrypted.

        Returns
        -------
        bool
            ``True`` if the PDF is encrypted, ``False`` otherwise.
        """
        with pikepdf.open(self.pdf_path) as pike_doc:
            return pike_doc.is_encrypted


    def _get_output_path(
        self,
        in_place: bool,
        output: PathNoneType,
        suffix: str,
    ) -> Path:
        """Resolve the output path for saving a modified PDF.

        Parameters
        ----------
        in_place : bool
            If ``True``, returns the original PDF path (the file is overwritten
            in place).
        output : str | Path | None
            The desired output path. Ignored if ``in_place`` is ``True``. If
            ``None``, a default path is generated using ``suffix``.
        suffix : str
            Suffix to append to the original filename when ``output`` is
            ``None``.

        Returns
        -------
        Path
            The resolved path for saving the output PDF.

        Raises
        ------
        ValueError
            If ``output`` is provided and does not have a ``.pdf`` extension.
        """
        if in_place:
            output_path = self.pdf_path
        elif output is None:
            output_path = self.pdf_path.parent / self.pdf_path.stem / f"{suffix}.pdf"
        else:
            output_path = Path(str(output))
            if output_path.suffix.lower() != ".pdf":
                msg = (
                    "output should either be None or be a path-like object "
                    "with a '.pdf' suffix. "
                    f"Got {output!r}."
                )
                raise ValueError(msg)

        return output_path


[docs]
    def save_pike_pdf(
        self,
        output: PathNoneType,
        in_place: bool = False,
        crypt_type: str | None = None,
        password: str | None = None,
        owner_password: str | None = None,
        extract: bool = True,
        modify_annotation: bool = True,
        modify_assembly: bool = True,
        modify_form: bool = True,
        modify_other: bool = True,
        print_lowres: bool = True,
        print_highres: bool = True,
    ) -> None:
        """Save the PDF with optional encryption or decryption applied.

        Parameters
        ----------
        output : str | Path | None
            Destination for the saved file. Ignored if ``in_place`` is ``True``.
            If ``None``, a new file is saved with a suffix such as
            ``"-Encrypted"`` or ``"-Decrypted"`` depending on usage.
        in_place : bool, default False
            If ``True``, overwrites the original file. If ``False``, creates a
            new file.
        crypt_type : str | None, default None
            A preset encryption mode. Must be one of:

            * ``"decrypt"`` : disables encryption entirely
            * ``"encrypt"`` : enables encryption with all permissions set
              to ``False``
            * ``"no_copy"`` : like ``"decrypt"`` but with extract permission
              set to ``False``
            * ``None`` : uses the individual permission arguments below
        password : str | None, default None
            User password for opening the encrypted PDF. If ``None`` or an
            empty string, no password is required to open.
        owner_password : str | None, default None
            Owner password used to set permissions. A default value is used if
            this is ``None``.
        extract : bool, default True
            Whether users can extract text or images.
        modify_annotation : bool, default True
            Whether users can modify annotations.
        modify_assembly : bool, default True
            Whether users can rearrange pages or merge documents.
        modify_form : bool, default True
            Whether users can fill in or edit form fields.
        modify_other : bool, default True
            Whether users can make general modifications.
        print_lowres : bool, default True
            Whether users can print in low resolution.
        print_highres : bool, default True
            Whether users can print in high resolution.

        Raises
        ------
        ValueError
            If ``crypt_type`` is invalid or if the resolved output path is
            invalid.
        """
        if password is None:
            password = ""  # nosec B105
        if owner_password is None:
            owner_password = "1234abcd"  # nosec B105

        output = self._get_output_path(in_place=in_place, output=output, suffix="")

        if crypt_type is not None:
            crypt_type = crypt_type.lower().strip()
        match crypt_type:
            case "decrypt":
                pike_encryption: pikepdf.Encryption | None = None
            case "encrypt":
                extract = False
                modify_annotation = False
                modify_assembly = False
                modify_form = False
                modify_other = False
                print_lowres = False
                print_highres = False
                pike_encryption = None
            case "no_copy":
                extract = False
                pike_encryption = None
            case None:
                pike_encryption = None
            case _:
                msg = (
                    "crypt_type must be one of ['encrypt', 'decrypt', "
                    f"'no_copy', None], not {crypt_type!r}"
                )
                raise ValueError(msg)

        if crypt_type != "decrypt":
            pike_encryption = pikepdf.Encryption(
                user=password,
                owner=owner_password,
                allow=pikepdf.Permissions(
                    extract=extract,
                    modify_annotation=modify_annotation,
                    modify_assembly=modify_assembly,
                    modify_form=modify_form,
                    modify_other=modify_other,
                    print_lowres=print_lowres,
                    print_highres=print_highres,
                ),
            )

        with pikepdf.open(self.pdf_path) as pike_doc:
            pike_doc.save(output, encryption=pike_encryption)



[docs]
    def get_pdf_permissions(self) -> dict[str, bool]:
        """Return the current permission settings of the PDF.

        Returns
        -------
        dict[str, bool]
            A dictionary mapping permission names to boolean values. Keys
            include:

            * ``"extract"``
            * ``"modify_annotation"``
            * ``"modify_assembly"``
            * ``"modify_form"``
            * ``"modify_other"``
            * ``"print_lowres"``
            * ``"print_highres"``
        """
        with pikepdf.open(self.pdf_path) as pike_doc:
            permissions_dict = {
                "extract": pike_doc.allow.extract,
                "modify_annotation": pike_doc.allow.modify_annotation,
                "modify_assembly": pike_doc.allow.modify_assembly,
                "modify_form": pike_doc.allow.modify_form,
                "modify_other": pike_doc.allow.modify_other,
                "print_lowres": pike_doc.allow.print_lowres,
                "print_highres": pike_doc.allow.print_highres,
            }

        return permissions_dict



[docs]
    def print_permissions(self) -> None:
        """Print encryption and permission status to the console.

        Output is color-coded using ``colorama``:

        * green for enabled permissions
        * red for disabled permissions
        """
        init()

        print(f"Permissions for {self.pdf_path}")

        is_encrypted = self.pdf_is_encrypted()
        print(
            "\tIs encrypted: "
            f"{Fore.LIGHTRED_EX if is_encrypted else Fore.LIGHTGREEN_EX}"
            f"{is_encrypted}{Fore.RESET}"
        )

        permissions_dict = self.get_pdf_permissions()
        for key, val in permissions_dict.items():
            print(f"\t\t{key}: {Fore.LIGHTGREEN_EX if val else Fore.LIGHTRED_EX}{val}{Fore.RESET}")



[docs]
    def encrypt(
        self,
        output: PathNoneType = None,
        in_place: bool = False,
        password: str | None = None,
        owner_password: str | None = None,
    ) -> None:
        """Encrypt the PDF if it is not already encrypted.

        This creates an encrypted version of the PDF using restrictive
        permissions by default. If ``in_place`` is ``False``, the encrypted
        file is saved to a new path; otherwise, the original file is
        overwritten.

        For fine-grained control over permissions, use :meth:`save_pike_pdf`
        directly.

        Parameters
        ----------
        output : str | Path | None, default None
            Destination path for the encrypted PDF. Ignored if
            ``in_place=True``. If ``None``, a new file is created with
            ``"-Encrypted"`` appended to the original name.
        in_place : bool, default False
            Whether to overwrite the original file in place.
        password : str | None, default None
            The user password required to open the PDF. If ``None`` or empty,
            no password is required to view.
        owner_password : str | None, default None
            The owner password used to set encryption and permissions.
        """
        if not self.pdf_is_encrypted():
            logging.info("Encrypting PDF: %s", self.pdf_path)

            output_path = self._get_output_path(in_place, output, "-Encrypted")

            self.save_pike_pdf(
                output_path,
                crypt_type="encrypt",
                password=password,
                owner_password=owner_password,
            )
        else:
            logging.info("PDF already encrypted: %s", self.pdf_path)



[docs]
    def decrypt(
        self,
        output: PathNoneType = None,
        in_place: bool = False,
        owner_password: str | None = None,
    ) -> None:
        """Decrypt the PDF if it is currently encrypted.

        If ``in_place`` is ``False`` (recommended), a decrypted copy is saved
        to a new file; otherwise, the original file is overwritten. If the PDF
        is not encrypted, no changes are made.

        Parameters
        ----------
        output : str | Path | None, default None
            Destination path for the decrypted PDF. Ignored if
            ``in_place=True``. If ``None``, a new file is created with
            ``"-Decrypted"`` appended to the original name.
        in_place : bool, default False
            Whether to overwrite the original file in place.
        owner_password : str | None, default None
            The owner password used to unlock and decrypt the PDF.
        """
        if owner_password is None:
            owner_password = "1234abcd"  # nosec B105

        if self.pdf_is_encrypted():
            logging.info("Decrypting PDF: %s", self.pdf_path)

            output_path = self._get_output_path(in_place, output, "-Decrypted")

            self.save_pike_pdf(output_path, crypt_type="decrypt")
        else:
            logging.info("PDF not encrypted, no changes made: %s", self.pdf_path)



[docs]
    def rm(self) -> None:
        """Delete the PDF file from disk."""
        self.pdf_path.unlink()



[docs]
    def mv(self, dst: PathType) -> None:
        """Move the PDF to a new location and update the internal path.

        Parameters
        ----------
        dst : str | Path
            Destination path, including the filename and ``.pdf`` extension.
        """
        dst_path = Path(str(dst))
        self.pdf_path.replace(dst_path)
        self.pdf_path = dst_path



[docs]
    def cp(self, new_path: PathNoneType = None) -> Path:
        """Copy the PDF to a specified location and return its Path.

        Parameters
        ----------
        new_path : str | Path | None, optional
            Path to the new copy. If None it will be saved to the original PDF's
            path with '-copy' embedded between the stem and suffix. (Default: None).
        """
        if new_path is None:
            new_path = self.pdf_path.parent / f"{self.pdf_path.stem}-copy.pdf"
        new_path = Path(str(new_path)).resolve()
        shutil.copy(self.pdf_path, new_path)
        return new_path



[docs]
    @classmethod
    def merge_pdfs(
        cls,
        pdf0_path: PathType,
        pdf1_path: PathType,
        output_path: PathType,
        add_separator: bool = False,
        separator_type: Literal["black", "blank"] = "black",
    ) -> None:
        """Merge two PDF files, placing the first file on top.

        Parameters
        ----------
        pdf0_path : str | Path
            Path to the first PDF, which will appear first in the output.
        pdf1_path : str | Path
            Path to the second PDF, which will appear after the first.
        output_path : str | Path
            Path to save the merged output PDF.
        add_separator : bool, default False
            If ``True``, insert a separator page between the PDFs.
        separator_type : {"black", "blank"}, default "black"
            Type of separator page to insert:

            * ``"black"`` : a black bar (~1 in height)
            * ``"blank"`` : a full blank page

        Raises
        ------
        ValueError
            If ``separator_type`` is not ``"black"`` or ``"blank"``.
        """
        with pikepdf.open(pdf0_path, allow_overwriting_input=True) as pdf0:
            if add_separator:
                match separator_type.lower():
                    case "black":
                        resource_file = "black_separator-636x72.pdf"
                    case "blank":
                        resource_file = "blank_page.pdf"
                    case _:
                        msg = (
                            "separator_type must be either 'black' or 'blank', "
                            f"not {separator_type!r}"
                        )
                        raise ValueError(msg)

                resource_path = Path(__file__).resolve().parent / "resources" / resource_file
                with (
                    as_file(resource_path) as sep_path,
                    pikepdf.open(sep_path) as sep_pdf,
                ):
                    pdf0.pages.extend(sep_pdf.pages)

            with pikepdf.open(pdf1_path) as pdf1:
                pdf0.pages.extend(pdf1.pages)

            pdf0.save(output_path)



[docs]
    def resize(
        self,
        width: int,
        height: int,
        output_path: PathNoneType = None,
    ) -> None:
        """Resize all pages in the PDF to the specified dimensions.

        Parameters
        ----------
        width : int
            Desired page width in points (1 inch = 72 points).
        height : int
            Desired page height in points (1 inch = 72 points).
        output_path : str | Path | None, default None
            Path to save the resized PDF. If ``None``, a new file is created in
            the same directory with the name pattern
            ``{original_name}-{width}x{height}.pdf``.

        Raises
        ------
        ValueError
            If ``output_path`` is provided and does not end with ``.pdf``.
        """
        if output_path is None:
            output_path = self.pdf_path.parent / f"{self.pdf_path.stem}-{width}x{height}.pdf"
        elif not str(output_path).lower().endswith(".pdf"):
            msg = f"output_path should end in .pdf, not {output_path!r}"
            raise ValueError(msg)

        pdf_dims_array = pikepdf.Array([0, 0, width, height])
        with pikepdf.open(self.pdf_path) as pdf:
            for page in pdf.pages:
                page.mediabox = pdf_dims_array
                page.cropbox = pdf_dims_array

            pdf.save(output_path)



[docs]
    @classmethod
    def pdfs_are_duplicates(cls, pdf0_path: PathType, pdf1_path: PathType) -> bool:
        """Return whether two PDFs have identical extracted text content.

        Text is extracted using :mod:`pdfminer`. Layout, formatting, and
        metadata differences are ignored.

        Parameters
        ----------
        pdf0_path : str | Path
            Path to the first PDF file.
        pdf1_path : str | Path
            Path to the second PDF file.

        Returns
        -------
        bool
            ``True`` if the extracted text from both PDFs is identical,
            ``False`` otherwise.
        """
        return extract_text(pdf0_path) == extract_text(pdf1_path)