Source code for pycharmers.cli.pdfmine

#coding: utf-8
import io
import os
import sys
import fitz
import camelot
import argparse
from PIL import Image

from ..utils._colorings import toBLUE, toGREEN, toRED
from ..utils._path import _makedirs
from ..utils.generic_utils import formatted_enumerator
from ..utils.print_utils import pretty_3quote

SUPPORTED_TARGETS = ["img", "image", "table"]

[docs]def pdfmine(argv=sys.argv[1:]):
    """Analyze PDF and extract various elements.

    Args:
        path (str)             : Path/to/input PDF file.
        -O/--output-path (str) : Path/to/output directory.
        -T/--target (str)      : Target to extract.
        --quiet (bool)         : Whether to make the output quiet.

    Note:
        When you run from the command line, execute as follows::
        
        $ pdfmine -I sample.pdf -T img
    """
    parser = argparse.ArgumentParser(prog="pdfmine", add_help=True)
    parser.add_argument("path", type=str, help="Path/to/input PDF file.")
    parser.add_argument("-O", "--output-dir",  type=str, default=None, help="Path/to/output directory.")
    parser.add_argument("-T", "--target",      type=str, choices=SUPPORTED_TARGETS, help="Target to extract.")
    parser.add_argument("--quiet",    action="store_true", help="Whether to make the output quiet")
    args = parser.parse_args(argv)

    input_path = args.path
    output_dir = args.output_dir or os.path.splitext(input_path)[0]
    _makedirs(output_dir)
    target = args.target
    verbose = not args.quiet

    if verbose: 
        print(*pretty_3quote(f"""
        [pdfmine]
        * Input PDF file is at {toBLUE(input_path)}
        * Extracted data will be saved at {toBLUE(output_dir)}
        * Extraction target is {toGREEN(target)}
        """))

    if target in ["img", "image"]:
        pdf_file = fitz.open(input_path)
        pdf_gen = formatted_enumerator(pdf_file, start=1)
        for page_idx, page in pdf_gen:
            img_list = page.getImageList()
            img_gen = formatted_enumerator(img_list, start=1)
            if verbose:
                if img_gen.total>0:
                    print(f"[+] Found a total of {toGREEN(img_gen.total)} images in {page_idx}")
                else:
                    print(f"[!] No images found on page {page_idx}")
            for img_idx, img in formatted_enumerator(img_list, start=1):
                print("    - ", end="")
                xref = img[0]
                base_image = pdf_file.extractImage(xref=xref)
                fp = os.path.join(output_dir, f"p{page_idx}_{img_idx}.{base_image['ext']}")
                try:
                    with open(fp, "wb"):
                        Image.open(io.BytesIO(base_image["image"])).save(fp)
                    msg = toGREEN("saved")
                except Exception as e:
                    msg = toRED(e)
                print(f"\033[1F\033[{28+len(str(xref))}G {msg}")
                
    elif target == "table":
        tables = camelot.read_pdf(input_path)
        table_gen = formatted_enumerator(tables, start=1)
        print(f"Found a total of {table_gen.total} tables.")
        for table_idx,table in table_gen:
            table.to_csv(os.path.join(output_dir, table_idx+".csv"))
Source code for pycharmers.cli.pdfmine

Other contents

Social link

Table of Contents