# Source code for pycharmers.cli.pdfmine

#coding: utf-8
import io
import os
import sys
import fitz
import camelot
import argparse
from PIL import Image

from ..utils._colorings import toBLUE, toGREEN, toRED
from ..utils._path import _makedirs
from ..utils.generic_utils import formatted_enumerator
from ..utils.print_utils import pretty_3quote

# Values accepted by the ``-T/--target`` command-line option.
SUPPORTED_TARGETS = [
    "img",
    "image",
    "table",
]

def pdfmine(argv=sys.argv[1:]):
    """Analyze PDF and extract various elements.

    Args:
        argv (list) : Command-line arguments without the program name. Note that the
                      default is captured once, when this module is imported.

    The following command-line arguments are recognised:

        path (str)            : Path/to/input PDF file. (positional)
        -O/--output-dir (str) : Path/to/output directory. Defaults to the input path
                                with its extension removed.
        -T/--target (str)     : Target to extract (``"img"``, ``"image"`` or ``"table"``).
                                If omitted, nothing is extracted.
        --quiet (bool)        : Whether to make the output quiet.

    Note:
        When you run from the command line, execute as follows::

        $ pdfmine sample.pdf -T img
    """
    parser = argparse.ArgumentParser(prog="pdfmine", add_help=True)
    parser.add_argument("path", type=str, help="Path/to/input PDF file.")
    parser.add_argument("-O", "--output-dir", type=str, default=None, help="Path/to/output directory.")
    parser.add_argument("-T", "--target", type=str, choices=SUPPORTED_TARGETS, help="Target to extract.")
    parser.add_argument("--quiet", action="store_true", help="Whether to make the output quiet")
    args = parser.parse_args(argv)

    input_path = args.path
    # "sample.pdf" -> "sample/" when no output directory is given.
    output_dir = args.output_dir or os.path.splitext(input_path)[0]
    _makedirs(output_dir)
    target = args.target
    verbose = not args.quiet

    if verbose:
        print(*pretty_3quote(f"""
        [pdfmine]
        * Input PDF file is at {toBLUE(input_path)}
        * Extracted data will be saved at {toBLUE(output_dir)}
        * Extraction target is {toGREEN(target)}
        """))

    if target in ["img", "image"]:
        _pdfmine_extract_images(input_path, output_dir, verbose)
    elif target == "table":
        _pdfmine_extract_tables(input_path, output_dir, verbose)


def _pdfmine_resolve(obj, *names):
    """Return the first attribute of ``obj`` that exists among ``names``.

    Used to support both the current snake_case PyMuPDF method names and the
    legacy camelCase ones that this module was originally written against.
    """
    for name in names:
        attr = getattr(obj, name, None)
        if attr is not None:
            return attr
    raise AttributeError(f"{type(obj).__name__!r} object has none of the attributes: {', '.join(names)}")


def _pdfmine_extract_images(input_path, output_dir, verbose):
    """Save every image embedded in the PDF as ``p<page>_<index>.<ext>`` in ``output_dir``."""
    pdf_file = fitz.open(input_path)
    try:
        extract_image = _pdfmine_resolve(pdf_file, "extract_image", "extractImage")
        for page_idx, page in formatted_enumerator(pdf_file, start=1):
            img_list = _pdfmine_resolve(page, "get_images", "getImageList")()
            img_gen = formatted_enumerator(img_list, start=1)
            if verbose:
                if img_gen.total > 0:
                    print(f"[+] Found a total of {toGREEN(img_gen.total)} images in {page_idx}")
                else:
                    print(f"[!] No images found on page {page_idx}")
            for img_idx, img in img_gen:
                # The first item of each image entry is the xref of the image object.
                xref = img[0]
                base_image = extract_image(xref)
                fp = os.path.join(output_dir, f"p{page_idx}_{img_idx}.{base_image['ext']}")
                try:
                    # PIL opens/creates ``fp`` itself; no separate file handle is needed.
                    Image.open(io.BytesIO(base_image["image"])).save(fp)
                except Exception as e:
                    # Best effort: one broken image must not abort the remaining ones.
                    # Failures are reported even in quiet mode, on stderr.
                    print(f" - {fp}: {toRED(e)}", file=sys.stderr)
                else:
                    if verbose:
                        # Plain one-line status (no cursor-movement escapes) so the
                        # output stays readable when redirected to a file or a pipe.
                        print(f" - {fp}: {toGREEN('saved')}")
    finally:
        # Always release the document, even if extraction fails midway.
        pdf_file.close()


def _pdfmine_extract_tables(input_path, output_dir, verbose):
    """Save every table detected by camelot as ``<index>.csv`` in ``output_dir``."""
    # NOTE(review): ``camelot.read_pdf`` is called with its default page selection;
    # confirm whether all pages should be scanned (``pages="all"``).
    tables = camelot.read_pdf(input_path)
    table_gen = formatted_enumerator(tables, start=1)
    if verbose:
        print(f"Found a total of {table_gen.total} tables.")
    for table_idx, table in table_gen:
        # f-string formatting works whether the index is yielded as ``int`` or ``str``.
        table.to_csv(os.path.join(output_dir, f"{table_idx}.csv"))