Source code for pycharmers.cli.revise_text

#coding: utf-8
import os
import re
import sys
import argparse
import docx
import pandas as pd

from ..utils._colorings import toRED, toGREEN
from ..utils.generic_utils import now_str

def revise_text(argv=sys.argv[1:]):
    """Revise word file.

    Replace every NG word found in the paragraphs of a Word document with the
    corresponding OK word taken from an Excel word list, print one log line per
    replacement, and save the result under a new file name (the value of
    ``now_str()`` is inserted between the original file name and its extension).

    Args:
        -W/--word (str)    : Path to the word file.
        -E/--excel (str)   : Path to the excel file.
        --sheet-name (str) : SheetName for the Excel. Defaults to the first sheet.
        --NG (str)         : The column name which indicates NG words. Defaults to ``"NG"``
        --OK (str)         : The column name which indicates OK words. Defaults to ``"OK"``

    Note:
        When you run from the command line, execute as follows::

            $ revise-text --word sample.docx --excel wordlist.xlsx
    """
    parser = argparse.ArgumentParser(prog="revise-text", add_help=True)
    parser.add_argument("-W", "--word", type=str, required=True, help="Path to the word file.")
    parser.add_argument("-E", "--excel", type=str, required=True, help="Path to the excel file.")
    parser.add_argument("--sheet-name", type=str, help="SheetName for the Excel.")
    parser.add_argument("--NG", type=str, default="NG", help="The column name which indicates NG words. Defaults to ``NG``")
    parser.add_argument("--OK", type=str, default="OK", help="The column name which indicates OK words. Defaults to ``OK``")
    args = parser.parse_args(argv)

    # ``sheet_name=None`` would make pandas return a dict of every sheet, so
    # fall back to the first sheet when the option is omitted.
    sheet_name = 0 if args.sheet_name is None else args.sheet_name
    df_wordlist = pd.read_excel(args.excel, sheet_name=sheet_name).fillna("")

    # Cells may be read as numbers; ``re`` needs strings on both sides.
    # Blank NG cells are dropped: an empty pattern matches at every position.
    NG2OK = {
        str(ng): str(ok)
        for ng, ok in df_wordlist[[args.NG, args.OK]].values
        if str(ng) != ""
    }

    def repl_create(para_no: int, para_digit: int, text_digit: int) -> callable:
        """Replace word while printing the logs.

        Args:
            para_no (int)    : The paragraph number.
            para_digit (int) : The digit for the number of paragraphs.
            text_digit (int) : The maximum digit for the number of texts in a paragraph.

        Returns:
            callable: Replacement function for ``re.sub``
        """
        def repl(match):
            ng_word = match.group()
            ok_word = NG2OK[ng_word]
            print(f"\t[Para.{para_no:>0{para_digit}}] ({match.start():>0{text_digit}}-{match.end():>0{text_digit}}) Revised {toRED(ng_word)} -> {toGREEN(ok_word)}")
            return ok_word
        return repl

    doc = docx.Document(args.word)
    paragraphs = doc.paragraphs

    # Zero-padding widths used to align the log output.
    para_digit = len(str(len(paragraphs)))
    text_digit = len(str(max((len(para.text) for para in paragraphs), default=0)))

    # NG words are literal text, so escape them: the matched string must be
    # exactly a key of ``NG2OK`` for the lookup inside ``repl`` to succeed.
    patterns = [re.compile(re.escape(ng)) for ng in NG2OK]

    for i, para in enumerate(paragraphs):
        original_text = para.text
        revised_text = original_text
        repl = repl_create(para_no=i, para_digit=para_digit, text_digit=text_digit)
        for pattern in patterns:
            revised_text = pattern.sub(repl, revised_text)
        # Assigning ``para.text`` rebuilds the paragraph content, so only do
        # it for paragraphs that were actually modified.
        if revised_text != original_text:
            para.text = revised_text

    # "sample.docx" -> "sample<now_str()>.docx"
    root, ext = os.path.splitext(args.word)
    doc.save(f"{root}{now_str()}{ext}")