# OCRmyPDF tutorial: convert scanned documents into searchable PDF/A files, with sidecar text extraction and batch processing
def _purge(*prefixes):
    """Remove cached modules from ``sys.modules`` by package prefix.

    A module is dropped when its name equals one of *prefixes* or starts
    with ``prefix + "."`` (i.e. it is a submodule of that package). Names
    that merely share leading characters (``PILLOW`` vs ``PIL``) are kept.

    Args:
        *prefixes: Top-level module/package names to evict.
    """
    # Snapshot the keys first: deleting while iterating the live dict
    # would raise "dictionary changed size during iteration".
    doomed = [
        name
        for name in list(sys.modules)
        if any(name == p or name.startswith(p + ".") for p in prefixes)
    ]
    for name in doomed:
        del sys.modules[name]
def _load_ocrmypdf():
    """Import ``ocrmypdf`` afresh and return the module.

    Any already-cached ``PIL`` / ``ocrmypdf`` modules are evicted from
    ``sys.modules`` first so the import sees the packages currently on disk.
    """
    _purge("PIL", "ocrmypdf")
    import ocrmypdf
    return ocrmypdf


try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as e:
    # Only attempt the repair when the ImportError text mentions "_Ink" or
    # "PIL" (presumably the symptom of an incompatible Pillow build - the
    # check is a plain substring match). Anything else is re-raised as-is.
    if "_Ink" in str(e) or "PIL" in str(e):
        print("Reparando un Pillow incompatible (reinstalando Pillow<12)...")
        sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
        try:
            ocrmypdf = _load_ocrmypdf()
            print("Pillow reparado - continúa sin reiniciar.")
        except Exception as err:
            # The in-process reload did not help; tell the user to restart
            # the runtime, keeping the underlying failure as the cause.
            raise RuntimeError(
                "Pillow aún es incompatible en esta sesión. Use el menú de Colab: "
                "Tiempo de ejecución > Reiniciar sesión, luego ejecute esta celda nuevamente."
            ) from err
    else:
        raise

from ocrmypdf.exceptions import (
    ExitCode,
    PriorOcrFoundError,
    EncryptedPdfError,
    MissingDependencyError,
    TaggedPDFError,
    DigitalSignatureError,
    DpiError,
    InputFileError,
    UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter

# Root logger at WARNING with a compact "LEVEL: message" format; pdfminer is
# restricted further to ERROR.
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.WARNING)

# One string per synthetic page; adjacent literals are concatenated, so each
# list element is a single paragraph.
SAMPLE_TEXT_PAGES = [
    "Optical Character Recognition, commonly abbreviated as OCR, is the "
    "process of converting images of typed or printed text into machine "
    "encoded text. This page was generated as a synthetic scan so that the "
    "OCRmyPDF pipeline has something realistic to recognize and search.",
    "On 14 March 2026 the archive contained 1,482 pages across 37 folders. "
    "Roughly 92 percent of those pages were scanned at 200 to 300 dots per "
    "inch. The remaining 8 percent were skewed and required deskewing before "
    "any reliable recognition was possible.",
    "After OCRmyPDF finishes, the output is a searchable PDF/A file. You can "
    "select text, copy it, and run full text search across thousands of "
    "documents. The original image resolution is preserved while a hidden "
    "text layer is placed accurately underneath the page image.",
]
def _find_font():
    """Return the path of the first candidate TrueType font that exists, else ``None``."""
    for cand in (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    ):
        if os.path.exists(cand):
            return cand
    return None


_FONT_PATH = _find_font()
# Fall back to Pillow's built-in default font when neither candidate is installed.
FONT = ImageFont.truetype(_FONT_PATH, 40) if _FONT_PATH else ImageFont.load_default()


def _add_speckle(img, n=6000, dark=60):
    """Sprinkle specks over *img* to mimic scanner noise (motivates --clean).

    Sets *n* randomly chosen pixels to a random value in ``[0, dark]``,
    modifying the image in place.

    Args:
        img: Image to modify; pixel values are written as single integers.
        n: Number of pixels to overwrite (positions may repeat).
        dark: Upper bound (inclusive) of the random speck value.

    Returns:
        The same image object that was passed in.
    """
    import random

    px = img.load()
    w, h = img.size
    for _ in range(n):
        px[random.randint(0, w - 1), random.randint(0, h - 1)] = random.randint(0, dark)
    return img


def render_page(text, skew=False):
    """Render an A4 page (1654×2339 px ≈ 200 DPI) of dark text on white.

    Args:
        text: Paragraph to draw; it is wrapped to 58 characters per line.
        skew: When true, the page is rotated by 6 degrees.

    Returns:
        A grayscale (mode ``"L"``) image, blurred slightly and speckled.
    """
    W, H = 1654, 2339
    img = Image.new("L", (W, H), 255)
    draw = ImageDraw.Draw(img)
    draw.multiline_text(
        (150, 180),
        textwrap.fill(text, width=58),
        fill=25,
        font=FONT,
        spacing=18,
    )
    if skew:
        # expand=False keeps the canvas size; uncovered corners are filled white.
        img = img.rotate(6, resample=Image.BICUBIC, expand=False, fillcolor=255)
    img = img.filter(ImageFilter.GaussianBlur(0.6))
    img = _add_speckle(img)
    return img


def build_scanned_pdf(pdf_path: Path, pages_text, skew_index=1):
    """Render pages to PNG and wrap them losslessly into an image-only PDF.

    Args:
        pdf_path: Destination PDF; temporary PNGs are written next to it.
        pages_text: Iterable of page texts, one rendered page each.
        skew_index: Index of the page that is rendered skewed.

    Returns:
        *pdf_path*, after the PDF has been written and the PNGs removed.
    """
    pngs = []
    for i, text in enumerate(pages_text):
        img = render_page(text, skew=(i == skew_index))
        p = pdf_path.parent / f"_pg_{pdf_path.stem}_{i}.png"
        img.save(p, format="PNG", dpi=(200, 200))
        pngs.append(str(p))
    with open(pdf_path, "wb") as f:
        f.write(img2pdf.convert(pngs))
    # The PNGs were only an intermediate step; remove them once embedded.
    for p in pngs:
        os.remove(p)
    return pdf_path


def do_ocr(input_file, output_file, **kw):
    """Wrapper around ocrmypdf.ocr() that turns off the progress bar and times it.

    Args:
        input_file: Passed through as the first argument of ``ocrmypdf.ocr``.
        output_file: Passed through as the second argument of ``ocrmypdf.ocr``.
        **kw: Extra keyword arguments for ``ocrmypdf.ocr``; ``progress_bar``
            defaults to ``False`` unless the caller supplies it.

    Returns:
        A ``(rc, seconds)`` tuple: the value returned by ``ocrmypdf.ocr`` and
        the elapsed wall-clock time measured with ``time.perf_counter``.
    """
    kw.setdefault("progress_bar", False)
    t0 = time.perf_counter()
    rc = ocrmypdf.ocr(input_file, output_file, **kw)
    return rc, time.perf_counter() - t0


def tokens(s: str):
    """Return the lowercase runs of ``a-z`` / ``0-9`` characters found in *s*."""
    return re.findall(r"[a-z0-9]+", s.lower())


def kb(ruta) -> str:
    """Return the size of the file at *ruta* formatted as ``"1,234.5 KB"``."""
    return f"{Path(ruta).stat().st_size / 1024:,.1f} KB"


def banner(título: str):
    """Print *título* between two 74-character horizontal rules."""
    line = "─" * 74
    print(f"\n{line}\n {título}\n{line}")