# Source code for internetarchivepdf.recode

# archive-pdf-tools
# Copyright (C) 2020-2021, Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Author: Merlijn Boris Wolf Wajer <merlijn@archive.org>
import sys
import os
import subprocess
from os import remove
from time import time
from datetime import datetime
from tempfile import mkstemp
from os.path import join
import shutil
import json
from glob import glob
import re
import io


from PIL import Image
import numpy as np
import fitz

from hocr.parse import (hocr_page_iterator, hocr_page_to_word_data,
        hocr_page_get_dimensions, hocr_page_get_scan_res)
from internetarchivepdf.mrc import create_mrc_hocr_components, \
        encode_mrc_images, encode_mrc_mask
from internetarchivepdf.grayconvert import special_gray_convert
from internetarchivepdf.pdfhacks import fast_insert_image, write_pdfa, \
        write_page_labels, write_basic_ua, write_metadata, write_pdf_toc
from internetarchivepdf.pdfrenderer import TessPDFRenderer
from internetarchivepdf.scandata import scandata_xml_get_skip_pages, \
        scandata_xml_get_page_numbers, scandata_xml_get_dpi_per_page, \
        scandata_xml_get_document_dpi
from internetarchivepdf.jpeg2000 import decode_jpeg2000, get_jpeg2000_info
from internetarchivepdf.const import (IMAGE_MODE_PASSTHROUGH, IMAGE_MODE_PIXMAP,
        IMAGE_MODE_MRC, RECODE_RUNTIME_WARNING_INVALID_PAGE_SIZE,
        RECODE_RUNTIME_WARNING_INVALID_PAGE_NUMBERS,
        RECODE_RUNTIME_WARNING_INVALID_JP2_HEADERS, JPEG2000_IMPL_KAKADU,
        JPEG2000_IMPL_OPENJPEG, JPEG2000_IMPL_GROK, JPEG2000_IMPL_PILLOW,
        COMPRESSOR_JPEG2000, COMPRESSOR_JPEG)

PDFA_MIN_UNITS = 3
PDFA_MAX_UNITS = 14400

Image.MAX_IMAGE_PIXELS = 625000000


def guess_dpi(w, h, expected_format=(8.27, 11.69),
              round_to=(72, 96, 150, 300, 600)):
    """
    Guesstimate DPI for a given image.

    The image is assumed to depict a page of ``expected_format`` inches; the
    DPI implied by the width and by the height are each compared against
    every candidate, and the candidate with the smallest summed absolute
    difference wins.

    Args:

    * w (int): width of the image
    * h (int): height of the image
    * expected_format (tuple): (width_inch, height_inch) of expected document,
      defaults to european A4.
    * round_to (iterable of int): Acceptable DPI values. Defaults to
      (72, 96, 150, 300, 600)

    Returns an int which is the best matching DPI picked from round_to. When
    two candidates match equally well, the one listed first is returned.

    Raises IndexError if round_to is empty.
    """
    # Materialise once so that one-shot iterables can be checked and scanned.
    candidates = tuple(round_to)
    if not candidates:
        raise IndexError('round_to must contain at least one DPI value')

    w_dpi = w / expected_format[0]
    h_dpi = h / expected_format[1]

    # min() returns the first of several equally good candidates, which is
    # the same choice a stable sort on the difference would put up front.
    return min(candidates,
               key=lambda dpi: abs(w_dpi - dpi) + abs(h_dpi - dpi))
def _pdfa_width_out_of_range(page_width):
    """Return True if page_width is not strictly between the PDF/A limits."""
    return page_width <= PDFA_MIN_UNITS or page_width >= PDFA_MAX_UNITS


def _read_binary_file(path):
    """Return the complete contents of ``path`` as bytes, closing the file."""
    with open(path, 'rb') as fp:
        return fp.read()


def _take_mask_contents(mask_jb2, mask_png, jbig2):
    """Read the mask file selected by ``jbig2`` and delete that file."""
    mask_path = mask_jb2 if jbig2 else mask_png
    contents = _read_binary_file(mask_path)
    remove(mask_path)
    return contents


def _parse_hq_pages(hq_pages, page_count):
    """Turn a comma separated page list into a per-page list of booleans.

    Positive numbers are 1-indexed, negative numbers count from the end of
    the document. Numbers outside the document are ignored.
    """
    flags = [False] * page_count
    if hq_pages is None:
        return flags

    for i in map(int, hq_pages.split(',')):
        # We want 0-indexed, not 1-indexed, but not negative numbers we want
        # to remain 1-indexed.
        if i > 0:
            i = i - 1

        if not -page_count <= i < page_count:
            # Page out of range, silently ignore for automation purposes.
            # We don't want scripts that call our tool to worry about how
            # many pages a PDF has exactly. E.g. if 1,2,3,4,-4,-3,-2,-1 is
            # passed, and a PDF has only three pages, let's just set them all
            # to HQ and not complain about 4 and -4 being out of range.
            continue

        # Mark page as HQ
        flags[i] = True

    return flags


def create_tess_textonly_pdf(hocr_file, save_path, in_pdf=None,
                             image_files=None, dpi=None, skip_pages=None,
                             dpi_pages=None, reporter=None, verbose=False,
                             debug=False, stop_after=None,
                             render_text_lines=False, tmp_dir=None,
                             jpeg2000_implementation=None, errors=None):
    """
    Render the text layer described by a hOCR file into a PDF at save_path.

    Page sizes are taken from ``in_pdf`` when it is given, otherwise they are
    derived from the pixel size of the matching entry in ``image_files`` and
    a DPI value (per-page, document wide or guessed, in that order).

    Args:

    * hocr_file: hOCR input, handed to ``hocr_page_iterator``
    * save_path (str): where the resulting PDF bytes are written
    * in_pdf: opened PDF whose page rectangles define the page sizes
    * image_files (list of str): image paths, indexed by hOCR page index
      (skipped pages included)
    * dpi: document wide DPI, may be None
    * skip_pages: container of hOCR page indexes to leave out
    * dpi_pages: per-page DPI values, indexed without the skipped pages
    * reporter (list of str): command that receives a JSON timing report on
      stdin
    * verbose (bool): print progress information
    * stop_after (int): stop once this many non-skipped pages were handled
    * render_text_lines (bool): forwarded to ``TessPDFRenderer``
    * jpeg2000_implementation: forwarded to ``get_jpeg2000_info``
    * errors (set): collects runtime warning constants, may be None

    ``debug`` and ``tmp_dir`` are accepted but not used by this function.

    Raises IndexError when the hOCR has more pages than image_files, and
    ValueError when neither in_pdf nor image_files is provided.
    """
    hocr_iter = hocr_page_iterator(hocr_file)

    render = TessPDFRenderer(render_text_lines=render_text_lines)
    render.BeginDocumentHandler()

    skipped_pages = 0

    last_time = time()
    reporting_page_count = 0

    if verbose:
        print('Starting page generation at', datetime.utcnow().isoformat())

    for idx, hocr_page in enumerate(hocr_iter):
        w, _ = hocr_page_get_dimensions(hocr_page)

        # If scan_res is not found in hOCR, it returns (None, None)
        hocr_dpi = hocr_page_get_scan_res(hocr_page)[1]

        if skip_pages is not None and idx in skip_pages:
            if verbose:
                print('Skipping page %d' % idx)
            skipped_pages += 1
            continue

        if stop_after is not None and (idx - skipped_pages) >= stop_after:
            break

        if in_pdf is not None:
            page = in_pdf[idx - skipped_pages]
            width = page.rect.width
            height = page.rect.height

            scaler = page.rect.width / w
            ppi = 72 / scaler
        elif image_files is not None:
            # Do not subtract skipped pages here
            try:
                imgfile = image_files[idx]
            except IndexError:
                raise IndexError('Number of pages in hOCR does not match number of images provided')

            if imgfile.endswith('.jp2'):
                size, _ = get_jpeg2000_info(imgfile, jpeg2000_implementation,
                                            errors)
                imwidth, imheight = size
            else:
                # Only the size is needed; close the file right away.
                with Image.open(imgfile) as img:
                    imwidth, imheight = img.size

            page_dpi = dpi

            per_page_dpi = None
            if dpi_pages is not None:
                try:
                    per_page_dpi = int(dpi_pages[idx - skipped_pages])
                    page_dpi = per_page_dpi
                except (IndexError, KeyError, TypeError, ValueError):
                    # Missing or unusable per-page value: keep item-wide dpi
                    # if available
                    pass

            # Both document level dpi is not available and per-page dpi is not
            # available, let's guesstimate
            # Assume european A4 (8.27",11.69") and guess DPI
            # to be one-of (72, 96, 150, 300, 600)
            if page_dpi is None:
                page_dpi = guess_dpi(imwidth, imheight,
                                     expected_format=(8.27, 11.69),
                                     round_to=(72, 96, 150, 300, 600))

            page_width = imwidth / (page_dpi / 72)
            if _pdfa_width_out_of_range(page_width):
                if verbose:
                    print('Page size invalid with current image size and dpi.')
                    print('Image size: %d, %d. DPI: %d' % (imwidth, imheight, page_dpi))

                # First let's try without per_page_dpi, if avail, then try to
                # guess the page dpi, if that also fails, then set to the
                # smallest allowed size
                if per_page_dpi is not None and dpi:
                    if verbose:
                        print('Trying document level dpi:', dpi)
                    page_width = imwidth / (dpi / 72)

                # If that didn't work, guess
                if _pdfa_width_out_of_range(page_width):
                    page_dpi = guess_dpi(imwidth, imheight,
                                         expected_format=(8.27, 11.69),
                                         round_to=(72, 96, 150, 300, 600))
                    if verbose:
                        print('Guessing DPI:', page_dpi)
                    page_width = imwidth / (page_dpi / 72)

                # If even guessing fails, let's just set minimal values since
                # this typically only happens for really tiny images. Only
                # the width is clamped; the height follows from the scale
                # factor computed below.
                if _pdfa_width_out_of_range(page_width):
                    page_width = PDFA_MIN_UNITS + 1

                # Add warning/error
                if errors is not None:
                    errors.add(RECODE_RUNTIME_WARNING_INVALID_PAGE_SIZE)

            scaler = page_width / imwidth

            ppi = 72. / scaler
            width = page_width
            height = imheight * scaler
        else:
            raise ValueError('Either in_pdf or image_files must be provided')

        if hocr_dpi is not None:
            font_scaler = hocr_dpi / ppi
        else:
            font_scaler = 72. / ppi

        word_data = hocr_page_to_word_data(hocr_page, font_scaler)
        render.AddImageHandler(word_data, width, height, ppi=ppi,
                               hocr_ppi=hocr_dpi)

        reporting_page_count += 1

    if verbose:
        print('Finished page generation at', datetime.utcnow().isoformat())
        print('Creating text pages took %.4f seconds' % (time() - last_time))

    if reporter and reporting_page_count != 0:
        current_time = time()
        ms = int(((current_time - last_time) / reporting_page_count) * 1000)

        data = json.dumps({'text_pages': {'count': reporting_page_count,
                                          'time-per': ms}})
        subprocess.check_output(reporter, input=data.encode('utf-8'))

    render.EndDocumentHandler()

    with open(save_path, 'wb+') as fp:
        fp.write(render._data)


def get_timing_summary(timing_data):
    """
    Average a list of (key, seconds) timing entries per page.

    The number of pages is the number of 'image_load' entries, since that
    step is expected to happen exactly once per page. If there is no such
    entry the totals are reported as if they belonged to a single page
    instead of dividing by zero.

    Returns a dict mapping each key to its per-page time as an int, in
    milliseconds (for statsd).
    """
    totals = {}
    page_count = 0

    for entry in timing_data:
        key, seconds = entry[0], entry[1]
        if key == 'image_load':
            page_count += 1
        totals[key] = totals.get(key, 0.) + seconds

    divisor = page_count or 1

    # For statsd, in ms
    return {key: int((total / divisor) * 1000)
            for key, total in totals.items()}


def insert_images_mrc(to_pdf, hocr_file, from_pdf=None, image_files=None,
                      dpi=None, dpi_pages=None, bg_compression_flags=None,
                      fg_compression_flags=None, skip_pages=None,
                      img_dir=None, jbig2=False, downsample=None,
                      bg_downsample=None, fg_downsample=None,
                      denoise_mask=None, reporter=None, hq_pages=None,
                      hq_bg_compression_flags=None,
                      hq_fg_compression_flags=None, verbose=False,
                      debug=False, tmp_dir=None, report_every=None,
                      stop_after=None, grayscale_pdf=False,
                      force_1bit_output=None, jpeg2000_implementation=None,
                      mrc_image_format=None, threads=None, errors=None):
    """
    Compress the page images and insert them into the pages of to_pdf.

    For every non-skipped hOCR page the source image is loaded (first image
    of the matching ``from_pdf`` page, or the matching ``image_files``
    entry), optionally converted to grayscale and downsampled, and then
    inserted in one of three ways:

    * images in mode '1' are encoded as a single mask image;
    * with ``force_1bit_output`` only the MRC mask (inverted) is inserted;
    * otherwise mask, background and foreground are encoded and inserted as
      a background image plus a masked foreground image.

    Args:

    * to_pdf: opened PDF that receives the images, one page per non-skipped
      hOCR page
    * hocr_file: hOCR input, handed to ``hocr_page_iterator``
    * from_pdf / image_files: image source, see above
    * dpi, dpi_pages: document wide and per-page DPI. The per-page value is
      preferred, then the hOCR scan resolution, then ``dpi``
    * skip_pages: container of hOCR page indexes to leave out
    * img_dir (str): if set, the encoded MRC files are copied there
    * jbig2 (bool): use the JBIG2 mask file instead of the PNG one
    * downsample: reduction applied to every page image
    * hq_pages (list of bool): per-page flag selecting the ``hq_*``
      compression flags and disabling bg/fg downsampling; None means no
      page is high quality
    * reporter (list of str), report_every (int): periodic JSON timing
      reports sent to the reporter command on stdin
    * stop_after (int): stop once this many non-skipped pages were handled
    * grayscale_pdf (bool): convert non-grayscale images to grayscale
    * errors (set): forwarded to ``create_mrc_hocr_components``

    The remaining arguments are forwarded to the MRC / JPEG2000 helpers.
    """
    hocr_iter = hocr_page_iterator(hocr_file)

    skipped_pages = 0

    last_time = time()
    timing_data = []
    reporting_page_count = 0

    for idx, hocr_page in enumerate(hocr_iter):
        if skip_pages is not None and idx in skip_pages:
            skipped_pages += 1
            continue

        idx = idx - skipped_pages

        if stop_after is not None and idx >= stop_after:
            break

        # Downsampling is tracked per page; carrying the flag over from the
        # previous page would leave every later page at full resolution.
        downsampled = False

        picked_dpi = None
        hocr_dpi = hocr_page_get_scan_res(hocr_page)
        if dpi_pages is not None:
            picked_dpi = dpi_pages[idx]
        if picked_dpi is None:
            picked_dpi = hocr_dpi[1]
        if picked_dpi is None:
            picked_dpi = dpi
        if picked_dpi is not None:
            picked_dpi = int(picked_dpi)

        page = to_pdf[idx]

        if from_pdf is not None:
            # TODO: Support more images and their masks, if they exist (and
            # write them to the right place in the PDF)
            t = time()
            xref = from_pdf[idx].get_images()[0][0]

            extracted = from_pdf.extract_image(xref)
            imgfd = io.BytesIO()
            imgfd.write(extracted["image"])
            image = Image.open(imgfd)
            # Decode fully before the backing buffer is closed
            image.load()
            imgfd.close()

            timing_data.append(('image_load', time() - t))
        else:
            t = time()
            # Do not subtract skipped pages here
            imgfile = image_files[idx + skipped_pages]

            # Potentially special path
            if imgfile.endswith(('.jp2', '.jpx')):
                image = decode_jpeg2000(imgfile, reduce_=downsample,
                                        impl=jpeg2000_implementation,
                                        threads=threads, debug=debug)
                if downsample:
                    downsampled = True
            else:
                image = Image.open(imgfile)
                image.load()

            # Drop the alpha channel
            if image.mode == 'RGBA':
                image = image.convert('RGB')
            elif image.mode == 'LA':
                image = image.convert('L')

            timing_data.append(('image_load', time() - t))

        if grayscale_pdf and image.mode not in ('L', 'LA'):
            t = time()
            image = Image.fromarray(special_gray_convert(np.array(image)))
            timing_data.append(('special_gray_convert', time() - t))

        render_hq = hq_pages[idx] if hq_pages is not None else False

        if downsample is not None and not downsampled:
            w, h = image.size
            image.thumbnail((w/downsample, h/downsample),
                            resample=Image.LANCZOS, reducing_gap=None)
            downsampled = True

        hocr_word_data = hocr_page_to_word_data(hocr_page)

        if image.mode == '1':
            ww, hh = image.size
            mask_jb2, mask_png = encode_mrc_mask(np.array(image),
                                                 tmp_dir=tmp_dir, jbig2=jbig2,
                                                 timing_data=timing_data,
                                                 debug=debug)

            t = time()
            mask_contents = _take_mask_contents(mask_jb2, mask_png, jbig2)

            page.insert_image(page.rect, stream=mask_contents,
                              width=ww, height=hh, alpha=0)

            timing_data.append(('page_image_insertion', time() - t))
        elif force_1bit_output == True:
            ww, hh = image.size
            mrc_gen = create_mrc_hocr_components(
                image, hocr_word_data,
                dpi=picked_dpi,
                downsample=downsample,
                bg_downsample=None if render_hq else bg_downsample,
                fg_downsample=None if render_hq else fg_downsample,
                denoise_mask=denoise_mask,
                timing_data=timing_data,
                errors=errors)

            # Only the first component (the mask) is used, inverted.
            np_mask = next(mrc_gen)
            np_mask = np_mask ^ np.ones(np_mask.shape, dtype=bool)

            mask_jb2, mask_png = encode_mrc_mask(np_mask, tmp_dir=tmp_dir,
                                                 jbig2=jbig2,
                                                 timing_data=timing_data,
                                                 debug=debug)

            # Time only the insertion, like the 1-bit branch above does
            t = time()
            mask_contents = _take_mask_contents(mask_jb2, mask_png, jbig2)

            page.insert_image(page.rect, stream=mask_contents,
                              width=ww, height=hh, alpha=0)

            timing_data.append(('page_image_insertion', time() - t))
        else:
            mrc_gen = create_mrc_hocr_components(
                image, hocr_word_data,
                dpi=picked_dpi,
                downsample=downsample,
                bg_downsample=None if render_hq else bg_downsample,
                fg_downsample=None if render_hq else fg_downsample,
                denoise_mask=denoise_mask,
                timing_data=timing_data,
                errors=errors)

            # TODO: keep all these files on disk, and insert them into the
            # pager later? maybe? or just saveIncr()

            # TODO: maybe call the encode_mrc_{mask,foreground,background}
            # separately from here so that we can free the arrays sooner (and
            # even get the images separately from the
            # create_mrc_hocr_components call)
            fast_insert_image_ok = jbig2 and image.mode in ('L', 'RGB')

            mask_f, bg_f, bg_s, fg_f, fg_s = encode_mrc_images(
                mrc_gen,
                bg_compression_flags=(hq_bg_compression_flags if render_hq
                                      else bg_compression_flags),
                fg_compression_flags=(hq_fg_compression_flags if render_hq
                                      else fg_compression_flags),
                tmp_dir=tmp_dir, jbig2=jbig2,
                timing_data=timing_data,
                jpeg2000_implementation=jpeg2000_implementation,
                mrc_image_format=mrc_image_format,
                embedded_jbig2=fast_insert_image_ok,
                threads=threads,
                debug=debug)

            if img_dir is not None:
                shutil.copy(mask_f, join(img_dir, '%.6d_mask.jbig2' % idx))
                shutil.copy(bg_f, join(img_dir, '%.6d_bg.jp2' % idx))
                shutil.copy(fg_f, join(img_dir, '%.6d_fg.jp2' % idx))

            t = time()

            bg_contents = _read_binary_file(bg_f)
            if not fast_insert_image_ok:
                # Tell PyMuPDF about width/height/alpha since it's faster
                # this way
                page.insert_image(page.rect, stream=bg_contents, mask=None,
                                  overlay=False, width=bg_s[0],
                                  height=bg_s[1], alpha=0)
            else:
                fast_insert_image(page, page.rect, stream=bg_contents,
                                  mask=None, width=bg_s[0], height=bg_s[1],
                                  stream_fmt=mrc_image_format,
                                  gray=image.mode == 'L')

            fg_contents = _read_binary_file(fg_f)
            mask_contents = _read_binary_file(mask_f)

            if not fast_insert_image_ok:
                # Tell PyMuPDF about width/height/alpha since it's faster
                # this way
                page.insert_image(page.rect, stream=fg_contents,
                                  mask=mask_contents, overlay=True,
                                  width=fg_s[0], height=fg_s[1], alpha=0)
            else:
                fast_insert_image(page, page.rect, stream=fg_contents,
                                  mask=mask_contents, width=fg_s[0],
                                  height=fg_s[1],
                                  stream_fmt=mrc_image_format,
                                  gray=image.mode == 'L')

            # Remove leftover files
            remove(mask_f)
            remove(bg_f)
            remove(fg_f)

            timing_data.append(('page_image_insertion', time() - t))

        reporting_page_count += 1

        if report_every is not None and reporting_page_count % report_every == 0:
            print('Processed %d PDF pages.' % (idx + 1))
            sys.stdout.flush()

            timing_sum = get_timing_summary(timing_data)
            timing_data = []

            if reporter:
                current_time = time()
                ms = int(((current_time - last_time) / reporting_page_count) * 1000)

                data = json.dumps({'compress_pages': {'count': reporting_page_count,
                                                      'time-per': ms},
                                   'page_time_breakdown': timing_sum})
                subprocess.check_output(reporter, input=data.encode('utf-8'))

            # Reset chunk timer
            last_time = time()
            # Reset chunk counter
            reporting_page_count = 0

    # Report whatever is left of the last, incomplete chunk
    if reporter and reporting_page_count != 0:
        current_time = time()
        ms = int(((current_time - last_time) / reporting_page_count) * 1000)

        timing_sum = get_timing_summary(timing_data)

        data = json.dumps({'compress_pages': {'count': reporting_page_count,
                                              'time-per': ms},
                           'page_time_breakdown': timing_sum})
        subprocess.check_output(reporter, input=data.encode('utf-8'))

    if verbose:
        summary = get_timing_summary(timing_data)
        print('MRC time breakdown:', summary)


def insert_images(from_pdf, to_pdf, mode, report_every=None, stop_after=None):
    """
    Copy the page images of from_pdf onto the pages of to_pdf without MRC.

    Args:

    * from_pdf: opened PDF to take the images from
    * to_pdf: opened PDF that receives one image per page
    * mode: IMAGE_MODE_PASSTHROUGH inserts the extracted image stream as-is,
      IMAGE_MODE_PIXMAP inserts a ``fitz.Pixmap`` of the image; any other
      value inserts nothing
    * report_every (int): print progress whenever the page index is a
      multiple of this value
    * stop_after (int): stop after the page with this index was handled
    """
    # TODO: This hasn't been updated, should fix this up, only MRC is tested
    # really.
    # TODO: implement img_dir here
    for idx, page in enumerate(to_pdf):
        # XXX: TODO: FIXME: MEGAHACK: For some reason the _imgonly PDFs
        # generated by us have all images on all pages according to pymupdf, so
        # hack around that for now.
        img = sorted(from_pdf.get_page_images(idx))[idx]

        xref = img[0]
        if mode == IMAGE_MODE_PASSTHROUGH:
            image = from_pdf.extract_image(xref)
            page.insert_image(page.rect, stream=image["image"], overlay=False)
        elif mode == IMAGE_MODE_PIXMAP:
            pixmap = fitz.Pixmap(from_pdf, xref)
            page.insert_image(page.rect, pixmap=pixmap, overlay=False)

        if stop_after is not None and idx >= stop_after:
            break

        if report_every is not None and idx % report_every == 0:
            print('Processed %d PDF pages.' % (idx + 1))
            sys.stdout.flush()


# TODO: Document these options (like in bin/recode_pdf)
def recode(from_pdf=None, from_imagestack=None, dpi=None, hocr_file=None,
           scandata_file=None, out_pdf=None, out_dir=None, reporter=None,
           grayscale_pdf=False, force_1bit_output=False,
           image_mode=IMAGE_MODE_MRC, jbig2=False, verbose=False, debug=False,
           tmp_dir=None, report_every=None, stop_after=None,
           jpeg2000_implementation=JPEG2000_IMPL_PILLOW,
           bg_compression_flags=None, fg_compression_flags=None,
           mrc_image_format=None, downsample=None, bg_downsample=None,
           fg_downsample=None, denoise_mask=None, hq_pages=None,
           hq_bg_compression_flags=None, hq_fg_compression_flags=None,
           threads=None, render_text_lines=False, metadata_url=None,
           metadata_title=None, metadata_author=None, metadata_creator=None,
           metadata_language=None, metadata_subject=None,
           metadata_creatortool=None, ignore_invalid_pagenumbers=False):
    """
    Build a PDF with a hOCR text layer and (compressed) page images.

    The steps are: create a text-only PDF from the hOCR in a temporary file,
    insert the page images according to ``image_mode``, add PDF/A data, page
    labels and table of contents (when scandata is given), basic PDF/UA data
    and metadata, and save the result to ``out_pdf``.

    Args:

    * from_pdf (str): path of a PDF to take images and page sizes from
    * from_imagestack (str): glob pattern matching the page images, used in
      sorted order
    * dpi: DPI of the images; a document DPI found in ``scandata_file``
      takes precedence
    * hocr_file: hOCR input
    * scandata_file: scandata XML providing skipped pages, per-page DPI,
      document DPI, page labels and table of contents
    * out_pdf (str): path of the PDF to write
    * out_dir (str): if set, the encoded MRC files of each page are copied
      there
    * reporter (str): space separated command that receives JSON timing
      reports on stdin
    * image_mode: 2 inserts MRC compressed images, 0 and 1 insert the images
      of ``from_pdf`` unmodified or as pixmap, 3 inserts no images
    * hq_pages (str): comma separated page numbers to encode with the
      ``hq_*`` compression flags; positive numbers are 1-indexed, negative
      numbers count from the end, out of range numbers are ignored
    * stop_after (int): limit on the number of pages to process
    * metadata_*: values written to the PDF metadata; the first entry of
      ``metadata_language`` is also used as the document language
    * ignore_invalid_pagenumbers (bool): forwarded to ``write_page_labels``

    The remaining arguments are forwarded unchanged to
    ``create_tess_textonly_pdf`` and ``insert_images_mrc``.

    Returns a dict with the keys 'errors' (set of runtime warning constants)
    and 'compression_ratio' (input size divided by output size).
    """
    # TODO: document that the scandata document dpi will override the dpi arg
    # TODO: Take hq-pages and reporter arg and change format (as lib call we
    # don't want to pass that as one string, I guess?)

    errors = set()

    in_pdf = fitz.open(from_pdf) if from_pdf else None
    image_files = sorted(glob(from_imagestack)) if from_imagestack else None

    stop = stop_after
    if stop is not None:
        stop -= 1

    if verbose:
        # Purely informational, so a numpy without this private module must
        # not abort the conversion.
        try:
            from numpy.core._multiarray_umath import __cpu_features__ as cpu_features
        except ImportError:
            cpu_features = {}
        for k, v in cpu_features.items():
            if v:
                print('\t', k)

    reporter = reporter.split(' ') if reporter else None  # TODO: overriding

    start_time = time()

    # Figure out if we have scandata, and figure out if we want to skip pages
    # based on scandata.
    skip_pages = []
    dpi_pages = None
    if scandata_file is not None:
        skip_pages = scandata_xml_get_skip_pages(scandata_file)
        dpi_pages = scandata_xml_get_dpi_per_page(scandata_file)
        scandata_doc_dpi = scandata_xml_get_document_dpi(scandata_file)

        if scandata_doc_dpi is not None:
            # Let's prefer the DPI in the scandata file over the provided DPI
            dpi = scandata_doc_dpi

    # XXX: Maybe use a buffer, since the file is typically quite small
    fd, tess_tmp_path = mkstemp(prefix='pdfrenderer', suffix='.pdf',
                                dir=tmp_dir)
    os.close(fd)

    outdoc = None
    try:
        if verbose:
            print('Creating text only PDF')

        # 1. Create text-only PDF from hOCR first, but honour page sizes of
        # in_pdf
        create_tess_textonly_pdf(hocr_file, tess_tmp_path, in_pdf=in_pdf,
                                 image_files=image_files, dpi=dpi,
                                 skip_pages=skip_pages, dpi_pages=dpi_pages,
                                 reporter=reporter,
                                 verbose=verbose, debug=debug,
                                 stop_after=stop,
                                 render_text_lines=render_text_lines,
                                 tmp_dir=tmp_dir,
                                 jpeg2000_implementation=jpeg2000_implementation,
                                 errors=errors)

        if verbose:
            print('Inserting (and compressing) images')

        # 2. Load tesseract PDF and stick images in the PDF
        # We open the generated file but do not modify it in place
        outdoc = fitz.open(tess_tmp_path)

        HQ_PAGES = _parse_hq_pages(hq_pages, outdoc.page_count)

        if verbose:
            print('Converting with image mode:', image_mode)

        # NOTE(review): the literals below presumably correspond to the
        # IMAGE_MODE_* constants; confirm against internetarchivepdf.const.
        if image_mode == 2:
            insert_images_mrc(outdoc, hocr_file,
                              from_pdf=in_pdf,
                              image_files=image_files,
                              dpi=dpi,
                              dpi_pages=dpi_pages,
                              bg_compression_flags=bg_compression_flags,
                              fg_compression_flags=fg_compression_flags,
                              skip_pages=skip_pages,
                              img_dir=out_dir,
                              jbig2=jbig2,
                              downsample=downsample,
                              bg_downsample=bg_downsample,
                              fg_downsample=fg_downsample,
                              denoise_mask=denoise_mask,
                              reporter=reporter,
                              hq_pages=HQ_PAGES,
                              hq_bg_compression_flags=hq_bg_compression_flags,
                              hq_fg_compression_flags=hq_fg_compression_flags,
                              verbose=verbose,
                              debug=debug,
                              tmp_dir=tmp_dir,
                              report_every=report_every,
                              stop_after=stop,
                              grayscale_pdf=grayscale_pdf,
                              force_1bit_output=force_1bit_output,
                              jpeg2000_implementation=jpeg2000_implementation,
                              mrc_image_format=mrc_image_format,
                              threads=threads,
                              errors=errors)
        elif image_mode in (0, 1):
            # TODO: Update this codepath
            insert_images(in_pdf, outdoc, mode=image_mode,
                          report_every=report_every, stop_after=stop)
        elif image_mode == 3:
            # 3 = skip
            pass

        # 3. Add PDF/A compliant data
        write_pdfa(outdoc)

        if scandata_file is not None:
            # XXX: we parse scandata twice now, let's not do that
            # 3b. Write page labels from scandata file, if present
            write_page_labels(outdoc, scandata_file, errors=errors,
                              ignore_invalid=ignore_invalid_pagenumbers)
            write_pdf_toc(outdoc, scandata_file)

        lang_if_any = metadata_language[0] if metadata_language else None
        write_basic_ua(outdoc, language=lang_if_any)

        # 4. Write metadata
        # Only options that were actually provided end up in the metadata.
        extra_metadata = {}
        if metadata_url:
            extra_metadata['url'] = metadata_url
        if metadata_title:
            extra_metadata['title'] = metadata_title
        if metadata_creator:
            extra_metadata['creator'] = metadata_creator
        if metadata_author:
            extra_metadata['author'] = metadata_author
        if metadata_language:
            extra_metadata['language'] = metadata_language
        if metadata_subject:
            extra_metadata['subject'] = metadata_subject
        if metadata_creatortool:
            extra_metadata['creatortool'] = metadata_creatortool
        write_metadata(in_pdf, outdoc, extra_metadata=extra_metadata)

        # 5. Save
        mupdf_warnings = fitz.TOOLS.mupdf_warnings()
        if mupdf_warnings:
            print('mupdf warnings:', repr(mupdf_warnings))
        if verbose:
            print('Saving PDF now')

        t = time()
        outdoc.save(out_pdf, deflate=True, pretty=True)
        save_time_ms = int((time() - t)*1000)

        if reporter:
            data = json.dumps({'time_to_save': {'time': save_time_ms}})
            subprocess.check_output(reporter, input=data.encode('utf-8'))

        end_time = time()
        print('Processed %d pages at %.2f seconds/page' % (len(outdoc),
              (end_time - start_time) / len(outdoc)))

        if from_pdf is not None:
            oldsize = os.path.getsize(from_pdf)
        else:
            # Sum the size of the input images that were not skipped
            bytesum = 0
            skipped_pages = 0
            for idx, fname in enumerate(image_files):
                if skip_pages is not None and idx in skip_pages:
                    skipped_pages += 1
                    continue

                if stop_after is not None and (idx - skipped_pages) > stop_after:
                    break

                bytesum += os.path.getsize(fname)

            oldsize = bytesum

        newsize = os.path.getsize(out_pdf)
        compression_ratio = oldsize / newsize
        if verbose:
            print('Compression ratio: %f' % (compression_ratio))

        return {'errors': errors,
                'compression_ratio': compression_ratio}
    finally:
        # 6. Remove leftover files, also when one of the steps above failed
        if outdoc is not None:
            outdoc.close()
        if in_pdf is not None:
            in_pdf.close()
        try:
            remove(tess_tmp_path)
        except FileNotFoundError:
            # Nothing left to clean up
            pass