# Source code for internetarchivepdf.recode

# archive-pdf-tools
# Copyright (C) 2020-2021, Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Author: Merlijn Boris Wolf Wajer <merlijn@archive.org>
import sys
import os
import subprocess
from os import remove
from time import time
from datetime import datetime
from tempfile import mkstemp
from os.path import join
import shutil
import json
from glob import glob
import re
import io


from PIL import Image
import numpy as np
import fitz

from hocr.parse import (hocr_page_iterator, hocr_page_to_word_data,
        hocr_page_get_dimensions, hocr_page_get_scan_res)
from internetarchivepdf.mrc import create_mrc_hocr_components, \
        encode_mrc_images, encode_mrc_mask
from internetarchivepdf.grayconvert import special_gray_convert
from internetarchivepdf.pdfhacks import fast_insert_image, write_pdfa, \
        write_page_labels, write_basic_ua, write_metadata, write_pdf_toc
from internetarchivepdf.pdfrenderer import TessPDFRenderer
from internetarchivepdf.scandata import scandata_xml_get_skip_pages, \
        scandata_xml_get_page_numbers, scandata_xml_get_dpi_per_page, \
        scandata_xml_get_document_dpi
from internetarchivepdf.jpeg2000 import decode_jpeg2000, get_jpeg2000_info
from internetarchivepdf.const import (IMAGE_MODE_PASSTHROUGH, IMAGE_MODE_PIXMAP,
        IMAGE_MODE_MRC, RECODE_RUNTIME_WARNING_INVALID_PAGE_SIZE,
        RECODE_RUNTIME_WARNING_INVALID_PAGE_NUMBERS,
        RECODE_RUNTIME_WARNING_INVALID_JP2_HEADERS, JPEG2000_IMPL_KAKADU,
        JPEG2000_IMPL_OPENJPEG, JPEG2000_IMPL_GROK, JPEG2000_IMPL_PILLOW,
        COMPRESSOR_JPEG2000, COMPRESSOR_JPEG)

# Exclusive bounds for a page width expressed in PDF units (1/72 inch).
# Widths computed from image size and DPI that fall outside this range are
# treated as invalid by create_tess_textonly_pdf.
PDFA_MIN_UNITS = 3
PDFA_MAX_UNITS = 14400

# Raise Pillow's pixel-count limit so that large scans can be opened.
# NOTE(review): presumably chosen to cover the largest expected scans while
# still guarding against decompression bombs -- confirm.
Image.MAX_IMAGE_PIXELS = 625000000



# [docs]
def guess_dpi(w, h, expected_format=(8.27, 11.69), round_to=(72, 96, 150, 300, 600)):
    """
    Guesstimate DPI for a given image.

    Args:

    * w (int): width of the image
    * h (int): height of the image
    * expected_format (tuple): (width_inch, height_inch) of expected document,
                               defaults to european A4.
    * round_to (sequence of int): Acceptable DPI values, must not be empty.
                                  Defaults to (72, 96, 150, 300, 600)

    Returns the entry of round_to that is closest to the DPI implied by the
    image size. When several candidates are equally close, the one listed
    first in round_to wins.

    Raises ValueError if round_to is empty.
    """
    w_dpi = w / expected_format[0]
    h_dpi = h / expected_format[1]

    # min() returns the first minimal element, which matches picking the
    # head of a stable sort on the same key.
    return min(round_to,
               key=lambda dpi: abs(w_dpi - dpi) + abs(h_dpi - dpi))



def create_tess_textonly_pdf(hocr_file, save_path, in_pdf=None,
        image_files=None, dpi=None, skip_pages=None, dpi_pages=None,
        reporter=None,
        verbose=False, debug=False, stop_after=None,
        render_text_lines=False,
        tmp_dir=None,
        jpeg2000_implementation=None,
        errors=None):
    """
    Render the words of every hOCR page into a text-only PDF.

    Page sizes are taken from in_pdf when it is given. Otherwise they are
    derived from the pixel size of the matching entry in image_files and a
    DPI value: the per-page DPI, else the document DPI, else a guess.

    Args:

    * hocr_file: hOCR input handed to hocr_page_iterator
    * save_path (str): path the resulting PDF is written to
    * in_pdf: document indexable by page number whose pages expose `rect`
    * image_files (list of str): image paths, indexed by hOCR page index
    * dpi: document level DPI, may be None
    * skip_pages: container of hOCR page indexes to leave out
    * dpi_pages: per-page DPI values, indexed by output page index
    * reporter: command (argument list) that receives timing JSON on stdin
    * verbose (bool): print progress information
    * debug: not used by this function
    * stop_after (int): stop once this many pages have been rendered
    * render_text_lines (bool): passed on to TessPDFRenderer
    * tmp_dir: not used by this function
    * jpeg2000_implementation: passed on to get_jpeg2000_info
    * errors (set): collects runtime warning codes, may be None

    Raises IndexError when there are more hOCR pages than image files and
    ValueError when neither in_pdf nor image_files is provided.
    """
    hocr_iter = hocr_page_iterator(hocr_file)

    render = TessPDFRenderer(render_text_lines=render_text_lines)
    render.BeginDocumentHandler()

    skipped_pages = 0

    last_time = time()
    reporting_page_count = 0

    if verbose:
        print('Starting page generation at', datetime.utcnow().isoformat())

    for idx, hocr_page in enumerate(hocr_iter):
        w, h = hocr_page_get_dimensions(hocr_page)
        hocr_dpi = hocr_page_get_scan_res(hocr_page)
        # If scan_res is not found in hOCR, it returns (None, None)
        hocr_dpi = hocr_dpi[1]

        if skip_pages is not None and idx in skip_pages:
            if verbose:
                print('Skipping page %d' % idx)
            skipped_pages += 1
            continue

        if stop_after is not None and (idx - skipped_pages) >= stop_after:
            break

        if in_pdf is not None:
            page = in_pdf[idx - skipped_pages]
            width = page.rect.width
            height = page.rect.height

            scaler = page.rect.width / w
            ppi = 72 / scaler
        elif image_files is not None:
            # Do not subtract skipped pages here
            try:
                imgfile = image_files[idx]
            except IndexError:
                raise IndexError('Number of pages in hOCR does not match number of images provided')

            if imgfile.endswith('.jp2'):
                size, _ = get_jpeg2000_info(imgfile, jpeg2000_implementation, errors)
                imwidth, imheight = size
            else:
                # Only the header is needed; close the file right away
                with Image.open(imgfile) as img:
                    imwidth, imheight = img.size

            page_dpi = dpi
            per_page_dpi = None

            if dpi_pages is not None:
                try:
                    per_page_dpi = int(dpi_pages[idx - skipped_pages])
                    page_dpi = per_page_dpi
                except (LookupError, TypeError, ValueError):
                    # Missing or unparsable entry: keep item-wide dpi if
                    # available
                    pass

            # Both document level dpi is not available and per-page dpi is not
            # available, let's guesstimate
            # Assume european A4 (8.27",11.69") and guess DPI
            # to be one-of (72, 96, 150, 300, 600)
            if page_dpi is None:
                page_dpi = guess_dpi(imwidth, imheight,
                                     expected_format=(8.27, 11.69),
                                     round_to=(72, 96, 150, 300, 600))

            page_width = imwidth / (page_dpi / 72)
            if page_width <= PDFA_MIN_UNITS or page_width >= PDFA_MAX_UNITS:
                if verbose:
                    print('Page size invalid with current image size and dpi.')
                    print('Image size: %d, %d. DPI: %d' % (imwidth, imheight,
                                                           page_dpi))

                # First let's try without per_page_dpi, is avail, then try to
                # guess the page dpi, if that also fails, then set to min
                # or max allowed size
                if per_page_dpi is not None and dpi:
                    if verbose:
                        print('Trying document level dpi:', dpi)
                    page_width = imwidth / (dpi / 72)

                # If that didn't work, guess
                if page_width <= PDFA_MIN_UNITS or page_width >= PDFA_MAX_UNITS:
                    page_dpi = guess_dpi(imwidth, imheight,
                                         expected_format=(8.27, 11.69),
                                         round_to=(72, 96, 150, 300, 600))
                    if verbose:
                        # Report the value that was actually guessed
                        print('Guessing DPI:', page_dpi)
                    page_width = imwidth / (page_dpi / 72)

                # If even guessing fails, let's just set minimal values since
                # this typically only happens for really tiny images.
                # The page height is not set here: it follows from the
                # width through the scaler computed below.
                if page_width <= PDFA_MIN_UNITS or page_width >= PDFA_MAX_UNITS:
                    page_width = PDFA_MIN_UNITS + 1

                # Add warning/error
                if errors is not None:
                    errors.add(RECODE_RUNTIME_WARNING_INVALID_PAGE_SIZE)

            scaler = page_width / imwidth

            ppi = 72. / scaler

            width = page_width
            height = imheight * scaler
        else:
            raise ValueError('Either in_pdf or image_files must be provided')

        if hocr_dpi is not None:
            font_scaler = hocr_dpi / ppi
        else:
            font_scaler = 72. / ppi

        word_data = hocr_page_to_word_data(hocr_page, font_scaler)
        render.AddImageHandler(word_data, width, height, ppi=ppi, hocr_ppi=hocr_dpi)

        reporting_page_count += 1

    if verbose:
        print('Finished page generation at', datetime.utcnow().isoformat())
        print('Creating text pages took %.4f seconds' % (time() - last_time))

    if reporter and reporting_page_count != 0:
        current_time = time()
        ms = int(((current_time - last_time) / reporting_page_count) * 1000)

        data = json.dumps({'text_pages': {'count': reporting_page_count,
                                              'time-per': ms}})
        subprocess.check_output(reporter, input=data.encode('utf-8'))

    render.EndDocumentHandler()

    with open(save_path, 'wb+') as fp:
        fp.write(render._data)


def get_timing_summary(timing_data):
    """
    Reduce raw timing samples to an average per page, in milliseconds.

    Args:

    * timing_data (iterable): (key, seconds) entries. Every 'image_load'
                              entry is counted as one processed page.

    Returns a dict mapping each key to its summed duration divided by the
    number of pages, converted to whole milliseconds (for statsd). When no
    'image_load' entry is present the sums are returned undivided instead
    of raising ZeroDivisionError.
    """
    totals = {}

    # We expect this to always happen per page
    page_count = 0

    for entry in timing_data:
        key = entry[0]
        seconds = entry[1]

        if key == 'image_load':
            page_count += 1

        totals[key] = totals.get(key, 0.) + seconds

    # Without any page marker there is nothing to average over
    divisor = page_count if page_count else 1

    # For statsd, in ms
    return {key: int((total / divisor) * 1000)
            for key, total in totals.items()}



def _read_binary_file(path):
    """Return the complete contents of the file at path as bytes."""
    with open(path, 'rb') as fp:
        return fp.read()


def _insert_bitonal_image(page, bitmap, size, tmp_dir=None, jbig2=False,
        timing_data=None, debug=False):
    """
    Encode a 1-bit bitmap with encode_mrc_mask and insert it over the page.

    Args:

    * page: PDF page the image is inserted into (covers page.rect)
    * bitmap: array handed to encode_mrc_mask
    * size (tuple): (width, height) of the bitmap in pixels
    * tmp_dir, jbig2, timing_data, debug: passed on to encode_mrc_mask

    The temporary files produced by the encoder are removed afterwards.
    """
    mask_jb2, mask_png = encode_mrc_mask(bitmap, tmp_dir=tmp_dir, jbig2=jbig2,
            timing_data=timing_data, debug=debug)

    # Only time reading and inserting; encoding is timed by the encoder
    t = time()

    if jbig2:
        mask_contents = _read_binary_file(mask_jb2)
        remove(mask_jb2)
    else:
        mask_contents = _read_binary_file(mask_png)

    # We currently always return the PNG file
    remove(mask_png)

    page.insert_image(page.rect, stream=mask_contents,
            width=size[0], height=size[1], alpha=0)

    if timing_data is not None:
        timing_data.append(('page_image_insertion', time() - t))


def insert_images_mrc(to_pdf, hocr_file, from_pdf=None, image_files=None,
        dpi=None, dpi_pages=None,
        bg_compression_flags=None, fg_compression_flags=None,
        skip_pages=None, img_dir=None, jbig2=False,
        downsample=None,
        bg_downsample=None,
        fg_downsample=None,
        denoise_mask=None, reporter=None,
        hq_pages=None, hq_bg_compression_flags=None, hq_fg_compression_flags=None,
        verbose=False, debug=False, tmp_dir=None, report_every=None,
        stop_after=None, grayscale_pdf=False,
        force_1bit_output=None,
        jpeg2000_implementation=None, mrc_image_format=None, threads=None,
        errors=None):
    """
    Compress the page images and insert them into the pages of to_pdf.

    For every hOCR page that is not skipped the page image is loaded (from
    from_pdf when given, otherwise from image_files) and then inserted in
    one of three ways: 1-bit images are inserted as a single bitonal image,
    force_1bit_output inserts only the inverted MRC mask, and everything
    else is split into MRC background, foreground and mask.

    Args:

    * to_pdf: output document, indexed by output page index
    * hocr_file: hOCR input handed to hocr_page_iterator
    * from_pdf: source document; the first image of each page is used
    * image_files (list of str): image paths, indexed by hOCR page index
    * dpi: document level DPI, used when no per-page value is found
    * dpi_pages: per-page DPI values, indexed by output page index
    * bg_compression_flags, fg_compression_flags: encoder flags for
      regular pages
    * skip_pages: container of hOCR page indexes to leave out
    * img_dir (str): when set, the encoded MRC files are copied there
    * jbig2 (bool): passed to the encoders; also enables fast_insert_image
      for 'L' and 'RGB' images
    * downsample: factor by which the page image is scaled down
    * bg_downsample, fg_downsample: passed to create_mrc_hocr_components
      for pages that are not marked high quality
    * denoise_mask: passed to create_mrc_hocr_components
    * reporter: command (argument list) that receives timing JSON on stdin
    * hq_pages (list of bool): per output page high quality marker; None
      means no page is high quality
    * hq_bg_compression_flags, hq_fg_compression_flags: encoder flags for
      high quality pages
    * verbose (bool): print the timing breakdown at the end
    * debug, tmp_dir, threads, jpeg2000_implementation, mrc_image_format:
      passed on to the decoding/encoding helpers
    * report_every (int): print (and report) progress every N pages
    * stop_after (int): stop once this many pages have been processed
    * grayscale_pdf (bool): convert images to grayscale before encoding
    * force_1bit_output (bool): emit only the bitonal mask for each page
    * errors (set): passed on to create_mrc_hocr_components
    """
    hocr_iter = hocr_page_iterator(hocr_file)

    skipped_pages = 0

    last_time = time()
    timing_data = []
    reporting_page_count = 0

    for hocr_idx, hocr_page in enumerate(hocr_iter):
        if skip_pages is not None and hocr_idx in skip_pages:
            skipped_pages += 1
            continue

        # Index of the page in the output PDF (skipped pages do not exist
        # there); hocr_idx keeps addressing the unfiltered inputs.
        idx = hocr_idx - skipped_pages

        if stop_after is not None and idx >= stop_after:
            break

        # Must be reset for every page, otherwise only the first page that
        # needs manual downscaling would ever be downscaled.
        downsampled = False

        hocr_dpi = hocr_page_get_scan_res(hocr_page)

        picked_dpi = None
        if dpi_pages is not None:
            try:
                picked_dpi = dpi_pages[idx]
            except LookupError:
                # No entry for this page: fall back like for a None entry
                picked_dpi = None
            if picked_dpi is None:
                picked_dpi = hocr_dpi[1]

        if picked_dpi is None:
            picked_dpi = dpi

        if picked_dpi is not None:
            picked_dpi = int(picked_dpi)

        page = to_pdf[idx]

        t = time()
        if from_pdf is not None:
            # TODO: Support more images and their masks, if they exist (and
            # write them to the right place in the PDF)
            xref = from_pdf[idx].get_images()[0][0]

            extracted = from_pdf.extract_image(xref)
            with io.BytesIO(extracted["image"]) as imgfd:
                image = Image.open(imgfd)
                # Decode fully before the buffer is closed
                image.load()
        else:
            # Do not subtract skipped pages here
            imgfile = image_files[hocr_idx]

            # Potentially special path
            if imgfile.endswith(('.jp2', '.jpx')):
                image = decode_jpeg2000(imgfile, reduce_=downsample,
                        impl=jpeg2000_implementation, threads=threads, debug=debug)
                if downsample:
                    downsampled = True
            else:
                image = Image.open(imgfile)
                image.load()

            # Drop the alpha channel
            if image.mode == 'RGBA':
                image = image.convert('RGB')
            elif image.mode == 'LA':
                image = image.convert('L')

        if timing_data is not None:
            timing_data.append(('image_load', time()-t))

        if grayscale_pdf and image.mode not in ('L', 'LA'):
            t = time()
            image = Image.fromarray(special_gray_convert(np.array(image)))
            if timing_data is not None:
                timing_data.append(('special_gray_convert', time()-t))

        render_hq = bool(hq_pages[idx]) if hq_pages is not None else False

        if downsample is not None and not downsampled:
            w, h = image.size
            image.thumbnail((w/downsample, h/downsample),
                            resample=Image.LANCZOS, reducing_gap=None)
            downsampled = True

        hocr_word_data = hocr_page_to_word_data(hocr_page)

        if image.mode == '1':
            # Already bitonal: insert as-is
            _insert_bitonal_image(page, np.array(image), image.size,
                    tmp_dir=tmp_dir, jbig2=jbig2, timing_data=timing_data,
                    debug=debug)

        elif force_1bit_output:
            mrc_gen = create_mrc_hocr_components(image, hocr_word_data,
                    dpi=picked_dpi,
                    downsample=downsample,
                    bg_downsample=None if render_hq else bg_downsample,
                    fg_downsample=None if render_hq else fg_downsample,
                    denoise_mask=denoise_mask,
                    timing_data=timing_data, errors=errors)
            # Only the first item yielded by the generator (the mask) is used
            np_mask = next(mrc_gen)
            # Invert the mask
            np_mask = np_mask ^ np.ones(np_mask.shape, dtype=bool)

            _insert_bitonal_image(page, np_mask, image.size,
                    tmp_dir=tmp_dir, jbig2=jbig2, timing_data=timing_data,
                    debug=debug)
        else:
            mrc_gen = create_mrc_hocr_components(image, hocr_word_data,
                    dpi=picked_dpi,
                    downsample=downsample,
                    bg_downsample=None if render_hq else bg_downsample,
                    fg_downsample=None if render_hq else fg_downsample,
                    denoise_mask=denoise_mask,
                    timing_data=timing_data, errors=errors)

            # TODO: keep all these files on disk, and insert them into the pager
            # later? maybe? or just saveIncr()
            # TODO: maybe call the encode_mrc_{mask,foreground,background}
            # separately from here so that we can free the arrays sooner (and even
            # get the images separately from the create_mrc_hocr_components call)

            fast_insert_image_ok = jbig2 and image.mode in ('L', 'RGB')

            mask_f, bg_f, bg_s, fg_f, fg_s = encode_mrc_images(mrc_gen,
                    bg_compression_flags=hq_bg_compression_flags if render_hq else bg_compression_flags,
                    fg_compression_flags=hq_fg_compression_flags if render_hq else fg_compression_flags,
                    tmp_dir=tmp_dir, jbig2=jbig2, timing_data=timing_data,
                    jpeg2000_implementation=jpeg2000_implementation,
                    mrc_image_format=mrc_image_format,
                    embedded_jbig2=fast_insert_image_ok,
                    threads=threads,
                    debug=debug)

            if img_dir is not None:
                shutil.copy(mask_f, join(img_dir, '%.6d_mask.jbig2' % idx))
                shutil.copy(bg_f, join(img_dir, '%.6d_bg.jp2' % idx))
                shutil.copy(fg_f, join(img_dir, '%.6d_fg.jp2' % idx))

            t = time()
            bg_contents = _read_binary_file(bg_f)
            if fast_insert_image_ok:
                fast_insert_image(page, page.rect, stream=bg_contents,
                                  mask=None, width=bg_s[0], height=bg_s[1],
                                  stream_fmt=mrc_image_format,
                                  gray=image.mode == 'L')
            else:
                # Tell PyMuPDF about width/height/alpha since it's faster this way
                page.insert_image(page.rect, stream=bg_contents, mask=None,
                    overlay=False, width=bg_s[0], height=bg_s[1], alpha=0)

            fg_contents = _read_binary_file(fg_f)
            mask_contents = _read_binary_file(mask_f)

            if fast_insert_image_ok:
                fast_insert_image(page, page.rect, stream=fg_contents,
                                  mask=mask_contents, width=fg_s[0], height=fg_s[1],
                                  stream_fmt=mrc_image_format,
                                  gray=image.mode == 'L')
            else:
                # Tell PyMuPDF about width/height/alpha since it's faster this way
                page.insert_image(page.rect, stream=fg_contents, mask=mask_contents,
                        overlay=True, width=fg_s[0], height=fg_s[1], alpha=0)

            # Remove leftover files
            remove(mask_f)
            remove(bg_f)
            remove(fg_f)
            if timing_data is not None:
                timing_data.append(('page_image_insertion', time() - t))

        reporting_page_count += 1

        if report_every is not None and reporting_page_count % report_every == 0:
            print('Processed %d PDF pages.' % (idx + 1))
            sys.stdout.flush()

            timing_sum = get_timing_summary(timing_data)
            timing_data = []

            if reporter:
                current_time = time()
                ms = int(((current_time - last_time) / reporting_page_count) * 1000)

                data = json.dumps({'compress_pages': {'count': reporting_page_count,
                                                 'time-per': ms},
                                   'page_time_breakdown': timing_sum})
                subprocess.check_output(reporter, input=data.encode('utf-8'))

                # Reset chunk timer
                last_time = time()
                # Reset chunk counter
                reporting_page_count = 0

    # Report whatever is left of the last, incomplete chunk
    if reporter and reporting_page_count != 0:
        current_time = time()
        ms = int(((current_time - last_time) / reporting_page_count) * 1000)

        timing_sum = get_timing_summary(timing_data)

        data = json.dumps({'compress_pages': {'count': reporting_page_count,
                                         'time-per': ms},
                           'page_time_breakdown': timing_sum})
        subprocess.check_output(reporter, input=data.encode('utf-8'))

    if verbose:
        summary = get_timing_summary(timing_data)
        print('MRC time breakdown:', summary)


def insert_images(from_pdf, to_pdf, mode, report_every=None, stop_after=None):
    """
    Copy one image per page from from_pdf onto the matching page of to_pdf.

    Args:

    * from_pdf: source document the images are read from
    * to_pdf: target document; every page receives one image covering
              page.rect, placed below existing content (overlay=False)
    * mode: IMAGE_MODE_PASSTHROUGH inserts the extracted image stream
            unchanged, IMAGE_MODE_PIXMAP inserts a decoded pixmap. Any
            other value inserts nothing.
    * report_every (int): print progress whenever the page index is a
                          multiple of this value
    * stop_after (int): last page index to process (checked after the
                        page has been handled)
    """
    # TODO: This hasn't been updated, should fix this up, only MRC is tested
    # really.
    # TODO: implement img_dir here

    for idx, page in enumerate(to_pdf):
        # XXX: TODO: FIXME: MEGAHACK: For some reason the _imgonly PDFs
        # generated by us have all images on all pages according to pymupdf, so
        # hack around that for now.
        # get_page_images is the snake_case name of the deprecated
        # getPageImageList, in line with the other PyMuPDF calls in this file.
        img = sorted(from_pdf.get_page_images(idx))[idx]
        #img = from_pdf.get_page_images(idx)[0]

        xref = img[0]
        if mode == IMAGE_MODE_PASSTHROUGH:
            image = from_pdf.extract_image(xref)
            page.insert_image(page.rect, stream=image["image"], overlay=False)
        elif mode == IMAGE_MODE_PIXMAP:
            pixmap = fitz.Pixmap(from_pdf, xref)
            page.insert_image(page.rect, pixmap=pixmap, overlay=False)

        if stop_after is not None and idx >= stop_after:
            break

        if report_every is not None and idx % report_every == 0:
            print('Processed %d PDF pages.' % (idx + 1))
            sys.stdout.flush()


# TODO: Document these options (like in bin/recode_pdf)
def recode(from_pdf=None, from_imagestack=None, dpi=None, hocr_file=None,
        scandata_file=None, out_pdf=None, out_dir=None,
        reporter=None,
        grayscale_pdf=False,
        force_1bit_output=False,
        image_mode=IMAGE_MODE_MRC, jbig2=False, verbose=False, debug=False,
        tmp_dir=None,
        report_every=None, stop_after=None,
        jpeg2000_implementation=JPEG2000_IMPL_PILLOW,
        bg_compression_flags=None, fg_compression_flags=None,
        mrc_image_format=None,
        downsample=None,
        bg_downsample=None,
        fg_downsample=None,
        denoise_mask=None,
        hq_pages=None,
        hq_bg_compression_flags=None, hq_fg_compression_flags=None,
        threads=None,
        render_text_lines=False,
        metadata_url=None, metadata_title=None, metadata_author=None,
        metadata_creator=None, metadata_language=None,
        metadata_subject=None, metadata_creatortool=None,
        ignore_invalid_pagenumbers=False):
    """
    Build a compressed PDF with a text layer from images (or a PDF) and hOCR.

    The steps are: create a text-only PDF from the hOCR, insert the
    (compressed) page images, add PDF/A data, page labels, a table of
    contents and metadata, then save the result to out_pdf.

    Args:

    * from_pdf (str): path of a source PDF that provides the page images
    * from_imagestack (str): glob pattern matching the page images; the
      matches are used in sorted order
    * dpi: document DPI; a document DPI found in scandata_file overrides it
    * hocr_file: hOCR input describing the text of every page
    * scandata_file: scandata XML providing skipped pages, per-page DPI,
      document DPI, page labels and table of contents
    * out_pdf (str): path of the PDF to write
    * out_dir (str): when set, the encoded MRC images are copied there
    * reporter (str): space separated command that receives timing JSON
      on stdin
    * image_mode: 2 inserts MRC compressed images, 0 and 1 copy images
      from from_pdf, 3 inserts no images.
      NOTE(review): presumably these match the IMAGE_MODE_* constants --
      confirm.
    * hq_pages (str): comma separated 1-based page numbers to encode with
      the hq_* compression flags; negative numbers count from the end and
      out-of-range numbers are ignored
    * stop_after (int): limit the number of pages that are processed
    * metadata_language: sequence of languages; the first entry is used
      as the document language
    * metadata_url, metadata_title, metadata_author, metadata_creator,
      metadata_subject, metadata_creatortool: optional values written to
      the PDF metadata
    * ignore_invalid_pagenumbers (bool): passed on to write_page_labels
    * verbose (bool): print progress information
    * all remaining arguments are passed on unchanged to
      create_tess_textonly_pdf and insert_images_mrc

    Returns a dict with 'errors' (set of runtime warning codes) and
    'compression_ratio' (input size divided by output size).

    Raises ValueError when neither from_pdf nor from_imagestack is given.
    """
    # TODO: document that the scandata document dpi will override the dpi arg
    # TODO: Take hq-pages and reporter arg and change format (as lib call we
    # don't want to pass that as one string, I guess?)

    errors = set()

    # Fail early with a clear message instead of deep inside the conversion
    if not from_pdf and not from_imagestack:
        raise ValueError('Either from_pdf or from_imagestack must be provided')

    # NOTE(review): the page generators stop *before* handling the page
    # whose index equals this value, so with the decrement they appear to
    # process stop_after - 1 pages. Left unchanged -- confirm intent.
    stop = stop_after
    if stop is not None:
        stop -= 1

    reporter = reporter.split(' ') if reporter else None # TODO: overriding

    in_pdf = None
    outdoc = None
    tess_tmp_path = None

    try:
        if from_pdf:
            in_pdf = fitz.open(from_pdf)

        image_files = None
        if from_imagestack:
            image_files = sorted(glob(from_imagestack))

        if verbose:
            # The CPU feature table lives in a private numpy module that
            # was renamed in NumPy 2; printing it is purely informational.
            try:
                from numpy._core._multiarray_umath import __cpu_features__ as cpu_have
            except ImportError:
                try:
                    from numpy.core._multiarray_umath import __cpu_features__ as cpu_have
                except ImportError:
                    cpu_have = {}
            for k, v in cpu_have.items():
                if v:
                    print('\t', k)

        start_time = time()

        scandata_doc_dpi = None

        # Figure out if we have scandata, and figure out if we want to skip
        # pages based on scandata.
        skip_pages = []
        dpi_pages = None
        if scandata_file is not None:
            skip_pages = scandata_xml_get_skip_pages(scandata_file)
            dpi_pages = scandata_xml_get_dpi_per_page(scandata_file)
            scandata_doc_dpi = scandata_xml_get_document_dpi(scandata_file)

            if scandata_doc_dpi is not None:
                # Let's prefer the DPI in the scandata file over the provided DPI
                dpi = scandata_doc_dpi

        # XXX: Maybe use a buffer, since the file is typically quite small
        fd, tess_tmp_path = mkstemp(prefix='pdfrenderer', suffix='.pdf', dir=tmp_dir)
        os.close(fd)

        if verbose:
            print('Creating text only PDF')

        # 1. Create text-only PDF from hOCR first, but honour page sizes of in_pdf
        create_tess_textonly_pdf(hocr_file, tess_tmp_path, in_pdf=in_pdf,
                image_files=image_files, dpi=dpi,
                skip_pages=skip_pages, dpi_pages=dpi_pages,
                reporter=reporter,
                verbose=verbose, debug=debug, stop_after=stop,
                render_text_lines=render_text_lines,
                tmp_dir=tmp_dir,
                jpeg2000_implementation=jpeg2000_implementation,
                errors=errors)

        if verbose:
            print('Inserting (and compressing) images')
        # 2. Load tesseract PDF and stick images in the PDF
        # We open the generated file but do not modify it in place
        outdoc = fitz.open(tess_tmp_path)

        page_total = outdoc.page_count
        hq_page_flags = [False] * page_total
        if hq_pages is not None:
            for i in map(int, hq_pages.split(',')):
                # We want 0-indexed, not 1-indexed, but not negative numbers
                # we want to remain 1-indexed.
                if i > 0:
                    i = i - 1

                if not -page_total <= i < page_total:
                    # Page out of range, silently ignore for automation
                    # purposes. We don't want scripts that call our tool to
                    # worry about how many pages a PDF has exactly. E.g. if
                    # 1,2,3,4,-4,-3,-2,-1 is passed, and a PDF has only
                    # three pages, let's just set them all to HQ and not
                    # complain about 4 and -4 being out of range.
                    continue

                # Mark page as HQ
                hq_page_flags[i] = True

        if verbose:
            print('Converting with image mode:', image_mode)
        if image_mode == 2:
            insert_images_mrc(outdoc, hocr_file,
                              from_pdf=in_pdf,
                              image_files=image_files,
                              dpi=dpi,
                              dpi_pages=dpi_pages,
                              bg_compression_flags=bg_compression_flags,
                              fg_compression_flags=fg_compression_flags,
                              skip_pages=skip_pages,
                              img_dir=out_dir,
                              jbig2=jbig2,
                              downsample=downsample,
                              bg_downsample=bg_downsample,
                              fg_downsample=fg_downsample,
                              denoise_mask=denoise_mask,
                              reporter=reporter,
                              hq_pages=hq_page_flags,
                              hq_bg_compression_flags=hq_bg_compression_flags,
                              hq_fg_compression_flags=hq_fg_compression_flags,
                              verbose=verbose,
                              debug=debug,
                              tmp_dir=tmp_dir,
                              report_every=report_every,
                              stop_after=stop,
                              grayscale_pdf=grayscale_pdf,
                              force_1bit_output=force_1bit_output,
                              jpeg2000_implementation=jpeg2000_implementation,
                              mrc_image_format=mrc_image_format,
                              threads=threads,
                              errors=errors)
        elif image_mode in (0, 1):
            # TODO: Update this codepath
            insert_images(in_pdf, outdoc, mode=image_mode,
                    report_every=report_every, stop_after=stop)
        elif image_mode == 3:
            # 3 = skip
            pass

        # 3. Add PDF/A compliant data
        write_pdfa(outdoc)

        if scandata_file is not None:
            # XXX: we parse scandata twice now, let's not do that
            # 3b. Write page labels from scandata file, if present
            write_page_labels(outdoc, scandata_file, errors=errors,
                              ignore_invalid=ignore_invalid_pagenumbers)

            write_pdf_toc(outdoc, scandata_file)

        lang_if_any = metadata_language[0] if metadata_language else None
        write_basic_ua(outdoc, language=lang_if_any)

        # 4. Write metadata
        extra_metadata = {}
        if metadata_url:
            extra_metadata['url'] = metadata_url
        if metadata_title:
            extra_metadata['title'] = metadata_title
        if metadata_creator:
            extra_metadata['creator'] = metadata_creator
        if metadata_author:
            extra_metadata['author'] = metadata_author
        if metadata_language:
            extra_metadata['language'] = metadata_language
        if metadata_subject:
            extra_metadata['subject'] = metadata_subject
        if metadata_creatortool:
            extra_metadata['creatortool'] = metadata_creatortool
        write_metadata(in_pdf, outdoc, extra_metadata=extra_metadata)

        # 5. Save
        mupdf_warnings = fitz.TOOLS.mupdf_warnings()
        if mupdf_warnings:
            print('mupdf warnings:', repr(mupdf_warnings))
        if verbose:
            print('Saving PDF now')

        t = time()
        outdoc.save(out_pdf, deflate=True, pretty=True)
        save_time_ms = int((time() - t)*1000)
        if reporter:
            data = json.dumps({'time_to_save': {'time': save_time_ms}})
            subprocess.check_output(reporter, input=data.encode('utf-8'))

        end_time = time()
        page_count = len(outdoc)
        print('Processed %d pages at %.2f seconds/page' % (page_count,
            (end_time - start_time) / max(page_count, 1)))

        if in_pdf is not None:
            oldsize = os.path.getsize(from_pdf)
        else:
            # Only count the input files that were actually converted: use
            # the same skip/stop rules as the page generators above.
            oldsize = 0
            skipped_pages = 0
            for idx, fname in enumerate(image_files):
                if skip_pages is not None and idx in skip_pages:
                    skipped_pages += 1
                    continue

                if stop is not None and (idx - skipped_pages) >= stop:
                    break

                oldsize += os.path.getsize(fname)

        newsize = os.path.getsize(out_pdf)
        compression_ratio  = oldsize / newsize
        if verbose:
            print('Compression ratio: %f' % (compression_ratio))

        return {'errors': errors,
                'compression_ratio': compression_ratio}
    finally:
        # 6. Release documents and remove leftover files, also on failure
        if outdoc is not None:
            outdoc.close()
        if in_pdf is not None:
            in_pdf.close()
        if tess_tmp_path is not None:
            remove(tess_tmp_path)