Source code for internetarchivepdf.mrc

# archive-pdf-tools
# Copyright (C) 2020-2021, Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Author: Merlijn Boris Wolf Wajer <merlijn@archive.org>

import sys
from os import close, remove

from glob import glob
from tempfile import mkstemp
import subprocess
from time import time

import warnings

from PIL import Image, ImageOps
from skimage.filters import threshold_local, threshold_otsu
from skimage.restoration import denoise_tv_bregman, estimate_sigma

from scipy import ndimage
import numpy as np

from optimiser import optimise_gray, optimise_rgb, optimise_gray2, optimise_rgb2, fast_mask_denoise
from sauvola import binarise_sauvola

import fitz

fitz.TOOLS.set_icc(True) # For good measure, not required

from internetarchivepdf.jpeg2000 import encode_jpeg2000
from internetarchivepdf.const import (RECODE_RUNTIME_WARNING_TOO_SMALL_TO_DOWNSAMPLE, COMPRESSOR_JPEG,
        COMPRESSOR_JPEG2000, DENOISE_NONE, DENOISE_FAST, DENOISE_BREGMAN)


"""
"""

# skimage throws useless UserWarnings in various functions
def mean_estimate_sigma(arr):
    """
    Return the mean of ``estimate_sigma(arr)``.

    Every warning raised while estimating is silenced, because skimage emits
    UserWarnings here that are of no use to us.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sigma = estimate_sigma(arr)
        return np.mean(sigma)



[docs]
def threshold_image(img, dpi, k=0.34):
    """
    Binarise the given image with the Sauvola method

    Args:

    * img (np.ndarray): two-dimensional input image array
    * dpi (int): dpi used to derive the Sauvola window size; when None a
      fixed window size of 51 is used instead
    * k (float): Sauvola k parameter, defaults to 0.34

    Returns the binarised numpy.ndarray (boolean, same shape as img)
    """
    if dpi is None:
        window_size = 51
    else:
        # A quarter of the dpi, bumped up by one when that comes out even
        window_size = int(dpi / 4)
        if window_size % 2 == 0:
            window_size += 1

    height, width = img.shape
    pixel_count = width * height

    flat_in = np.reshape(img, pixel_count)
    # Flat output buffer handed to binarise_sauvola (presumably filled in
    # place by it, since the result is read back from this buffer)
    flat_out = np.reshape(np.ndarray(img.shape, dtype=bool), pixel_count)

    binarise_sauvola(flat_in, flat_out, width, height, window_size,
                     window_size, k, 128)

    # TODO: optimise this, we can do it in binarise_sauvola
    return np.invert(np.reshape(flat_out, (height, width)))




[docs]
def denoise_bregman(binary_img):
    """
    Clean up a binary numpy array with Bregman total variation denoising

    Args:

    * binary_img (np.array): input array

    Returns the denoised array, as booleans
    """
    as_float = np.array(binary_img, dtype=np.float32)
    # weight=0.25 has been tried here as well
    smoothed = denoise_tv_bregman(as_float, weight=1.)

    # XXX: 0.4? (0.6 has been tried as well)
    return np.array(smoothed > 0.4, dtype=bool)


# TODO: Rename, can be either foreground or background

[docs]
def partial_blur(mask, img, sigma=5, mode=None):
    """
    Blur a part of the image 'img', where mask = 0.
    The actual values used by the blur are colours where mask = '1', effectively
    'erasing/blurring' parts of an image where mask = 0 with colours where mask = 1.

    At the end, restore all pixels from img where mask = 1.

    Args:

    * mask (np.ndarray): two-dimensional mask; boolean, or integer 0/1
    * img (np.ndarray): image array; (height, width, channels) when mode is
      'RGB' or 'RGBA', (height, width) otherwise
    * sigma: sigma passed to the gaussian filter, defaults to 5
    * mode (str): 'RGB' or 'RGBA' to blur the first three channels
      separately; any other value treats img as a single channel image

    Returns a new array; img itself is not modified. In the single channel
    case the result is of dtype uint8, otherwise it has the dtype of img.
    """
    maskf = np.array(mask, dtype=np.float32)
    # Boolean form of the mask for restoring the original pixels at the end.
    # Indexing with an integer 0/1 mask would select rows 0 and 1 instead of
    # the masked pixels.
    keep = np.asarray(mask, dtype=bool)

    # The blurred mask tells how much mask = 1 colour reached each pixel; we
    # divide by it to turn the blurred sum into a weighted mean. The small
    # constant avoids dividing by zero far away from any mask = 1 pixel.
    weights = ndimage.gaussian_filter(maskf, sigma=sigma)
    norm = weights + 0.00001

    if mode == 'RGB' or mode == 'RGBA':
        # Any fourth (alpha) channel is carried over untouched by the copy
        newimg = np.copy(img)
        for channel in range(3):
            blurred = ndimage.gaussian_filter(img[:, :, channel] * maskf,
                                              sigma=sigma)
            blurred /= norm
            newimg[:, :, channel] = blurred
    else:
        imgf = np.array(img, dtype=np.float32)
        blurred = ndimage.gaussian_filter(imgf * maskf, sigma=sigma)
        blurred /= norm
        newimg = np.array(blurred, dtype=np.uint8)

    newimg[keep] = img[keep]

    return newimg



def partial_boxblur(mask, fg, size=5, mode=None):
    """
    Box blur variant of partial_blur: fill the parts of 'fg' where mask = 0
    with a uniform (box) filtered mean of the pixels where mask = 1, and keep
    all pixels where mask = 1 as they are.

    Args:

    * mask (np.ndarray): two-dimensional mask; boolean, or integer 0/1
    * fg (np.ndarray): image array; (height, width, channels) when mode is
      'RGB' or 'RGBA', (height, width) otherwise
    * size: size passed to the uniform filter, defaults to 5
    * mode (str): 'RGB' or 'RGBA' to blur the first three channels
      separately; any other value treats fg as a single channel image

    Returns a new array; fg itself is not modified. In the single channel
    case the result is of dtype uint8, otherwise it has the dtype of fg.
    """
    maskf = np.array(mask, dtype=np.float32)
    # Boolean form of the mask for restoring the original pixels at the end.
    # Indexing with an integer 0/1 mask would select rows 0 and 1 instead of
    # the masked pixels.
    keep = np.asarray(mask, dtype=bool)

    # Fraction of mask = 1 pixels inside each box; dividing by it turns the
    # filtered sum into a mean over mask = 1 pixels only. The small constant
    # avoids dividing by zero where a box holds no mask = 1 pixel at all.
    weights = ndimage.uniform_filter(maskf, size=size)
    norm = weights + 0.00001

    if mode == 'RGB' or mode == 'RGBA':
        # Any fourth (alpha) channel is carried over untouched by the copy
        newfg = np.copy(fg)
        for channel in range(3):
            blurred = ndimage.uniform_filter(fg[:, :, channel] * maskf,
                                             size=size)
            blurred /= norm
            newfg[:, :, channel] = blurred
    else:
        fgf = np.array(fg, dtype=np.float32)
        blurred = ndimage.uniform_filter(fgf * maskf, size=size)
        blurred /= norm
        newfg = np.array(blurred, dtype=np.uint8)

    newfg[keep] = fg[keep]

    return newfg


def create_hocr_mask(img, mask_arr, hocr_word_data, downsample=None, dpi=None, timing_data=None):
    """
    Threshold every usable hOCR text line of the image and write the result
    into mask_arr (modified in place).

    Lines are skipped when their text is empty, when the mean word confidence
    is below 20, or when the bounding box is empty, inverted or outside of the
    image (the latter two are reported on stderr). For the remaining lines both
    the line and its greyscale inverse are thresholded, and a heuristic based
    on the share of set pixels (and, if needed, the estimated noise) decides
    which of the two, if any, ends up in the mask.

    Args:

    * img: image object with a `size` of (width, height) that converts to a
      two-dimensional array with np.array (the caller in this file passes a
      greyscale PIL.Image)
    * mask_arr (np.ndarray): mask to write to, indexed as [row, column]
    * hocr_word_data: iterable of paragraphs; each has 'lines', each line has
      a 'bbox' (left, top, right, bottom) and 'words', each word has 'text'
      and 'confidence'
    * downsample: if not None, bounding box coordinates are divided by this
    * dpi (int): passed on to threshold_image
    * timing_data: Optional list to append a ('hocr_mask_gen', seconds) tuple
      to

    Returns nothing.
    """
    image_width, image_height = img.size
    np_img = np.array(img)

    # XXX: If you tweak k, you must tweak the various ratio and sigma's
    # based on the test images
    k = 0.1

    t = time()

    for paragraph in hocr_word_data:
        for line in paragraph['lines']:
            coords = line['bbox']
            words = line['words']

            line_text = ' '.join(word['text'] for word in words)
            line_confs = [word['confidence'] for word in words]
            line_conf = sum(line_confs) / len(line_confs) if line_confs else 0

            if line_text.strip() == '' or line_conf < 20:
                continue

            if downsample is not None:
                coords = [int(x/downsample) for x in coords]
            else:
                coords = [int(x) for x in coords]

            left, top, right, bottom = coords
            # This can happen if we downsample and round to int
            if left == right or top == bottom:
                continue

            if (left >= right) or (top >= bottom):
                print('Invalid bounding box: (%d, %d, %d, %d)' % (left, top, right, bottom), file=sys.stderr)
                continue

            if (left < 0) or (right > image_width) or (top < 0) or (bottom > image_height):
                print('Invalid bounding box outside image: (%d, %d, %d, %d)' % (left, top, right, bottom), file=sys.stderr)
                continue

            np_lineimg = np_img[top:bottom, left:right]
            # Simple grayscale invert (the subtraction already yields a new
            # array, the image itself is left alone)
            np_lineimg_invert = 255 - np_lineimg

            # Share of set pixels in the thresholded line ...
            thres = threshold_image(np_lineimg, dpi, k)
            ratio = np.count_nonzero(thres) / np_lineimg.size

            # ... and in the thresholded inverted line
            thres_invert = threshold_image(np_lineimg_invert, dpi, k)
            inv_ratio = np.count_nonzero(thres_invert) / np_lineimg.size

            # Neither version looks sparse enough: leave the mask alone
            if not (ratio < 0.3 or inv_ratio < 0.3):
                continue

            th = None

            if inv_ratio > 0.2 and ratio < 0.2:
                th = thres
            else:
                # mean_estimate_sigma is expensive, so let's only do it if
                # we need to
                ratio_sigma = mean_estimate_sigma(thres)
                inv_ratio_sigma = mean_estimate_sigma(thres_invert)

                # Prefer ratio over inv_ratio by a bit
                if inv_ratio < 0.3 and inv_ratio < ratio and \
                (inv_ratio_sigma < ratio_sigma or \
                (ratio_sigma < 0.1 and inv_ratio_sigma < 0.1)):
                    th = thres_invert
                elif ratio < 0.2:
                    th = thres

            if th is not None:
                mask_arr[top:bottom, left:right] = th

    if timing_data is not None:
        timing_data.append(('hocr_mask_gen', time() - t))


def estimate_noise(imgf):
    """
    Estimate the noise (sigma) of a two-dimensional image array by running
    mean_estimate_sigma over the central part of the image only.
    """
    # We do this only on a part of the image, because it's accurate enough wrt
    # noise estimation (definitely for camera noise estimation since that's
    # everywhere in the image, and it's quite a bit faster this way).
    MUL = 4
    height, width = imgf.shape

    row_start, row_end = int(height/2 - height/MUL), int(height/2 + height/MUL)
    col_start, col_end = int(width/2 - width/MUL), int(width/2 + width/MUL)

    # Really small image? Then just look at all of it.
    if row_end == 0 or col_end == 0:
        row_start, row_end = 0, height
        col_start, col_end = 0, width

    return mean_estimate_sigma(imgf[row_start:row_end, col_start:col_end])



def create_threshold_mask(mask_arr, imgf, dpi=None, denoise_mask=None, timing_data=None):
    """
    Threshold the whole image and OR the result into mask_arr (modified in
    place).

    If the estimated noise sigma is above 1.0, the image is first blurred
    with a gaussian filter of a tenth of that sigma.

    Args:

    * mask_arr (np.ndarray): boolean mask to merge the threshold into
    * imgf (np.ndarray): two-dimensional image array; it is cast to uint8
      before thresholding
    * dpi (int): passed on to threshold_image
    * denoise_mask: accepted, but not used by this function
    * timing_data: Optional list to append (name, seconds) tuples to; the
      names used are 'est_1', 'blur_1' (only when blurring) and 'threshold'

    Returns nothing.
    """
    # We don't apply any of these blurs to the hOCR mask, we want that as
    # sharp as possible.

    t = time()
    sigma_est = estimate_noise(imgf)
    if timing_data is not None:
        timing_data.append(('est_1', time() - t))

    if sigma_est > 1.0:
        t = time()
        # ndimage.gaussian_filter is the supported spelling; the
        # ndimage.filters namespace is deprecated in SciPy.
        imgf = ndimage.gaussian_filter(imgf, sigma=sigma_est*0.1)
        if timing_data is not None:
            timing_data.append(('blur_1', time() - t))

        # NOTE: a second, stronger blur (sigma_est*0.5) for images that are
        # still noisy after the first one has been experimented with here,
        # but is not enabled.

    t = time()
    thres_arr = threshold_image(imgf.astype(np.uint8), dpi)
    if timing_data is not None:
        timing_data.append(('threshold', time() - t))

    mask_arr |= thres_arr


# TODO: Reduce amount of memory active at one given point (keep less images in
# memory, write to disk sooner, etc), careful with numpy <-> PIL conversions

[docs]
def create_mrc_hocr_components(image, hocr_word_data,
                               dpi=None,
                               downsample=None,
                               bg_downsample=None,
                               fg_downsample=None,
                               denoise_mask=None, timing_data=None,
                               errors=None):
    """
    Create the MRC components: mask, foreground and background

    This is a generator: the components are produced one at a time, so the
    caller can process each one before the next is computed.

    Args:

    * image (PIL.Image): Image to be decomposed
    * hocr_word_data: OCR data about found text on the page
    * dpi (int): dpi of the image, passed on to the thresholding functions
    * downsample (int): factor by which the OCR data is to be downsampled
    * bg_downsample (int): if not None, factor by which the background image
      is downscaled
    * fg_downsample (int): if not None, factor by which the foreground image
      is downscaled
    * denoise_mask: How to denoise the mask; compared against DENOISE_NONE,
      DENOISE_FAST and DENOISE_BREGMAN. Any other value raises a ValueError.
      NOTE(review): whether the default of None is accepted depends on the
      value of DENOISE_NONE, which is defined outside of this file - confirm.
    * timing_data: Optional timing data to log individual timing data to.
    * errors: Optional argument (of type set) with encountered runtime errors;
      RECODE_RUNTIME_WARNING_TOO_SMALL_TO_DOWNSAMPLE is added when a
      component is too small to be downscaled by the requested factor

    Yields the components as numpy arrays, in this order: mask, foreground,
    background
    """
    # The mask is computed on a greyscale version of the image
    grayimg = image
    if image.mode != 'L':
        t = time()
        grayimg = image.convert('L')
        if timing_data is not None:
            timing_data.append(('grey_conversion', time() - t))

    width_, height_ = image.size

    # Start out with a mask array derived from a fresh 1-bit image of the
    # same size
    mask_arr = np.array(Image.new('1', image.size))

    # Modifies mask_arr in place
    create_hocr_mask(grayimg, mask_arr, hocr_word_data, downsample=downsample,
                     dpi=dpi, timing_data=timing_data)
    grayimgf = np.array(grayimg, dtype=np.float32)

    # Always on: merge a threshold of the whole page into the hOCR based mask
    MIX_THRESHOLD = True
    if MIX_THRESHOLD:
        # XXX: this nukes the hocr threshold, testing only
        # mask_arr = np.zeros(mask_arr.shape, dtype=bool)

        # Modifies mask_arr in place
        create_threshold_mask(mask_arr, grayimgf, dpi=dpi,
                              denoise_mask=denoise_mask,
                              timing_data=timing_data)

    if denoise_mask != DENOISE_NONE:
        t = time()
        if denoise_mask == DENOISE_FAST:
            # XXX: We could make the mincnt parameter take the dpi into account
            fast_mask_denoise(mask_arr, width_, height_, 4, 2)
            if timing_data is not None:
                timing_data.append(('fast_denoise', time() - t))
        elif denoise_mask == DENOISE_BREGMAN:
            # Unlike the fast denoise, this returns a new array
            mask_arr = denoise_bregman(mask_arr)
            if timing_data is not None:
                timing_data.append(('denoise', time() - t))
        else:
            raise ValueError('Invalid denoise option:', denoise_mask)


    # First component: the mask
    yield mask_arr

    if image.mode not in ('L', 'RGB'):
        # Special modes like mapped ('P') or other modes we just map to RGB for
        # simplicity sake
        image = image.convert('RGB')

    image_arr = np.array(image)

    t = time()
    # Take foreground pixels and optimise the image by making the surrounding
    # pixels like the foreground, allowing for more optimal compression (and
    # higher quality foreground pixels as a result)
    if image.mode == 'L':
        foreground_arr = optimise_gray2(mask_arr, image_arr, width_, height_, 3)
    else:
        foreground_arr = optimise_rgb2(mask_arr, image_arr, width_, height_, 3)
    if timing_data is not None:
        # The name fg_partial_blur is kept for backwards compatibility
        timing_data.append(('fg_partial_blur', time() - t))

    if fg_downsample is not None:
        t = time()
        image2 = Image.fromarray(foreground_arr)
        w, h = image2.size
        w_downsample = int(w / fg_downsample)
        h_downsample = int(h / fg_downsample)
        if w_downsample > 0 and h_downsample > 0:
            # thumbnail() changes image2 itself, its return value is not used
            image2.thumbnail((w_downsample, h_downsample))
            foreground_arr = np.array(image2)
        else:
            # Too small to downsample: keep the full size foreground and
            # record the warning, if the caller wants to know
            if errors is not None:
                errors.add(RECODE_RUNTIME_WARNING_TOO_SMALL_TO_DOWNSAMPLE)

        if timing_data is not None:
            timing_data.append(('fg_downsample', time() - t))

    # Second component: the foreground
    yield foreground_arr
    # Drop our reference (presumably so the array can be freed as soon as the
    # consumer is done with it, see the memory TODO above this function)
    foreground_arr = None

    # The background is optimised with the inverse of the mask
    mask_inv = mask_arr ^ np.ones(mask_arr.shape, dtype=bool)

    t = time()
    # Take background pixels and optimise the image by placing them where the
    # foreground pixels are thought to be, this has the effect of reducing
    # compression artifacts (thus improving quality) and at the same time making
    # the image easier to compress (smaller file size)
    if image.mode == 'L':
        background_arr = optimise_gray2(mask_inv, image_arr, width_, height_, 10)
    else:
        background_arr = optimise_rgb2(mask_inv, image_arr, width_, height_, 10)
    if timing_data is not None:
        # The name bg_partial_blur is kept for backwards compatibility
        timing_data.append(('bg_partial_blur', time() - t))

    if bg_downsample is not None:
        t = time()
        image2 = Image.fromarray(background_arr)
        w, h = image2.size
        w_downsample = int(w / bg_downsample)
        h_downsample = int(h / bg_downsample)
        if w_downsample > 0 and h_downsample > 0:
            # thumbnail() changes image2 itself, its return value is not used
            image2.thumbnail((w_downsample, h_downsample))
            background_arr = np.array(image2)
        else:
            # Too small to downsample: keep the full size background and
            # record the warning, if the caller wants to know
            if errors is not None:
                errors.add(RECODE_RUNTIME_WARNING_TOO_SMALL_TO_DOWNSAMPLE)

        if timing_data is not None:
            timing_data.append(('bg_downsample', time() - t))

    # Third and last component: the background
    yield background_arr
    return




[docs]
def encode_mrc_mask(np_mask, tmp_dir=None, jbig2=True, embedded_jbig2=False,
                    timing_data=None, debug=False):
    """
    Encode mask image either to JBIG2 or PNG.

    The mask is always written as a PNG first; when jbig2 is set, the external
    'jbig2' tool is then run on that PNG and its output stored in a second
    temporary file. Neither file is removed by this function.

    Args:

    * np_mask (numpy.array): Mask image array
    * tmp_dir (str): path the temporary directory to write images to
    * jbig2 (bool): Whether to encode to JBIG2 or PNG
    * embedded_jbig2 (bool): Whether to encode to JBIG2 with or without header
      (passes '-p' to the jbig2 tool)
    * timing_data (optional): Add time information to timing_data structure
    * debug (bool, optional): Write the jbig2 command line to stderr

    Returns a tuple: (str, str) where the first entry is the jbig2
    path, if any (None otherwise), the second is the png path.

    Raises subprocess.CalledProcessError if the jbig2 tool fails.
    """
    t = time()
    mask = Image.fromarray(np_mask)

    fd, mask_img_png = mkstemp(prefix='mask', suffix='.png', dir=tmp_dir)
    close(fd)

    mask_img_jbig2 = None
    if jbig2:
        fd, mask_img_jbig2 = mkstemp(prefix='mask', suffix='.jbig2', dir=tmp_dir)
        close(fd)

    # No point in spending time on PNG compression for a temporary file
    mask.save(mask_img_png, compress_level=0)

    if jbig2:
        if embedded_jbig2:
            args = ['jbig2', '-p', mask_img_png]
        else:
            args = ['jbig2', mask_img_png]

        if debug:
            print('check_output: %s' % args, file=sys.stderr)

        out = subprocess.check_output(args)
        # The context manager closes the file even if the write fails
        with open(mask_img_jbig2, 'wb') as fp:
            fp.write(out)

    if timing_data is not None:
        # This name is used for the PNG-only case as well
        timing_data.append(('mask_jbig2', time()-t))

    return mask_img_jbig2, mask_img_png




[docs]
def encode_mrc_img(np_img, img_compression_flags, imgtype=None, tmp_dir=None,
        jpeg2000_implementation=None, mrc_image_format=None, timing_data=None,
        threads=False,
        debug=False):
    """
    Encode image as JPEG2000 or JPEG, with the provided compression settings
    and JPEG2000/JPEG encoder.

    Args:

    * np_img (numpy.array): Image array
    * img_compression_flags (list of str): Compression flags; for JPEG they
      are inserted into the jpegoptim command line, for JPEG2000 they are
      handed to encode_jpeg2000
    * imgtype (str): 'bg' or 'fg'
    * tmp_dir (str): path the temporary directory to write images to
    * jpeg2000_implementation (str): What JPEG2000 implementation to use
    * mrc_image_format (str): What image format to produce; COMPRESSOR_JPEG
      selects JPEG, anything else JPEG2000
    * timing_data (optional): Add time information to timing_data structure
    * threads: passed on to encode_jpeg2000
    * debug (bool, optional): Write debug info to stderr

    Returns the filepath to the encoded image. Note that the file has a
    '.jp2' suffix even when it contains a JPEG.

    Raises ValueError if imgtype is not 'bg' or 'fg', and
    subprocess.CalledProcessError if jpegoptim fails.
    """
    t = time()
    if imgtype not in ('bg', 'fg'):
        raise ValueError('imgtype should be \'bg\' or \'fg\'')

    use_jpeg = mrc_image_format == COMPRESSOR_JPEG

    # Intermediate, unoptimised JPEG that jpegoptim reads from
    img_jpeg = None
    if use_jpeg:
        fd, img_jpeg = mkstemp(prefix=imgtype, suffix='.jpg', dir=tmp_dir)
        close(fd)

    fd, img_jp2 = mkstemp(prefix=imgtype, suffix='.jp2', dir=tmp_dir)
    close(fd)
    remove(img_jp2) # XXX: Kakadu doesn't want the file to exist, so what are
                       # we even doing

    img = Image.fromarray(np_img)

    if use_jpeg:
        try:
            img.save(img_jpeg, quality=100)

            args = ['jpegoptim'] + img_compression_flags + [img_jpeg, '--stdout']
            if debug:
                print('check_output: %s' % args, file=sys.stderr)
            output = subprocess.check_output(args)
        finally:
            # Nobody but us knows about the intermediate file, so it has to
            # be removed here or it is left behind forever
            remove(img_jpeg)

        # XXX: FIXME: this defeats the point of a tmpfile
        with open(img_jp2, 'wb') as out_file:
            out_file.write(output)
    else:
        encode_jpeg2000(img, img_jp2, jpeg2000_implementation,
                        img_compression_flags, imgtype=imgtype,
                        threads=threads, debug=debug)

    if timing_data is not None:
        timing_data.append(('%s_jp2' % imgtype, time()-t))

    return img_jp2




[docs]
def encode_mrc_background(np_bg, bg_compression_flags, tmp_dir=None,
        jpeg2000_implementation=None, mrc_image_format=None, timing_data=None,
        threads=None, debug=False):
    """
    Encode the background image by handing it to encode_mrc_img with the
    image type set to 'bg'.

    Args:

    * np_bg (numpy.array): Background image array
    * bg_compression_flags: Compression flags for the background
    * tmp_dir (str): path the temporary directory to write images to
    * jpeg2000_implementation (str): What JPEG2000 implementation to use
    * mrc_image_format (str): What image format to produce
    * timing_data (optional): Add time information to timing_data structure
    * threads: passed on to encode_mrc_img
    * debug (bool, optional): passed on to encode_mrc_img

    Returns the filepath to the encoded background image
    """
    encoder_options = dict(
        tmp_dir=tmp_dir,
        jpeg2000_implementation=jpeg2000_implementation,
        mrc_image_format=mrc_image_format,
        timing_data=timing_data,
        threads=threads,
        debug=debug,
    )
    return encode_mrc_img(np_bg, bg_compression_flags, 'bg', **encoder_options)




[docs]
def encode_mrc_foreground(np_fg, fg_compression_flags, tmp_dir=None,
        jpeg2000_implementation=None, mrc_image_format=None, timing_data=None,
        threads=False, debug=False):
    """
    Encode the foreground image by handing it to encode_mrc_img with the
    image type set to 'fg'.

    Args:

    * np_fg (numpy.array): Foreground image array
    * fg_compression_flags: Compression flags for the foreground
    * tmp_dir (str): path the temporary directory to write images to
    * jpeg2000_implementation (str): What JPEG2000 implementation to use
    * mrc_image_format (str): What image format to produce
    * timing_data (optional): Add time information to timing_data structure
    * threads: passed on to encode_mrc_img
    * debug (bool, optional): passed on to encode_mrc_img

    Returns the filepath to the encoded foreground image
    """
    encoder_options = dict(
        tmp_dir=tmp_dir,
        jpeg2000_implementation=jpeg2000_implementation,
        mrc_image_format=mrc_image_format,
        timing_data=timing_data,
        threads=threads,
        debug=debug,
    )
    return encode_mrc_img(np_fg, fg_compression_flags, 'fg', **encoder_options)



def encode_mrc_images(mrc_gen, bg_compression_flags=None, fg_compression_flags=None,
                      tmp_dir=None, jbig2=True, timing_data=None,
                      jpeg2000_implementation=None, mrc_image_format=None,
                      embedded_jbig2=False, threads=None, debug=False):
    """
    Encode all MRC components produced by a component generator.

    The generator is consumed one component at a time (mask, then foreground,
    then background), and each component is encoded before the next one is
    requested.

    Args:

    * mrc_gen: generator yielding the mask, foreground and background arrays,
      in that order
    * bg_compression_flags: Compression flags for the background
    * fg_compression_flags: Compression flags for the foreground
    * tmp_dir (str): path the temporary directory to write images to
    * jbig2 (bool): Whether to encode the mask to JBIG2 or PNG
    * timing_data (optional): Add time information to timing_data structure
    * jpeg2000_implementation (str): What JPEG2000 implementation to use
    * mrc_image_format (str): What image format to produce
    * embedded_jbig2 (bool): passed on to encode_mrc_mask
    * threads: passed on to the foreground and background encoders
    * debug (bool, optional): Write debug info to stderr

    Returns a tuple: (mask path, background path, (bg width, bg height),
    foreground path, (fg width, fg height)). The mask path is the JBIG2 file
    if jbig2 is set (the PNG is removed in that case), the PNG file otherwise.
    """
    # debug is forwarded here as well, so the jbig2 command line is logged
    # just like the other encoder invocations
    mask_img_jbig2, mask_img_png = encode_mrc_mask(next(mrc_gen),
            tmp_dir=tmp_dir, jbig2=jbig2, embedded_jbig2=embedded_jbig2,
            timing_data=timing_data, debug=debug)

    np_fg = next(mrc_gen)
    fg_img_jp2 = encode_mrc_foreground(np_fg, fg_compression_flags, tmp_dir=tmp_dir,
                                       jpeg2000_implementation=jpeg2000_implementation,
                                       mrc_image_format=mrc_image_format,
                                       timing_data=timing_data, threads=threads, debug=debug)
    fg_h, fg_w = np_fg.shape[0:2]
    # Drop our reference before the next component is generated
    np_fg = None

    np_bg = next(mrc_gen)
    bg_img_jp2 = encode_mrc_background(np_bg, bg_compression_flags, tmp_dir=tmp_dir,
                                       jpeg2000_implementation=jpeg2000_implementation,
                                       mrc_image_format=mrc_image_format,
                                       timing_data=timing_data, threads=threads, debug=debug)
    bg_h, bg_w = np_bg.shape[0:2]
    np_bg = None

    # XXX: probably don't need this
    # Ask for one more item so that the generator can run to its end
    next(mrc_gen, None)

    if jbig2:
        # The PNG was only needed as input for the jbig2 tool
        remove(mask_img_png)
        mask_img = mask_img_jbig2
    else:
        # Return PNG which mupdf will turn into ccitt with
        # save(..., deflate=True) until mupdf fixes their JBIG2 support
        mask_img = mask_img_png

    return mask_img, bg_img_jp2, (bg_w, bg_h), fg_img_jp2, (fg_w, fg_h)