pyocr: Add patch to support Tesseract 3.05.00

This is from the commit message I've written for the upstream pull
request (jflesch/pyocr#62):

    This is a bit more involved, because Tesseract 3.05.00 comes not
    only with improvements but also with a few quirks we need to deal
    with.

    The first quirk is that the order arguments of the `tesseract'
    command now matters and the list of configurations has to be at the
    end of the command line. So we add a new attribute tesseract_flags
    to the BaseBuilder class that contains a list of all the flags to
    pass to `tesseract', the tesseract_configs attribute however remains
    pretty much the same but now only really contains a list of configs
    instead of being mixed with flag arguments.

    Another quirk has to do with Leptonica >= 1.74 which Tesseract
    3.05.00 now requires. Leptonica has special handling of files that
    reside in /tmp and assumes that it's an internal temporary file of
    Leptonica. In order to deal with it, we now run Tesseract in a
    temporary directory, which contains the input/output files and use
    the relative name of these files because Leptonica only searches for
    path names beginning with /tmp.

    Fortunately the last item we need to address is not really a quirk,
    but an API change. In Tesseract 3.05.00 there is now a new function
    called TessBaseAPIDetectOrientationScript(), which doesn't fill the
    OSResults object anymore but now allows to pass the values we're
    interested in directly by reference. We need to use this new
    function because the old function TessBaseAPIDetectOS() now *always*
    returns false.

I've tested this specifically on NixOS and in conjunction with Paperwork
(the only package that's using pyocr so far) and all the tests of the
dependency chain are now succeeding. However, I didn't do manual tests
of Paperwork though.

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
This commit is contained in:
aszlig 2017-04-09 23:19:12 +02:00
parent 121751e10f
commit 49cf934642
No known key found for this signature in database
GPG Key ID: 1DE8E48E57DB5436
2 changed files with 318 additions and 0 deletions

View File

@ -0,0 +1,316 @@
This patch is required for pyocr to work with Tesseract version 3.05.00
and has been submitted upstream at the following URL:
https://github.com/jflesch/pyocr/pull/62
diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
index 73c964d..20f390c 100644
--- a/src/pyocr/builders.py
+++ b/src/pyocr/builders.py
@@ -240,8 +240,10 @@ class BaseBuilder(object):
cuneiform_args : Arguments passed to the Cuneiform command line.
"""
- def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
+ def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
+ cuneiform_args):
self.file_extensions = file_extensions
+ self.tesseract_flags = tesseract_flags
self.tesseract_configs = tesseract_configs
self.cuneiform_args = cuneiform_args
@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
file_ext = ["txt"]
- tess_conf = ["-psm", str(tesseract_layout)]
+ tess_flags = ["-psm", str(tesseract_layout)]
cun_args = ["-f", "text"]
# Add custom cuneiform parameters if needed
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder):
(cuneiform_singlecolumn, "--singlecolumn")]:
if par:
cun_args.append(arg)
- super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
+ super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
self.tesseract_layout = tesseract_layout
self.built_text = []
@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
+ tess_flags = ["-psm", str(tesseract_layout)]
+ tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
- super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+ super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+ cun_args)
self.word_boxes = []
self.tesseract_layout = tesseract_layout
@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
+ tess_flags = ["-psm", str(tesseract_layout)]
+ tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
- super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+ super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+ cun_args)
self.lines = []
self.tesseract_layout = tesseract_layout
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index 0c2259a..f7ab309 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -263,11 +263,22 @@ if g_libtesseract:
]
g_libtesseract.TessDeleteText.restype = None
- g_libtesseract.TessBaseAPIDetectOS.argtypes = [
- ctypes.c_void_p, # TessBaseAPI*
- ctypes.POINTER(OSResults),
- ]
- g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
+ g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
+ ctypes.c_void_p, # TessBaseAPI*
+ ctypes.POINTER(ctypes.c_int), # orient_deg
+ ctypes.POINTER(ctypes.c_float), # orient_conf
+ ctypes.POINTER(ctypes.c_char_p), # script_name
+ ctypes.POINTER(ctypes.c_float), # script_conf
+ ]
+ g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
+ ctypes.c_bool
+ else:
+ g_libtesseract.TessBaseAPIDetectOS.argtypes = [
+ ctypes.c_void_p, # TessBaseAPI*
+ ctypes.POINTER(OSResults),
+ ]
+ g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
def init(lang=None):
@@ -526,15 +537,37 @@ def detect_os(handle):
global g_libtesseract
assert(g_libtesseract)
- results = OSResults()
- r = g_libtesseract.TessBaseAPIDetectOS(
- ctypes.c_void_p(handle),
- ctypes.pointer(results)
- )
- if not r:
- raise TesseractError("detect_orientation failed",
- "TessBaseAPIDetectOS() failed")
- return {
- "orientation": results.best_orientation_id,
- "confidence": results.best_oconfidence,
- }
+ # Use the new API function if it is available, because since Tesseract
+ # 3.05.00 the old API function _always_ returns False.
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
+ orientation_deg = ctypes.c_int(0)
+ orientation_confidence = ctypes.c_float(0.0)
+
+ r = g_libtesseract.TessBaseAPIDetectOrientationScript(
+ ctypes.c_void_p(handle),
+ ctypes.byref(orientation_deg),
+ ctypes.byref(orientation_confidence),
+ None, # script_name
+ None # script_confidence
+ )
+
+ if not r:
+ raise TesseractError("detect_orientation failed",
+ "TessBaseAPIDetectOrientationScript() failed")
+ return {
+ "orientation": round(orientation_deg.value / 90),
+ "confidence": orientation_confidence.value,
+ }
+ else: # old API (before Tesseract 3.05.00)
+ results = OSResults()
+ r = g_libtesseract.TessBaseAPIDetectOS(
+ ctypes.c_void_p(handle),
+ ctypes.pointer(results)
+ )
+ if not r:
+ raise TesseractError("detect_orientation failed",
+ "TessBaseAPIDetectOS() failed")
+ return {
+ "orientation": results.best_orientation_id,
+ "confidence": results.best_oconfidence,
+ }
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
index 99b0121..658c96b 100755
--- a/src/pyocr/tesseract.py
+++ b/src/pyocr/tesseract.py
@@ -22,6 +22,8 @@ import os
import subprocess
import sys
import tempfile
+import contextlib
+import shutil
from . import builders
from . import error
@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
def __init__(self):
file_ext = ["box"]
+ tess_flags = []
tess_conf = ["batch.nochop", "makebox"]
cun_args = []
- super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
+ super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
+ cun_args)
self.tesseract_layout = 1
@staticmethod
@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
TesseractError --- if no script detected on the image
"""
_set_environment()
- with temp_file(".bmp") as input_file:
- command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
+ with temp_dir() as tmpdir:
+ command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
if lang is not None:
command += ['-l', lang]
if image.mode != "RGB":
image = image.convert("RGB")
- image.save(input_file.name)
+ image.save(os.path.join(tmpdir, "input.bmp"))
proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
startupinfo=g_subprocess_startup_info,
creationflags=g_creation_flags,
+ cwd=tmpdir,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
proc.stdin.close()
@@ -224,8 +229,8 @@ def get_available_builders():
]
-def run_tesseract(input_filename, output_filename_base, lang=None,
- configs=None):
+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
+ flags=None, configs=None):
'''
Runs Tesseract:
`TESSERACT_CMD` \
@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
input_filename --- image to read
output_filename_base --- file name in which must be stored the result
(without the extension)
+ cwd --- Run Tesseract in the specified working directory or use current
+ one if None
lang --- Tesseract language to use (if None, none will be specified)
config --- List of Tesseract configs to use (if None, none will be
specified)
@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
if lang is not None:
command += ['-l', lang]
+ if flags is not None:
+ command += flags
+
if configs is not None:
command += configs
- proc = subprocess.Popen(command,
+ proc = subprocess.Popen(command, cwd=cwd,
startupinfo=g_subprocess_startup_info,
creationflags=g_creation_flags,
stdout=subprocess.PIPE,
@@ -301,11 +311,18 @@ class ReOpenableTempfile(object):
self.name = None
-def temp_file(suffix):
- ''' Returns a temporary file '''
- if os.name == 'nt': # Windows
- return ReOpenableTempfile(suffix)
- return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
+@contextlib.contextmanager
+def temp_dir():
+ """
+ A context manager for maintaining a temporary directory
+ """
+ # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
+ # since Python 3.2 there is a context manager called TemporaryDirectory().
+ path = tempfile.mkdtemp(prefix='tess_')
+ try:
+ yield path
+ finally:
+ shutil.rmtree(path)
def image_to_string(image, lang=None, builder=None):
@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
if builder is None:
builder = builders.TextBuilder()
- with temp_file(".bmp") as input_file:
- with temp_file('') as output_file:
- output_file_name_base = output_file.name
-
+ with temp_dir() as tmpdir:
if image.mode != "RGB":
image = image.convert("RGB")
- image.save(input_file.name)
- (status, errors) = run_tesseract(input_file.name,
- output_file_name_base,
+ image.save(os.path.join(tmpdir, "input.bmp"))
+ (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
lang=lang,
+ flags=builder.tesseract_flags,
configs=builder.tesseract_configs)
if status:
raise TesseractError(status, errors)
output_file_name = "ERROR"
for file_extension in builder.file_extensions:
- output_file_name = ('%s.%s' % (output_file_name_base,
+ output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
file_extension))
if not os.access(output_file_name, os.F_OK):
continue
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
index ccddd07..1ac2a4b 100644
--- a/tests/tests_libtesseract.py
+++ b/tests/tests_libtesseract.py
@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase):
(3, 3, 0),
(3, 4, 0),
(3, 4, 1),
+ (3, 5, 0),
), ("Tesseract does not have the expected version"
- " (3.4.0) ! Some tests will be skipped !"))
+ " (3.5.0) ! Some tests will be skipped !"))
def test_langs(self):
langs = libtesseract.get_available_languages()
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
index e29c512..fa4d483 100644
--- a/tests/tests_tesseract.py
+++ b/tests/tests_tesseract.py
@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase):
(3, 3, 0),
(3, 4, 0),
(3, 4, 1),
+ (3, 5, 0),
), ("Tesseract does not have the expected version"
- " (3.4.0) ! Some tests will be skipped !"))
+ " (3.5.0) ! Some tests will be skipped !"))
def test_langs(self):
langs = tesseract.get_available_languages()

View File

@ -20883,6 +20883,8 @@ in {
sha256 = "0amyhkkm400qzbw65ivyzrzxl2r7vxqgsgqm7ml95m7gwkwhnzz0";
};
patches = [ ../development/python-modules/pyocr-tesseract.patch ];
postPatch = ''
sed -i \
-e 's,^\(TESSERACT_CMD *= *\).*,\1"${pkgs.tesseract}/bin/tesseract",' \