Merge pull request #65775 from Kiwi/ocrmypdf

ocrmypdf: init
This commit is contained in:
Frederik Rietdijk 2019-08-18 08:16:59 +02:00 committed by GitHub
commit 316a0e9382
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 353 additions and 0 deletions

View File

@ -0,0 +1,31 @@
{ buildPythonPackage
, fetchFromGitHub
, lxml
, pillow
, reportlab
, stdenv
}:

# Python package for hocr-tools, built from the "v<version>" tag of the
# tmbdev/hocr-tools GitHub repository.
buildPythonPackage rec {
  pname = "hocr-tools";
  version = "1.3.0";

  src = fetchFromGitHub {
    owner = "tmbdev";
    repo = pname;
    rev = "v${version}";
    sha256 = "14f9hkp7pr677085w8iidwd0la9cjzy3pyj3rdg9b03nz9pc0w6p";
  };

  # hocr-tools uses a test framework that requires internet access
  doCheck = false;

  propagatedBuildInputs = [ pillow lxml reportlab ];

  meta = with stdenv.lib; {
    # Keep this a single-line string: meta.description is meant to be a short
    # one-line summary, and an embedded newline would be carried verbatim
    # into anything that displays it.
    description = "Tools for manipulating and evaluating the hOCR format for representing multi-lingual OCR results by embedding them into HTML";
    homepage = "https://github.com/tmbdev/hocr-tools";
    license = licenses.asl20;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -0,0 +1,73 @@
{ attrs
, buildPythonPackage
, defusedxml
, fetchPypi
, hypothesis
, isPy3k
, lxml
, pillow
, pybind11
, pytest
, pytest-helpers-namespace
, pytest-timeout
, pytest_xdist
, pytestrunner
, python-xmp-toolkit
# NOTE(review): python3 is accepted but not referenced anywhere in this
# expression; it is kept so existing callers/overrides keep working.
, python3
, qpdf
, setuptools-scm-git-archive
, setuptools_scm
, stdenv
}:

# pikepdf Python package, fetched from PyPI. Python 3 only.
buildPythonPackage rec {
  pname = "pikepdf";
  version = "1.1.0";

  # Refuse to build for Python 2 interpreters.
  disabled = !isPy3k;

  src = fetchPypi {
    inherit pname version;
    sha256 = "14b36r6h3088z2sxp2pqvm171js53hz53mwm1g52iadignjnp0my";
  };

  nativeBuildInputs = [ setuptools-scm-git-archive setuptools_scm ];

  buildInputs = [ pybind11 qpdf ];

  propagatedBuildInputs = [ defusedxml lxml ];

  checkInputs = [
    attrs
    hypothesis
    pillow
    pytest
    pytest-helpers-namespace
    pytest-timeout
    pytest_xdist
    pytestrunner
    python-xmp-toolkit
  ];

  # Rewrite the pytest version pin in requirements/test.txt from
  # ">= 3.6.0, < 4.1.0" to ">= 4.2.1, < 5".
  postPatch = ''
    substituteInPlace requirements/test.txt \
    --replace "pytest >= 3.6.0, < 4.1.0" "pytest >= 4.2.1, < 5"
  '';

  # Point HOME at the build's temporary directory before building
  # (presumably something in the build expects a writable HOME; confirm).
  preBuild = ''
    HOME=$TMPDIR
  '';

  meta = with stdenv.lib; {
    description = "Read and write PDFs with Python, powered by qpdf";
    homepage = "https://github.com/pikepdf/pikepdf";
    license = licenses.mpl20;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -0,0 +1,35 @@
{ buildPythonPackage
, fetchFromGitHub
, pytest
, stdenv
}:

# pytest-helpers-namespace, built from the "v<version>" tag of the
# saltstack/pytest-helpers-namespace GitHub repository.
buildPythonPackage rec {
  pname = "pytest-helpers-namespace";
  version = "2019.1.8";

  src = fetchFromGitHub {
    owner = "saltstack";
    repo = pname;
    rev = "v${version}";
    sha256 = "0z9f25d2wpf3lnqzmmnrlvl5b1f7kqwjjf4nzs9x2bpf91s5zny1";
  };

  # pytest is needed both to build against and to run the test suite.
  buildInputs = [ pytest ];
  checkInputs = [ pytest ];

  # The tests fail with newest pytest. They passed with pytest_3, which no longer exists
  doCheck = false;

  # Only takes effect if doCheck is turned back on.
  checkPhase = ''
    pytest
  '';

  meta = with stdenv.lib; {
    description = "PyTest Helpers Namespace";
    homepage = "https://github.com/saltstack/pytest-helpers-namespace";
    license = licenses.asl20;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -0,0 +1,44 @@
{ buildPythonPackage
, exempi
, fetchFromGitHub
, mock
, pythonOlder
, pytz
, stdenv
}:

buildPythonPackage rec {
  pname = "python-xmp-toolkit";
  version = "2.0.2";

  # Why a pinned commit instead of a release:
  #   * PyPI only carries 2.0.1, whose tests fail.
  #   * Upstream has commits for a 2.0.2 release, but it was never published,
  #     neither on GitHub nor on PyPI.
  #   * The revision below is the latest commit, dated Jun 29, 2017
  #     (as of Mar 13, 2019). It contains the unreleased 2.0.2 changes and
  #     more, and the tests pass with it.
  src = fetchFromGitHub {
    owner = "python-xmp-toolkit";
    repo = "python-xmp-toolkit";
    rev = "5692bdf8dac3581a0d5fb3c5aeb29be0ab6a54fc";
    sha256 = "16bylcm183ilzp7mrpdzw0pzp6csv9v5v247914qsv2abg0hgl5y";
  };

  buildInputs = [ exempi ];

  propagatedBuildInputs = [ pytz ];

  # mock is only pulled in for interpreters older than Python 3.3.
  checkInputs = stdenv.lib.optionals (pythonOlder "3.3") [ mock ];

  # Replace the ctypes.util.find_library('exempi') lookup in libxmp/exempi.py
  # with the absolute path of the exempi shared library from the exempi input.
  postPatch = ''
    substituteInPlace libxmp/exempi.py \
    --replace "ctypes.util.find_library('exempi')" "'${exempi}/lib/libexempi${stdenv.hostPlatform.extensions.sharedLibrary}'"
  '';

  meta = with stdenv.lib; {
    description = "Python XMP Toolkit for working with metadata";
    homepage = "https://github.com/python-xmp-toolkit/python-xmp-toolkit";
    license = licenses.bsd3;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -0,0 +1,53 @@
{ buildPythonPackage
, fetchFromGitHub
, gevent
, hostname
, pytest
, python
, stdenv
}:

# ruffus, built from the "v<version>" tag of the cgat-developers/ruffus
# GitHub repository. The test suite is driven by ruffus/test/Makefile.
buildPythonPackage rec {
  pname = "ruffus";
  version = "2.8.1";

  src = fetchFromGitHub {
    owner = "cgat-developers";
    repo = pname;
    rev = "v${version}";
    sha256 = "1gyabqafq4s2sy0prh3k1m8859shzjmfxr7fimx10liflvki96a9";
  };

  propagatedBuildInputs = [ gevent ];

  checkInputs = [ gevent hostname pytest ];

  # NOTE(review): checkPhase below changes into ruffus/test and runs plain
  # `make`, so it is unclear whether this attribute has any effect - confirm.
  makefile = "ruffus/test/Makefile";

  # Rewrite interpreter, shell and pytest references in the test harness
  # (Makefile, run_all_unit_tests*.cmd and one test module) to the shell,
  # pytest and Python interpreter supplied to this expression.
  postPatch = ''
    sed -i -e 's|/bin/bash|${stdenv.shell}|' ruffus/test/Makefile
    sed -i -e 's|\tpytest|\t${pytest}/bin/pytest|' ruffus/test/Makefile
    sed -i -e 's|\tpython|\t${python.interpreter}|' ruffus/test/Makefile
    sed -i -e 's|/usr/bin/env bash|${stdenv.shell}|' ruffus/test/run_all_unit_tests.cmd
    sed -i -e 's|python3|${python.interpreter}|' ruffus/test/run_all_unit_tests3.cmd
    sed -i -e 's|python %s|${python.interpreter} %s|' ruffus/test/test_drmaa_wrapper_run_job_locally.py
  '';

  # Run the upstream Makefile's "all" target from inside ruffus/test, with
  # HOME pointed at the build's temporary directory.
  checkPhase = ''
    export HOME=$TMPDIR
    cd ruffus/test
    make all PYTEST_OPTIONS="-q --disable-warnings"
  '';

  meta = with stdenv.lib; {
    description = "Light-weight Python Computational Pipeline Management";
    homepage = "http://www.ruffus.org.uk";
    license = licenses.mit;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -0,0 +1,103 @@
{ fetchFromGitHub
, ghostscript
, img2pdf
, jbig2enc
, leptonica
, pngquant
, python3
, python3Packages
, qpdf
, stdenv
, tesseract4
, unpaper
}:

let
  inherit (python3Packages) buildPythonApplication;

  # External command-line tools. They are put on the PATH of the installed
  # program (see makeWrapperArgs) and are also available to the test suite
  # (see checkInputs).
  runtimeDeps = [
    ghostscript
    jbig2enc
    pngquant
    qpdf
    tesseract4
    unpaper
  ];

in buildPythonApplication rec {
  pname = "ocrmypdf";
  version = "8.2.3";

  # Python 3 only.
  disabled = ! python3Packages.isPy3k;

  src = fetchFromGitHub {
    owner = "jbarlow83";
    repo = "OCRmyPDF";
    rev = "v${version}";
    sha256 = "1ldlyhxkav34y9d7g2kx3d4p26c2b82vnwi0ywnfynb16sav36d5";
  };

  nativeBuildInputs = with python3Packages; [
    pytestrunner
    setuptools
    setuptools-scm-git-archive
    setuptools_scm
  ];

  # NOTE(review): img2pdf is also a function argument of this expression, and
  # lexically bound names take precedence over `with`, so the entry below is
  # the img2pdf passed in by the caller rather than python3Packages.img2pdf.
  # Confirm that is intended.
  propagatedBuildInputs = with python3Packages; [
    cffi
    chardet
    img2pdf
    pdfminer
    pikepdf
    reportlab
    ruffus
  ];

  checkInputs = (with python3Packages; [
    hocr-tools
    pillow
    pypdf2
    pytest
    pytest-helpers-namespace
    pytest_xdist
    pytestcov
    pytestrunner
    python-xmp-toolkit
    setuptools
  ]) ++ [ leptonica ] ++ runtimeDeps;

  # Make the external tools reachable from the wrapped executable. Without
  # this they would only be present while the tests run, and the installed
  # program would depend on whatever happens to be on the user's PATH.
  makeWrapperArgs = [
    "--prefix PATH : ${stdenv.lib.makeBinPath runtimeDeps}"
  ];

  # Load leptonica from its absolute store path instead of relying on
  # find_library('lept').
  postPatch = ''
    substituteInPlace src/ocrmypdf/leptonica.py \
      --replace "ffi.dlopen(find_library('lept'))" \
      'ffi.dlopen("${stdenv.lib.makeLibraryPath [leptonica]}/liblept${stdenv.hostPlatform.extensions.sharedLibrary}")'
  '';

  # The tests take potentially 20+ minutes, depending on machine
  doCheck = false;

  # These tests fail and it might be upstream problem... or packaging. :)
  # development is happening on macos and the pinned test versions are
  # significantly newer than nixpkgs has. Program still works...
  # (to the extent I've used it) -- Kiwi
  #
  # The -k expression is double-quoted on purpose: inside double quotes the
  # shell removes each backslash-newline, so pytest receives one single-line
  # expression. Inside single quotes the backslashes and newlines would be
  # passed through literally.
  checkPhase = ''
    export HOME=$TMPDIR
    pytest -k "not test_force_ocr_on_pdf_with_no_images \
      and not test_tesseract_crash \
      and not test_tesseract_crash_autorotate \
      and not test_ghostscript_pdfa_failure \
      and not test_gs_render_failure \
      and not test_gs_raster_failure \
      and not test_bad_utf8 \
      and not test_old_unpaper"
  '';

  meta = with stdenv.lib; {
    homepage = "https://github.com/jbarlow83/OCRmyPDF";
    description = "Adds an OCR text layer to scanned PDF files, allowing them to be searched";
    license = licenses.gpl3;
    platforms = platforms.linux;
    maintainers = [ maintainers.kiwi ];
  };
}

View File

@ -1704,6 +1704,8 @@ in
hid-listen = callPackage ../tools/misc/hid-listen { };
# Top-level alias: the hocr-tools attribute from python3Packages, passed
# through toPythonApplication.
hocr-tools = with python3Packages; toPythonApplication hocr-tools;
home-manager = callPackage ../tools/package-management/home-manager {};
hostsblock = callPackage ../tools/misc/hostsblock { };
@ -1828,6 +1830,8 @@ in
nyx = callPackage ../tools/networking/nyx { };
ocrmypdf = callPackage ../tools/text/ocrmypdf { };
onboard = callPackage ../applications/misc/onboard { };
xkbd = callPackage ../applications/misc/xkbd { };

View File

@ -643,6 +643,8 @@ in {
hdmedians = callPackage ../development/python-modules/hdmedians { };
hocr-tools = callPackage ../development/python-modules/hocr-tools { };
holoviews = callPackage ../development/python-modules/holoviews { };
hoomd-blue = toPythonModule (callPackage ../development/python-modules/hoomd-blue {
@ -1952,6 +1954,8 @@ in {
hypothesis = self.hypothesis.override { doCheck = false; };
};
pytest-helpers-namespace = callPackage ../development/python-modules/pytest-helpers-namespace { };
pytest-httpbin = callPackage ../development/python-modules/pytest-httpbin { };
pytest-asyncio = callPackage ../development/python-modules/pytest-asyncio { };
@ -4143,6 +4147,8 @@ in {
pika-pool = callPackage ../development/python-modules/pika-pool { };
pikepdf = callPackage ../development/python-modules/pikepdf { };
kmapper = callPackage ../development/python-modules/kmapper { };
kmsxx = (callPackage ../development/libraries/kmsxx {
@ -5083,6 +5089,8 @@ in {
ruamel_yaml = callPackage ../development/python-modules/ruamel_yaml { };
ruffus = callPackage ../development/python-modules/ruffus { };
runsnakerun = callPackage ../development/python-modules/runsnakerun { };
pysendfile = callPackage ../development/python-modules/pysendfile { };
@ -5669,6 +5677,8 @@ in {
python-u2flib-host = callPackage ../development/python-modules/python-u2flib-host { };
python-xmp-toolkit = callPackage ../development/python-modules/python-xmp-toolkit { };
pluggy = callPackage ../development/python-modules/pluggy {};
xcffib = callPackage ../development/python-modules/xcffib {};