From 740d4c22ecccf94b5b55c449437069a96a005c6c Mon Sep 17 00:00:00 2001 From: Daniel Fullmer Date: Fri, 13 Sep 2019 15:53:03 -0400 Subject: [PATCH] k2pdfopt: Fix build and clean up --- pkgs/applications/misc/k2pdfopt/default.nix | 62 +- .../k2pdfopt/leptonica-CVE-2018-3836.patch | 95 -- .../misc/k2pdfopt/leptonica.patch | 254 ++++ pkgs/applications/misc/k2pdfopt/mupdf.patch | 1060 +++++++++++++++++ .../misc/k2pdfopt/tesseract.patch | 678 ++++++++++- 5 files changed, 1991 insertions(+), 158 deletions(-) delete mode 100644 pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch create mode 100644 pkgs/applications/misc/k2pdfopt/leptonica.patch create mode 100644 pkgs/applications/misc/k2pdfopt/mupdf.patch diff --git a/pkgs/applications/misc/k2pdfopt/default.nix b/pkgs/applications/misc/k2pdfopt/default.nix index 9391fe88c5ea..58bd200e713c 100644 --- a/pkgs/applications/misc/k2pdfopt/default.nix +++ b/pkgs/applications/misc/k2pdfopt/default.nix @@ -36,67 +36,19 @@ stdenv.mkDerivation rec { buildInputs = let + # The patches below were constructed by taking the files from k2pdfopt in + # the {mupdf,leptonica,tesseract}_mod/ directories, replacing the + # corresponding files in the respective source trees, resolving any errors + # with more recent versions of these dependencies, and running diff. 
mupdf_modded = mupdf.overrideAttrs (attrs: { - # Excluded the pdf-*.c files, since they mostly just broke the #includes - prePatch = '' - cp ${src}/mupdf_mod/{font,stext-device,string}.c source/fitz/ - cp ${src}/mupdf_mod/font-win32.c source/pdf/ - ''; + patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.14.0 }); - leptonica_modded = leptonica.overrideAttrs (attrs: { - name = "leptonica-1.74.4"; - # Modified source files apply to this particular version of leptonica - version = "1.74.4"; - - src = fetchurl { - url = "http://www.leptonica.org/source/leptonica-1.74.4.tar.gz"; - sha256 = "0fw39amgyv8v6nc7x8a4c7i37dm04i6c5zn62d24bgqnlhk59hr9"; - }; - - prePatch = '' - cp ${src}/leptonica_mod/{allheaders.h,dewarp2.c,leptwin.c} src/ - ''; - patches = [ - # stripped down copy of upstream commit b88c821f8d347bce0aea86d606c710303919f3d2 - ./leptonica-CVE-2018-3836.patch - (fetchpatch { - # CVE-2018-7186 - url = "https://github.com/DanBloomberg/leptonica/commit/" - + "ee301cb2029db8a6289c5295daa42bba7715e99a.patch"; - sha256 = "0cgb7mvz2px1rg5i80wk1wxxjvzjga617d8q6j7qygkp7jm6495d"; - }) - (fetchpatch { - # CVE-2018-7247 - url = "https://github.com/DanBloomberg/leptonica/commit/" - + "c1079bb8e77cdd426759e466729917ca37a3ed9f.patch"; - sha256 = "1z4iac5gwqggh7aa8cvyp6nl9fwd1v7wif26caxc9y5qr3jj34qf"; - }) - (fetchpatch { - # CVE-2018-7440 - url = "https://github.com/DanBloomberg/leptonica/commit/" - + "49ecb6c2dfd6ed5078c62f4a8eeff03e3beced3b.patch"; - sha256 = "1hjmva98iaw9xj7prg7aimykyayikcwnk4hk0380007hqb35lqmy"; - }) - ]; + patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0 }); tesseract_modded = tesseract4.override { tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: { - prePatch = '' - cp ${src}/tesseract_mod/baseapi.{h,cpp} src/api/ - cp ${src}/tesseract_mod/ccutil.{h,cpp} src/ccutil/ - cp ${src}/tesseract_mod/genericvector.h src/ccutil/ - cp ${src}/tesseract_mod/input.cpp src/lstm/ - cp 
${src}/tesseract_mod/lstmrecognizer.cpp src/lstm/ - cp ${src}/tesseract_mod/mainblk.cpp src/ccutil/ - cp ${src}/tesseract_mod/params.cpp src/ccutil/ - cp ${src}/tesseract_mod/serialis.{h,cpp} src/ccutil/ - cp ${src}/tesseract_mod/tesscapi.cpp src/api/ - cp ${src}/tesseract_mod/tessdatamanager.cpp src/ccstruct/ - cp ${src}/tesseract_mod/tessedit.cpp src/ccmain/ - cp ${src}/include_mod/{tesseract.h,leptonica.h} src/api/ - ''; - patches = [ ./tesseract.patch ]; + patches = [ ./tesseract.patch ]; # Last verified with tesseract 4.1 }); }; in diff --git a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch b/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch deleted file mode 100644 index f1b4170fbaae..000000000000 --- a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch +++ /dev/null @@ -1,95 +0,0 @@ ---- a/src/allheaders.h -+++ b/src/allheaders.h -@@ -2600,6 +2600,7 @@ - LEPT_DLL extern char * stringReverse ( const char *src ); - LEPT_DLL extern char * strtokSafe ( char *cstr, const char *seps, char **psaveptr ); - LEPT_DLL extern l_int32 stringSplitOnToken ( char *cstr, const char *seps, char **phead, char **ptail ); -+LEPT_DLL extern l_int32 stringCheckForChars ( const char *src, const char *chars, l_int32 *pfound ); - LEPT_DLL extern char * stringRemoveChars ( const char *src, const char *remchars ); - LEPT_DLL extern l_int32 stringFindSubstr ( const char *src, const char *sub, l_int32 *ploc ); - LEPT_DLL extern char * stringReplaceSubstr ( const char *src, const char *sub1, const char *sub2, l_int32 *pfound, l_int32 *ploc ); ---- a/src/gplot.c -+++ b/src/gplot.c -@@ -141,9 +141,10 @@ - const char *xlabel, - const char *ylabel) - { --char *newroot; --char buf[L_BUF_SIZE]; --GPLOT *gplot; -+char *newroot; -+char buf[L_BUF_SIZE]; -+l_int32 badchar; -+GPLOT *gplot; - - PROCNAME("gplotCreate"); - -@@ -152,6 +153,9 @@ - if (outformat != GPLOT_PNG && outformat != GPLOT_PS && - outformat != GPLOT_EPS && outformat != GPLOT_LATEX) - return 
(GPLOT *)ERROR_PTR("outformat invalid", procName, NULL); -+ stringCheckForChars(rootname, "`;&|><\"?*", &badchar); -+ if (badchar) /* danger of command injection */ -+ return (GPLOT *)ERROR_PTR("invalid rootname", procName, NULL); - - if ((gplot = (GPLOT *)LEPT_CALLOC(1, sizeof(GPLOT))) == NULL) - return (GPLOT *)ERROR_PTR("gplot not made", procName, NULL); ---- a/src/utils2.c -+++ b/src/utils2.c -@@ -42,6 +42,7 @@ - * l_int32 stringSplitOnToken() - * - * Find and replace string and array procs -+ * l_int32 stringCheckForChars() - * char *stringRemoveChars() - * l_int32 stringFindSubstr() - * char *stringReplaceSubstr() -@@ -701,6 +702,48 @@ - /*--------------------------------------------------------------------* - * Find and replace procs * - *--------------------------------------------------------------------*/ -+/*! -+ * \brief stringCheckForChars() -+ * -+ * \param[in] src input string; can be of zero length -+ * \param[in] chars string of chars to be searched for in %src -+ * \param[out] pfound 1 if any characters are found; 0 otherwise -+ * \return 0 if OK, 1 on error -+ * -+ *
-+ * Notes:
-+ *      (1) This can be used to sanitize an operation by checking for
-+ *          special characters that don't belong in a string.
-+ * 
-+ */ -+l_int32 -+stringCheckForChars(const char *src, -+ const char *chars, -+ l_int32 *pfound) -+{ -+char ch; -+l_int32 i, n; -+ -+ PROCNAME("stringCheckForChars"); -+ -+ if (!pfound) -+ return ERROR_INT("&found not defined", procName, 1); -+ *pfound = FALSE; -+ if (!src || !chars) -+ return ERROR_INT("src and chars not both defined", procName, 1); -+ -+ n = strlen(src); -+ for (i = 0; i < n; i++) { -+ ch = src[i]; -+ if (strchr(chars, ch)) { -+ *pfound = TRUE; -+ break; -+ } -+ } -+ return 0; -+} -+ -+ - /*! - * \brief stringRemoveChars() - * diff --git a/pkgs/applications/misc/k2pdfopt/leptonica.patch b/pkgs/applications/misc/k2pdfopt/leptonica.patch new file mode 100644 index 000000000000..dfab99fd0130 --- /dev/null +++ b/pkgs/applications/misc/k2pdfopt/leptonica.patch @@ -0,0 +1,254 @@ +From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001 +From: Daniel Fullmer +Date: Fri, 13 Sep 2019 15:54:21 -0400 +Subject: [PATCH] Willus mod for k2pdfopt + +--- + src/allheaders.h | 4 ++ + src/dewarp2.c | 106 ++++++++++++++++++++++++++++++++++++++++++----- + src/leptwin.c | 6 ++- + 3 files changed, 104 insertions(+), 12 deletions(-) + +diff --git a/src/allheaders.h b/src/allheaders.h +index e68eff1..b3cc729 100644 +--- a/src/allheaders.h ++++ b/src/allheaders.h +@@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size ); + LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa ); + LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa ); + LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa ); ++/* WILLUS MOD */ ++ LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order ); ++ LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order ); ++ LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char 
*debugfile,l_int32 fit_order ); + LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile ); + LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag ); + LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa ); +diff --git a/src/dewarp2.c b/src/dewarp2.c +index 220eec1..2e29500 100644 +--- a/src/dewarp2.c ++++ b/src/dewarp2.c +@@ -144,9 +144,17 @@ static const l_float32 L_ALLOWED_W_FRACT = 0.05; /* no bigger */ + * longest textlines. + * + */ ++/* WILLUS MOD */ + l_ok +-dewarpBuildPageModel(L_DEWARP *dew, +- const char *debugfile) ++dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile) ++{ ++return(dewarpBuildPageModel_ex(dew,debugfile,2)); ++} ++ ++l_ok ++dewarpBuildPageModel_ex(L_DEWARP *dew, ++ const char *debugfile, ++ l_int32 fit_order) + { + l_int32 linecount, topline, botline, ret; + PIX *pixs, *pix1, *pix2, *pix3; +@@ -225,7 +233,7 @@ PTAA *ptaa1, *ptaa2; + /* Get the sampled vertical disparity from the textline centers. + * The disparity array will push pixels vertically so that each + * textline is flat and centered at the y-position of the mid-point. */ +- if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) { ++ if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) { + L_WARNING("vertical disparity not built\n", procName); + ptaaDestroy(&ptaa2); + return 1; +@@ -290,13 +298,24 @@ PTAA *ptaa1, *ptaa2; + * a pdf. Non-pix debug output goes to /tmp. 
+ * + */ ++/* WILLUS MOD */ + l_ok + dewarpFindVertDisparity(L_DEWARP *dew, + PTAA *ptaa, + l_int32 rotflag) + { ++return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2)); ++} ++/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */ ++l_int32 ++dewarpFindVertDisparity_ex(L_DEWARP *dew, ++ PTAA *ptaa, ++ l_int32 rotflag, ++ l_int32 fit_order) ++{ + l_int32 i, j, nlines, npts, nx, ny, sampling; +-l_float32 c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval; ++/* WILLUS MOD */ ++l_float32 c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval; + l_float32 *famidys; + NUMA *nax, *nafit, *nacurve0, *nacurve1, *nacurves; + NUMA *namidy, *namidys, *namidysi; +@@ -304,11 +323,22 @@ PIX *pix1, *pix2, *pixcirc, *pixdb; + PTA *pta, *ptad, *ptacirc; + PTAA *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat; + FPIX *fpix; ++/* WILLUS MOD */ ++l_int32 fit_order1,fit_order2; + + PROCNAME("dewarpFindVertDisparity"); + + if (!dew) + return ERROR_INT("dew not defined", procName, 1); ++/* WILLUS MOD */ ++ if (fit_order < 10) ++ fit_order1 = fit_order2 = fit_order; ++ else ++ { ++ fit_order1=fit_order % 10; ++ fit_order2=fit_order / 10; ++ fit_order2=fit_order2 % 10; ++ } + dew->vsuccess = 0; + if (!ptaa) + return ERROR_INT("ptaa not defined", procName, 1); +@@ -331,12 +361,32 @@ FPIX *fpix; + pixdb = (rotflag) ? 
pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs); + for (i = 0; i < nlines; i++) { /* for each line */ + pta = ptaaGetPta(ptaa, i, L_CLONE); +- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); +- numaAddNumber(nacurve0, c2); ++/* WILLUS MOD */ ++if (fit_order1>3) ++ { ++ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL); ++ numaAddNumber(nacurve0, c4); ++ } ++else if (fit_order1==3) ++ { ++ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL); ++ numaAddNumber(nacurve0, c3); ++ } ++else ++ { ++ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); ++ numaAddNumber(nacurve0, c2); ++ } + ptad = ptaCreate(nx); + for (j = 0; j < nx; j++) { /* uniformly sampled in x */ + x = j * sampling; +- applyQuadraticFit(c2, c1, c0, x, &y); ++/* WILLUS MOD */ ++if (fit_order1>3) ++ applyQuarticFit(c4, c3, c2, c1, c0, x, &y); ++else if (fit_order1==3) ++ applyCubicFit(c3, c2, c1, c0, x, &y); ++else ++ applyQuadraticFit(c2, c1, c0, x, &y); + ptaAddPt(ptad, x, y); + } + ptaaAddPta(ptaa0, ptad, L_INSERT); +@@ -350,7 +400,13 @@ FPIX *fpix; + for (i = 0; i < nlines; i++) { + pta = ptaaGetPta(ptaa, i, L_CLONE); + ptaGetArrays(pta, &nax, NULL); +- ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit); ++/* WILLUS MOD */ ++if (fit_order1>3) ++ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit); ++else if (fit_order1==3) ++ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit); ++else ++ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit); + ptad = ptaCreateFromNuma(nax, nafit); + ptaaAddPta(ptaat, ptad, L_INSERT); + ptaDestroy(&pta); +@@ -494,11 +550,24 @@ FPIX *fpix; + ptaa5 = ptaaCreate(nx); /* uniformly sampled across full height of image */ + for (j = 0; j < nx; j++) { /* for each column */ + pta = ptaaGetPta(ptaa4, j, L_CLONE); +- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); ++/* WILLUS MOD */ ++/* Order higher than 2 can cause a little craziness here. 
*/ ++if (fit_order2>3) ++ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL); ++else if (fit_order2==3) ++ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL); ++else ++ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); + ptad = ptaCreate(ny); + for (i = 0; i < ny; i++) { /* uniformly sampled in y */ + y = i * sampling; +- applyQuadraticFit(c2, c1, c0, y, &val); ++/* WILLUS MOD */ ++if (fit_order2>3) ++ applyQuarticFit(c4, c3, c2, c1, c0, y, &val); ++else if (fit_order2==3) ++ applyCubicFit(c3, c2, c1, c0, y, &val); ++else ++ applyQuadraticFit(c2, c1, c0, y, &val); + ptaAddPt(ptad, y, val); + } + ptaaAddPta(ptaa5, ptad, L_INSERT); +@@ -1602,11 +1671,21 @@ FPIX *fpix; + * See notes there. + * + */ ++/* WILLUS MOD */ + l_ok + dewarpBuildLineModel(L_DEWARP *dew, + l_int32 opensize, + const char *debugfile) + { ++return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2)); ++} ++ ++l_int32 ++dewarpBuildLineModel_ex(L_DEWARP *dew, ++ l_int32 opensize, ++ const char *debugfile, ++ l_int32 fit_order) ++{ + char buf[64]; + l_int32 i, j, bx, by, ret, nlines; + BOXA *boxa; +@@ -1695,6 +1774,8 @@ PTAA *ptaa1, *ptaa2; + + /* Remove all lines that are not at least 0.75 times the length + * of the longest line. */ ++/* WILLUS MOD */ ++/* + ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES); + if (debugfile) { + pix1 = pixConvertTo32(pix); +@@ -1704,6 +1785,8 @@ PTAA *ptaa1, *ptaa2; + pixDestroy(&pix1); + pixDestroy(&pix2); + } ++*/ ++ptaa2=ptaa1; + ptaaDestroy(&ptaa1); + nlines = ptaaGetCount(ptaa2); + if (nlines < dew->minlines) { +@@ -1717,7 +1800,8 @@ PTAA *ptaa1, *ptaa2; + * centers. The disparity array will push pixels vertically + * so that each line is flat and centered at the y-position + * of the mid-point. */ +- ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i); ++/* WILLUS MOD */ ++ ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order); + + /* If i == 0, move the result to the horizontal disparity, + * rotating it back by -90 degrees. 
*/ +diff --git a/src/leptwin.c b/src/leptwin.c +index 72643a0..573d33e 100644 +--- a/src/leptwin.c ++++ b/src/leptwin.c +@@ -364,5 +364,9 @@ PIXCMAP *cmap; + + return hBitmap; + } +- ++#else ++/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */ ++int leptwin_my_empty_func(void); ++int leptwin_my_empty_func(void) ++{return(0);} + #endif /* _WIN32 */ +-- +2.22.0 + diff --git a/pkgs/applications/misc/k2pdfopt/mupdf.patch b/pkgs/applications/misc/k2pdfopt/mupdf.patch new file mode 100644 index 000000000000..f7c04d42a71d --- /dev/null +++ b/pkgs/applications/misc/k2pdfopt/mupdf.patch @@ -0,0 +1,1060 @@ +From 3d763f84872351c250ffea26150e73b02b8f4c6f Mon Sep 17 00:00:00 2001 +From: Daniel Fullmer +Date: Fri, 13 Sep 2019 15:11:45 -0400 +Subject: [PATCH] Willus mod for k2pdfopt + +--- + source/fitz/filter-basic.c | 3 + + source/fitz/font-win32.c | 866 +++++++++++++++++++++++++++++++++++++ + source/fitz/font.c | 3 + + source/fitz/stext-device.c | 5 + + source/fitz/string.c | 5 + + source/pdf/pdf-annot.c | 14 +- + source/pdf/pdf-link.c | 3 + + source/pdf/pdf-parse.c | 5 + + source/pdf/pdf-xref.c | 9 + + 9 files changed, 912 insertions(+), 1 deletion(-) + create mode 100644 source/fitz/font-win32.c + +diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c +index 0713a62e7..b8ef4d292 100644 +--- a/source/fitz/filter-basic.c ++++ b/source/fitz/filter-basic.c +@@ -259,7 +259,10 @@ look_for_endstream: + if (!state->warned) + { + state->warned = 1; ++/* willus mod -- no warning */ ++/* + fz_warn(ctx, "PDF stream Length incorrect"); ++*/ + } + return *stm->rp++; + } +diff --git a/source/fitz/font-win32.c b/source/fitz/font-win32.c +new file mode 100644 +index 000000000..45de8cfd3 +--- /dev/null ++++ b/source/fitz/font-win32.c +@@ -0,0 +1,866 @@ ++/* ++** Routines to access MS Windows system fonts. ++** From sumatra PDF distro. 
++** Modified for MuPDF v1.9a by willus.com ++*/ ++#include "mupdf/pdf.h" ++ ++/* ++ Which fonts are embedded is based on a few preprocessor definitions. ++ ++ The base 14 fonts are always embedded. ++ For CJK font substitution we embed DroidSansFallback. ++ ++ Set NOCJK to skip all CJK support (this also omits embedding the CJK CMaps) ++ Set NOCJKFONT to skip the embedded CJK font. ++ Set NOCJKFULL to embed a smaller CJK font without CJK Extension A support. ++*/ ++ ++#ifdef NOCJK ++#define NOCJKFONT ++#endif ++ ++/* SumatraPDF: also load fonts included with Windows */ ++#ifdef _WIN32 ++ ++#ifndef UNICODE ++#define UNICODE ++#endif ++#ifndef _UNICODE ++#define _UNICODE ++#endif ++ ++#include ++ ++// TODO: Use more of FreeType for TTF parsing (for performance reasons, ++// the fonts can't be parsed completely, though) ++#include ++#include FT_TRUETYPE_IDS_H ++#include FT_TRUETYPE_TAGS_H ++ ++#define TTC_VERSION1 0x00010000 ++#define TTC_VERSION2 0x00020000 ++ ++#define MAX_FACENAME 128 ++ ++// Note: the font face must be the first field so that the structure ++// can be treated like a simple string for searching ++typedef struct pdf_fontmapMS_s ++{ ++ char fontface[MAX_FACENAME]; ++ char fontpath[MAX_PATH]; ++ int index; ++} pdf_fontmapMS; ++ ++typedef struct pdf_fontlistMS_s ++{ ++ pdf_fontmapMS *fontmap; ++ int len; ++ int cap; ++} pdf_fontlistMS; ++ ++typedef struct _tagTT_OFFSET_TABLE ++{ ++ ULONG uVersion; ++ USHORT uNumOfTables; ++ USHORT uSearchRange; ++ USHORT uEntrySelector; ++ USHORT uRangeShift; ++} TT_OFFSET_TABLE; ++ ++typedef struct _tagTT_TABLE_DIRECTORY ++{ ++ ULONG uTag; //table name ++ ULONG uCheckSum; //Check sum ++ ULONG uOffset; //Offset from beginning of file ++ ULONG uLength; //length of the table in bytes ++} TT_TABLE_DIRECTORY; ++ ++typedef struct _tagTT_NAME_TABLE_HEADER ++{ ++ USHORT uFSelector; //format selector. 
Always 0 ++ USHORT uNRCount; //Name Records count ++ USHORT uStorageOffset; //Offset for strings storage, from start of the table ++} TT_NAME_TABLE_HEADER; ++ ++typedef struct _tagTT_NAME_RECORD ++{ ++ USHORT uPlatformID; ++ USHORT uEncodingID; ++ USHORT uLanguageID; ++ USHORT uNameID; ++ USHORT uStringLength; ++ USHORT uStringOffset; //from start of storage area ++} TT_NAME_RECORD; ++ ++typedef struct _tagFONT_COLLECTION ++{ ++ ULONG Tag; ++ ULONG Version; ++ ULONG NumFonts; ++} FONT_COLLECTION; ++ ++static struct { ++ char *name; ++ char *pattern; ++} baseSubstitutes[] = { ++ { "Courier", "CourierNewPSMT" }, ++ { "Courier-Bold", "CourierNewPS-BoldMT" }, ++ { "Courier-Oblique", "CourierNewPS-ItalicMT" }, ++ { "Courier-BoldOblique", "CourierNewPS-BoldItalicMT" }, ++ { "Helvetica", "ArialMT" }, ++ { "Helvetica-Bold", "Arial-BoldMT" }, ++ { "Helvetica-Oblique", "Arial-ItalicMT" }, ++ { "Helvetica-BoldOblique", "Arial-BoldItalicMT" }, ++ { "Times-Roman", "TimesNewRomanPSMT" }, ++ { "Times-Bold", "TimesNewRomanPS-BoldMT" }, ++ { "Times-Italic", "TimesNewRomanPS-ItalicMT" }, ++ { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT" }, ++ { "Symbol", "SymbolMT" }, ++}; ++static const char *base_font_names[][10] = ++{ ++ { "Courier", "CourierNew", "CourierNewPSMT", NULL }, ++ { "Courier-Bold", "CourierNew,Bold", "Courier,Bold", ++ "CourierNewPS-BoldMT", "CourierNew-Bold", NULL }, ++ { "Courier-Oblique", "CourierNew,Italic", "Courier,Italic", ++ "CourierNewPS-ItalicMT", "CourierNew-Italic", NULL }, ++ { "Courier-BoldOblique", "CourierNew,BoldItalic", "Courier,BoldItalic", ++ "CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic", NULL }, ++ { "Helvetica", "ArialMT", "Arial", NULL }, ++ { "Helvetica-Bold", "Arial-BoldMT", "Arial,Bold", "Arial-Bold", ++ "Helvetica,Bold", NULL }, ++ { "Helvetica-Oblique", "Arial-ItalicMT", "Arial,Italic", "Arial-Italic", ++ "Helvetica,Italic", "Helvetica-Italic", NULL }, ++ { "Helvetica-BoldOblique", "Arial-BoldItalicMT", ++ "Arial,BoldItalic", 
"Arial-BoldItalic", ++ "Helvetica,BoldItalic", "Helvetica-BoldItalic", NULL }, ++ { "Times-Roman", "TimesNewRomanPSMT", "TimesNewRoman", ++ "TimesNewRomanPS", NULL }, ++ { "Times-Bold", "TimesNewRomanPS-BoldMT", "TimesNewRoman,Bold", ++ "TimesNewRomanPS-Bold", "TimesNewRoman-Bold", NULL }, ++ { "Times-Italic", "TimesNewRomanPS-ItalicMT", "TimesNewRoman,Italic", ++ "TimesNewRomanPS-Italic", "TimesNewRoman-Italic", NULL }, ++ { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT", ++ "TimesNewRoman,BoldItalic", "TimesNewRomanPS-BoldItalic", ++ "TimesNewRoman-BoldItalic", NULL }, ++ { "Symbol", "Symbol,Italic", "Symbol,Bold", "Symbol,BoldItalic", ++ "SymbolMT", "SymbolMT,Italic", "SymbolMT,Bold", "SymbolMT,BoldItalic", NULL }, ++ { "ZapfDingbats", NULL } ++}; ++ ++static pdf_fontlistMS fontlistMS = ++{ ++ NULL, ++ 0, ++ 0, ++}; ++static int strcmp_ignore_space(const char *a, const char *b); ++static const char *clean_font_name(const char *fontname); ++static const char *pdf_clean_base14_name(const char *fontname); ++ ++static inline USHORT BEtoHs(USHORT x) ++{ ++ BYTE *data = (BYTE *)&x; ++ return (data[0] << 8) | data[1]; ++} ++ ++static inline ULONG BEtoHl(ULONG x) ++{ ++ BYTE *data = (BYTE *)&x; ++ return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3]; ++} ++ ++static int strcmp_ignore_space(const char *a, const char *b) ++{ ++ while (1) ++ { ++ while (*a == ' ') ++ a++; ++ while (*b == ' ') ++ b++; ++ if (*a != *b) ++ return 1; ++ if (*a == 0) ++ return *a != *b; ++ if (*b == 0) ++ return *a != *b; ++ a++; ++ b++; ++ } ++} ++ ++/* A little bit more sophisticated name matching so that e.g. "EurostileExtended" ++ matches "EurostileExtended-Roman" or "Tahoma-Bold,Bold" matches "Tahoma-Bold" */ ++static int ++lookup_compare(const void *elem1, const void *elem2) ++{ ++ const char *val1 = elem1; ++ const char *val2 = elem2; ++ int len1 = strlen(val1); ++ int len2 = strlen(val2); ++ ++ if (len1 != len2) ++ { ++ const char *rest = len1 > len2 ? 
val1 + len2 : val2 + len1; ++ if (',' == *rest || !_stricmp(rest, "-roman")) ++ return _strnicmp(val1, val2, fz_mini(len1, len2)); ++ } ++ ++ return _stricmp(val1, val2); ++} ++ ++static void ++remove_spaces(char *srcDest) ++{ ++ char *dest; ++ ++ for (dest = srcDest; *srcDest; srcDest++) ++ if (*srcDest != ' ') ++ *dest++ = *srcDest; ++ *dest = '\0'; ++} ++ ++static int ++str_ends_with(const char *str, const char *end) ++{ ++ size_t len1 = strlen(str); ++ size_t len2 = strlen(end); ++ ++ return len1 >= len2 && !strcmp(str + len1 - len2, end); ++} ++ ++static pdf_fontmapMS * ++pdf_find_windows_font_path(const char *fontname) ++{ ++ return bsearch(fontname, fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), lookup_compare); ++} ++ ++/* source and dest can be same */ ++static void ++decode_unicode_BE(fz_context *ctx, char *source, int sourcelen, char *dest, int destlen) ++{ ++ WCHAR *tmp; ++ int converted, i; ++ ++ if (sourcelen % 2 != 0) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string"); ++ ++ tmp = fz_malloc_array(ctx, sourcelen / 2 + 1, sizeof(WCHAR)); ++ for (i = 0; i < sourcelen / 2; i++) ++ tmp[i] = BEtoHs(((WCHAR *)source)[i]); ++ tmp[sourcelen / 2] = '\0'; ++ ++ converted = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, dest, destlen, NULL, NULL); ++ fz_free(ctx, tmp); ++ if (!converted) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string"); ++} ++ ++static void ++decode_platform_string(fz_context *ctx, int platform, int enctype, char *source, int sourcelen, char *dest, int destlen) ++{ ++ switch (platform) ++ { ++ case TT_PLATFORM_APPLE_UNICODE: ++ switch (enctype) ++ { ++ case TT_APPLE_ID_DEFAULT: ++ case TT_APPLE_ID_UNICODE_2_0: ++ decode_unicode_BE(ctx, source, sourcelen, dest, destlen); ++ return; ++ } ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); ++ case TT_PLATFORM_MACINTOSH: ++ switch (enctype) ++ { ++ case TT_MAC_ID_ROMAN: ++ if (sourcelen + 1 > destlen) 
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : overlong fontname: %s", source); ++ // TODO: Convert to UTF-8 from what encoding? ++ memcpy(dest, source, sourcelen); ++ dest[sourcelen] = 0; ++ return; ++ } ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); ++ case TT_PLATFORM_MICROSOFT: ++ switch (enctype) ++ { ++ case TT_MS_ID_SYMBOL_CS: ++ case TT_MS_ID_UNICODE_CS: ++ case TT_MS_ID_UCS_4: ++ decode_unicode_BE(ctx, source, sourcelen, dest, destlen); ++ return; ++ } ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); ++ default: ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); ++ } ++} ++ ++static void ++grow_system_font_list(fz_context *ctx, pdf_fontlistMS *fl) ++{ ++ int newcap; ++ pdf_fontmapMS *newitems; ++ ++ if (fl->cap == 0) ++ newcap = 1024; ++ else ++ newcap = fl->cap * 2; ++ ++ // use realloc/free for the fontmap, since the list can ++ // remain in memory even with all fz_contexts destroyed ++ newitems = realloc(fl->fontmap, newcap * sizeof(pdf_fontmapMS)); ++ if (!newitems) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "OOM in grow_system_font_list"); ++ memset(newitems + fl->cap, 0, sizeof(pdf_fontmapMS) * (newcap - fl->cap)); ++ ++ fl->fontmap = newitems; ++ fl->cap = newcap; ++} ++ ++static void ++append_mapping(fz_context *ctx, pdf_fontlistMS *fl, const char *facename, const char *path, int index) ++{ ++ if (fl->len == fl->cap) ++ grow_system_font_list(ctx, fl); ++ ++ if (fl->len >= fl->cap) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : fontlist overflow"); ++ ++ fz_strlcpy(fl->fontmap[fl->len].fontface, facename, sizeof(fl->fontmap[0].fontface)); ++ fz_strlcpy(fl->fontmap[fl->len].fontpath, path, sizeof(fl->fontmap[0].fontpath)); ++ fl->fontmap[fl->len].index = index; ++ ++ ++fl->len; ++} ++ ++static void ++safe_read(fz_context *ctx, fz_stream *file, int offset, char *buf, int size) ++{ ++ int n; ++ fz_seek(ctx, 
file, offset, 0); ++ n = fz_read(ctx, file, (unsigned char *)buf, size); ++ if (n != size) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "safe_read: read %d, expected %d", n, size); ++} ++ ++static void ++read_ttf_string(fz_context *ctx, fz_stream *file, int offset, TT_NAME_RECORD *ttRecordBE, char *buf, int size) ++{ ++ char szTemp[MAX_FACENAME * 2]; ++ // ignore empty and overlong strings ++ int stringLength = BEtoHs(ttRecordBE->uStringLength); ++ if (stringLength == 0 || stringLength >= sizeof(szTemp)) ++ return; ++ ++ safe_read(ctx, file, offset + BEtoHs(ttRecordBE->uStringOffset), szTemp, stringLength); ++ decode_platform_string(ctx, BEtoHs(ttRecordBE->uPlatformID), ++ BEtoHs(ttRecordBE->uEncodingID), szTemp, stringLength, buf, size); ++} ++ ++static void ++makeFakePSName(char szName[MAX_FACENAME], const char *szStyle) ++{ ++ // append the font's subfamily, unless it's a Regular font ++ if (*szStyle && _stricmp(szStyle, "Regular") != 0) ++ { ++ fz_strlcat(szName, "-", MAX_FACENAME); ++ fz_strlcat(szName, szStyle, MAX_FACENAME); ++ } ++ remove_spaces(szName); ++} ++ ++static void ++parseTTF(fz_context *ctx, fz_stream *file, int offset, int index, const char *path) ++{ ++ TT_OFFSET_TABLE ttOffsetTableBE; ++ TT_TABLE_DIRECTORY tblDirBE; ++ TT_NAME_TABLE_HEADER ttNTHeaderBE; ++ TT_NAME_RECORD ttRecordBE; ++ ++ char szPSName[MAX_FACENAME] = { 0 }; ++ char szTTName[MAX_FACENAME] = { 0 }; ++ char szStyle[MAX_FACENAME] = { 0 }; ++ char szCJKName[MAX_FACENAME] = { 0 }; ++ int i, count, tblOffset; ++ ++ safe_read(ctx, file, offset, (char *)&ttOffsetTableBE, sizeof(TT_OFFSET_TABLE)); ++ ++ // check if this is a TrueType font of version 1.0 or an OpenType font ++ if (BEtoHl(ttOffsetTableBE.uVersion) != TTC_VERSION1 && ++ BEtoHl(ttOffsetTableBE.uVersion) != TTAG_OTTO) ++ { ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid font version %x", (unsigned int)BEtoHl(ttOffsetTableBE.uVersion)); ++ } ++ ++ // determine the name table's offset by iterating through the offset table ++ 
count = BEtoHs(ttOffsetTableBE.uNumOfTables); ++ for (i = 0; i < count; i++) ++ { ++ int entryOffset = offset + sizeof(TT_OFFSET_TABLE) + i * sizeof(TT_TABLE_DIRECTORY); ++ safe_read(ctx, file, entryOffset, (char *)&tblDirBE, sizeof(TT_TABLE_DIRECTORY)); ++ if (!BEtoHl(tblDirBE.uTag) || BEtoHl(tblDirBE.uTag) == TTAG_name) ++ break; ++ } ++ if (count == i || !BEtoHl(tblDirBE.uTag)) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : nameless font"); ++ tblOffset = BEtoHl(tblDirBE.uOffset); ++ ++ // read the 'name' table for record count and offsets ++ safe_read(ctx, file, tblOffset, (char *)&ttNTHeaderBE, sizeof(TT_NAME_TABLE_HEADER)); ++ offset = tblOffset + sizeof(TT_NAME_TABLE_HEADER); ++ tblOffset += BEtoHs(ttNTHeaderBE.uStorageOffset); ++ ++ // read through the strings for PostScript name and font family ++ count = BEtoHs(ttNTHeaderBE.uNRCount); ++ for (i = 0; i < count; i++) ++ { ++ short langId, nameId; ++ BOOL isCJKName; ++ ++ safe_read(ctx, file, offset + i * sizeof(TT_NAME_RECORD), (char *)&ttRecordBE, sizeof(TT_NAME_RECORD)); ++ ++ langId = BEtoHs(ttRecordBE.uLanguageID); ++ nameId = BEtoHs(ttRecordBE.uNameID); ++ isCJKName = TT_NAME_ID_FONT_FAMILY == nameId && LANG_CHINESE == PRIMARYLANGID(langId); ++ ++ // ignore non-English strings (except for Chinese font names) ++ if (langId && langId != TT_MS_LANGID_ENGLISH_UNITED_STATES && !isCJKName) ++ continue; ++ // ignore names other than font (sub)family and PostScript name ++ fz_try(ctx) ++ { ++ if (isCJKName) ++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szCJKName, sizeof(szCJKName)); ++ else if (TT_NAME_ID_FONT_FAMILY == nameId) ++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szTTName, sizeof(szTTName)); ++ else if (TT_NAME_ID_FONT_SUBFAMILY == nameId) ++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szStyle, sizeof(szStyle)); ++ else if (TT_NAME_ID_PS_NAME == nameId) ++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szPSName, sizeof(szPSName)); ++ } ++ fz_catch(ctx) ++ { ++ fz_warn(ctx, 
"ignoring face name decoding fonterror"); ++ } ++ } ++ ++ // try to prevent non-Arial fonts from accidentally substituting Arial ++ if (!strcmp(szPSName, "ArialMT")) ++ { ++ // cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2471 ++ if (strcmp(szTTName, "Arial") != 0) ++ szPSName[0] = '\0'; ++ // TODO: is there a better way to distinguish Arial Caps from Arial proper? ++ // cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1290 ++ else if (strstr(path, "caps") || strstr(path, "Caps")) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "ignore %s, as it can't be distinguished from Arial,Regular", path); ++ } ++ ++ if (szPSName[0]) ++ append_mapping(ctx, &fontlistMS, szPSName, path, index); ++ if (szTTName[0]) ++ { ++ // derive a PostScript-like name and add it, if it's different from the font's ++ // included PostScript name; cf. http://code.google.com/p/sumatrapdf/issues/detail?id=376 ++ makeFakePSName(szTTName, szStyle); ++ // compare the two names before adding this one ++ if (lookup_compare(szTTName, szPSName)) ++ append_mapping(ctx, &fontlistMS, szTTName, path, index); ++ } ++ if (szCJKName[0]) ++ { ++ makeFakePSName(szCJKName, szStyle); ++ if (lookup_compare(szCJKName, szPSName) && lookup_compare(szCJKName, szTTName)) ++ append_mapping(ctx, &fontlistMS, szCJKName, path, index); ++ } ++} ++ ++static void ++parseTTFs(fz_context *ctx, const char *path) ++{ ++ fz_stream *file = fz_open_file(ctx, path); ++ /* "fonterror : %s not found", path */ ++ fz_try(ctx) ++ { ++ parseTTF(ctx, file, 0, 0, path); ++ } ++ fz_always(ctx) ++ { ++ fz_drop_stream(ctx,file); ++ } ++ fz_catch(ctx) ++ { ++ fz_rethrow(ctx); ++ } ++} ++ ++static void ++parseTTCs(fz_context *ctx, const char *path) ++{ ++ FONT_COLLECTION fontcollectionBE; ++ ULONG i, numFonts, *offsettableBE = NULL; ++ ++ fz_stream *file = fz_open_file(ctx, path); ++ /* "fonterror : %s not found", path */ ++ ++ fz_var(offsettableBE); ++ ++ fz_try(ctx) ++ { ++ safe_read(ctx, file, 0, (char *)&fontcollectionBE, 
sizeof(FONT_COLLECTION)); ++ if (BEtoHl(fontcollectionBE.Tag) != TTAG_ttcf) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : wrong format %x", (unsigned int)BEtoHl(fontcollectionBE.Tag)); ++ if (BEtoHl(fontcollectionBE.Version) != TTC_VERSION1 && ++ BEtoHl(fontcollectionBE.Version) != TTC_VERSION2) ++ { ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid version %x", (unsigned int)BEtoHl(fontcollectionBE.Version)); ++ } ++ ++ numFonts = BEtoHl(fontcollectionBE.NumFonts); ++ offsettableBE = fz_malloc_array(ctx, numFonts, sizeof(ULONG)); ++ ++ safe_read(ctx, file, sizeof(FONT_COLLECTION), (char *)offsettableBE, numFonts * sizeof(ULONG)); ++ for (i = 0; i < numFonts; i++) ++ parseTTF(ctx, file, BEtoHl(offsettableBE[i]), i, path); ++ } ++ fz_always(ctx) ++ { ++ fz_free(ctx, offsettableBE); ++ fz_drop_stream(ctx,file); ++ } ++ fz_catch(ctx) ++ { ++ fz_rethrow(ctx); ++ } ++} ++ ++static void ++extend_system_font_list(fz_context *ctx, const WCHAR *path) ++{ ++ WCHAR szPath[MAX_PATH], *lpFileName; ++ WIN32_FIND_DATA FileData; ++ HANDLE hList; ++ ++ GetFullPathName(path, nelem(szPath), szPath, &lpFileName); ++ ++ hList = FindFirstFile(szPath, &FileData); ++ if (hList == INVALID_HANDLE_VALUE) ++ { ++ // Don't complain about missing directories ++ if (GetLastError() == ERROR_FILE_NOT_FOUND) ++ return; ++ fz_throw(ctx, FZ_ERROR_GENERIC, "extend_system_font_list: unknown error %d", (int)GetLastError()); ++ } ++ do ++ { ++ if (!(FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) ++ { ++ char szPathUtf8[MAX_PATH], *fileExt; ++ int res; ++ lstrcpyn(lpFileName, FileData.cFileName, szPath + MAX_PATH - lpFileName); ++ res = WideCharToMultiByte(CP_UTF8, 0, szPath, -1, szPathUtf8, sizeof(szPathUtf8), NULL, NULL); ++ if (!res) ++ { ++ fz_warn(ctx, "WideCharToMultiByte failed for %S", szPath); ++ continue; ++ } ++ fileExt = szPathUtf8 + strlen(szPathUtf8) - 4; ++ fz_try(ctx) ++ { ++ if (!_stricmp(fileExt, ".ttc")) ++ parseTTCs(ctx, szPathUtf8); ++ else if (!_stricmp(fileExt, 
".ttf") || !_stricmp(fileExt, ".otf")) ++ parseTTFs(ctx, szPathUtf8); ++ } ++ fz_catch(ctx) ++ { ++ // ignore errors occurring while parsing a given font file ++ } ++ } ++ } while (FindNextFile(hList, &FileData)); ++ FindClose(hList); ++} ++ ++static void ++destroy_system_font_list(void) ++{ ++ free(fontlistMS.fontmap); ++ memset(&fontlistMS, 0, sizeof(fontlistMS)); ++} ++ ++static void ++create_system_font_list(fz_context *ctx) ++{ ++ WCHAR szFontDir[MAX_PATH]; ++ UINT cch; ++ ++ cch = GetWindowsDirectory(szFontDir, nelem(szFontDir) - 12); ++ if (0 < cch && cch < nelem(szFontDir) - 12) ++ { ++ /* willus.com edit--Win XP default MSVCRT.DLL doesn't have wcscat_s */ ++#ifdef _WIN64 ++ wcscat_s(szFontDir, MAX_PATH, L"\\Fonts\\*.?t?"); ++#else ++ wcscat(szFontDir,L"\\Fonts\\*.?t?"); ++#endif ++ extend_system_font_list(ctx, szFontDir); ++ } ++ ++ if (fontlistMS.len == 0) ++ fz_warn(ctx, "couldn't find any usable system fonts"); ++ ++#ifdef NOCJKFONT ++ { ++ // If no CJK fallback font is builtin but one has been shipped separately (in the same ++ // directory as the main executable), add it to the list of loadable system fonts ++ WCHAR szFile[MAX_PATH], *lpFileName; ++ GetModuleFileName(0, szFontDir, MAX_PATH); ++ GetFullPathName(szFontDir, MAX_PATH, szFile, &lpFileName); ++ lstrcpyn(lpFileName, L"DroidSansFallback.ttf", szFile + MAX_PATH - lpFileName); ++ extend_system_font_list(ctx, szFile); ++ } ++#endif ++ ++ // sort the font list, so that it can be searched binarily ++ qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp); ++ ++#ifdef DEBUG ++ // allow to overwrite system fonts for debugging purposes ++ // (either pass a full path or a search pattern such as "fonts\*.ttf") ++ cch = GetEnvironmentVariable(L"MUPDF_FONTS_PATTERN", szFontDir, nelem(szFontDir)); ++ if (0 < cch && cch < nelem(szFontDir)) ++ { ++ int i, prev_len = fontlistMS.len; ++ extend_system_font_list(ctx, szFontDir); ++ for (i = prev_len; i < fontlistMS.len; i++) ++ { ++ 
pdf_fontmapMS *entry = bsearch(fontlistMS.fontmap[i].fontface, fontlistMS.fontmap, prev_len, sizeof(pdf_fontmapMS), lookup_compare); ++ if (entry) ++ *entry = fontlistMS.fontmap[i]; ++ } ++ qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp); ++ } ++#endif ++ ++ // make sure to clean up after ourselves ++ atexit(destroy_system_font_list); ++} ++ ++static fz_font * ++pdf_load_windows_font_by_name(fz_context *ctx, const char *orig_name) ++{ ++ pdf_fontmapMS *found = NULL; ++ char *comma, *fontname; ++ fz_font *font; ++ ++ /* WILLUS MOD--not multi-threaded for k2pdfopt */ ++ /* fz_synchronize_begin(); */ ++ if (fontlistMS.len == 0) ++ { ++ fz_try(ctx) ++ { ++ create_system_font_list(ctx); ++ } ++ fz_catch(ctx) { } ++ } ++ /* WILLUS MOD--not multi-threaded for k2pdfopt */ ++ /* fz_synchronize_end(); */ ++ if (fontlistMS.len == 0) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror: couldn't find any fonts"); ++ ++ // work on a normalized copy of the font name ++ fontname = fz_strdup(ctx, orig_name); ++ remove_spaces(fontname); ++ ++ // first, try to find the exact font name (including appended style information) ++ comma = strchr(fontname, ','); ++ if (comma) ++ { ++ *comma = '-'; ++ found = pdf_find_windows_font_path(fontname); ++ *comma = ','; ++ } ++ // second, substitute the font name with a known PostScript name ++ else ++ { ++ int i; ++ for (i = 0; i < nelem(baseSubstitutes) && !found; i++) ++ if (!strcmp(fontname, baseSubstitutes[i].name)) ++ found = pdf_find_windows_font_path(baseSubstitutes[i].pattern); ++ } ++ // third, search for the font name without additional style information ++ if (!found) ++ found = pdf_find_windows_font_path(fontname); ++ // fourth, try to separate style from basename for prestyled fonts (e.g. "ArialBold") ++ if (!found && !comma && (str_ends_with(fontname, "Bold") || str_ends_with(fontname, "Italic"))) ++ { ++ int styleLen = str_ends_with(fontname, "Bold") ? 4 : str_ends_with(fontname, "BoldItalic") ? 
10 : 6; ++ fontname = fz_resize_array(ctx, fontname, strlen(fontname) + 2, sizeof(char)); ++ comma = fontname + strlen(fontname) - styleLen; ++ memmove(comma + 1, comma, styleLen + 1); ++ *comma = '-'; ++ found = pdf_find_windows_font_path(fontname); ++ *comma = ','; ++ if (!found) ++ found = pdf_find_windows_font_path(fontname); ++ } ++ // fifth, try to convert the font name from the common Chinese codepage 936 ++ if (!found && fontname[0] < 0) ++ { ++ WCHAR cjkNameW[MAX_FACENAME]; ++ char cjkName[MAX_FACENAME]; ++ if (MultiByteToWideChar(936, MB_ERR_INVALID_CHARS, fontname, -1, cjkNameW, nelem(cjkNameW)) && ++ WideCharToMultiByte(CP_UTF8, 0, cjkNameW, -1, cjkName, nelem(cjkName), NULL, NULL)) ++ { ++ comma = strchr(cjkName, ','); ++ if (comma) ++ { ++ *comma = '-'; ++ found = pdf_find_windows_font_path(cjkName); ++ *comma = ','; ++ } ++ if (!found) ++ found = pdf_find_windows_font_path(cjkName); ++ } ++ } ++ ++ fz_free(ctx, fontname); ++ if (!found) ++ fz_throw(ctx, FZ_ERROR_GENERIC, "couldn't find system font '%s'", orig_name); ++ ++ /* ++ fz_warn(ctx, "loading non-embedded font '%s' from '%s'", orig_name, found->fontpath); ++ */ ++ ++ font = fz_new_font_from_file(ctx, orig_name, found->fontpath, found->index, ++ strcmp(found->fontface, "DroidSansFallback") != 0); ++ /* willus mod for MuPDF v1.10, 10-21-2016 */ ++ { ++ fz_font_flags_t *flags; ++ flags=fz_font_flags(font); ++ if (flags!=NULL) ++ flags->ft_substitute = 1; ++ } ++ return font; ++} ++ ++static fz_font * ++pdf_load_windows_font(fz_context *ctx, const char *fontname, int bold, int italic, int needs_exact_metrics) ++{ ++ if (needs_exact_metrics) ++ { ++ const char *clean_name; ++ /* WILLUS: Declare pdf_clean_base14_name() */ ++ extern const char *pdf_clean_base14_name(const char *fontname); ++ ++ /* TODO: the metrics for Times-Roman and Courier don't match ++ those of Windows' Times New Roman and Courier New; for ++ some reason, Poppler doesn't seem to have this problem */ ++ int len; ++ if 
(fz_lookup_builtin_font(ctx,fontname, bold, italic, &len)) ++ return NULL; ++ ++ /* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=2173 */ ++ clean_name = pdf_clean_base14_name(fontname); ++ if (clean_name != fontname && !strncmp(clean_name, "Times-", 6)) ++ return NULL; ++ } ++ ++ // TODO: unset font->ft_substitute for base14/needs_exact_metrics? ++ return pdf_load_windows_font_by_name(ctx, fontname); ++} ++ ++static const char *clean_font_name(const char *fontname) ++{ ++ int i, k; ++ for (i = 0; i < nelem(base_font_names); i++) ++ for (k = 0; base_font_names[i][k]; k++) ++ if (!strcmp_ignore_space(base_font_names[i][k], fontname)) ++ return base_font_names[i][0]; ++ return fontname; ++} ++ ++ ++/* SumatraPDF: expose clean_font_name */ ++static const char * pdf_clean_base14_name(const char *fontname) ++{ ++ return clean_font_name(fontname); ++} ++ ++static fz_font * ++pdf_load_windows_cjk_font(fz_context *ctx, const char *fontname, int ros, int serif) ++{ ++ fz_font *font; ++ ++ font=NULL; /* WILLUS: Avoid compiler warning */ ++ /* try to find a matching system font before falling back to an approximate one */ ++ fz_try(ctx) ++ { ++ font = pdf_load_windows_font_by_name(ctx, fontname); ++ } ++ fz_catch(ctx) ++ { ++ font = NULL; ++ } ++ if (font) ++ return font; ++ ++ /* try to fall back to a reasonable system font */ ++ fz_try(ctx) ++ { ++ if (serif) ++ { ++ switch (ros) ++ { ++ case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break; ++ case FZ_ADOBE_GB: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break; ++ case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break; ++ case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Batang"); break; ++ default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid serif ros"); ++ } ++ } ++ else ++ { ++ switch (ros) ++ { ++ case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break; ++ case FZ_ADOBE_GB: ++ fz_try(ctx) ++ { ++ font 
= pdf_load_windows_font_by_name(ctx, "KaiTi"); ++ } ++ fz_catch(ctx) ++ { ++ font = pdf_load_windows_font_by_name(ctx, "KaiTi_GB2312"); ++ } ++ break; ++ case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break; ++ case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break; ++ default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid sans-serif ros"); ++ } ++ } ++ } ++ fz_catch(ctx) ++ { ++#ifdef NOCJKFONT ++ /* If no CJK fallback font is builtin, maybe one has been shipped separately */ ++ font = pdf_load_windows_font_by_name(ctx, "DroidSansFallback"); ++#else ++ fz_rethrow(ctx); ++#endif ++ } ++ ++ return font; ++} ++ ++#endif ++ ++void pdf_install_load_system_font_funcs(fz_context *ctx) ++{ ++#ifdef _WIN32 ++ fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font, NULL); ++#endif ++} +diff --git a/source/fitz/font.c b/source/fitz/font.c +index 733d91dae..69c46d968 100644 +--- a/source/fitz/font.c ++++ b/source/fitz/font.c +@@ -5,8 +5,11 @@ + #include "draw-imp.h" + + #include ++/* willus mod -- remove hb includes */ ++/* + #include "hb.h" + #include "hb-ft.h" ++*/ + + #include + +diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c +index 0ba944d44..3c05c51ac 100644 +--- a/source/fitz/stext-device.c ++++ b/source/fitz/stext-device.c +@@ -692,6 +692,11 @@ fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options + dev->trm = fz_identity; + dev->lastchar = ' '; + dev->curdir = 1; ++ /* willus mod -- seems like this should be here, but not sure. 
*/ ++ if (opts) ++ dev->flags = opts->flags; ++ else ++ dev->flags = 0; + + return (fz_device*)dev; + } +diff --git a/source/fitz/string.c b/source/fitz/string.c +index e70ae6e6e..b310463f4 100644 +--- a/source/fitz/string.c ++++ b/source/fitz/string.c +@@ -448,6 +448,10 @@ fz_utflen(const char *s) + + float fz_atof(const char *s) + { ++/* willus mod: atof(s), #if-#else-#endif */ ++#if (!defined(__SSE__)) ++ return(atof(s)); ++#else + float result; + + errno = 0; +@@ -457,6 +461,7 @@ float fz_atof(const char *s) + return 1; + result = fz_clamp(result, -FLT_MAX, FLT_MAX); + return result; ++#endif + } + + int fz_atoi(const char *s) +diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c +index 68de8898a..5d43485bd 100644 +--- a/source/pdf/pdf-annot.c ++++ b/source/pdf/pdf-annot.c +@@ -4,8 +4,20 @@ + #include + #include + ++/* willus mod--don't use _mkgmtime--not available in Win XP */ + #ifdef _WIN32 +-#define timegm _mkgmtime ++static time_t timegm(struct tm *date); ++static time_t timegm(struct tm *date) ++ ++ { ++ time_t t,z; ++ struct tm gmz; ++ ++ z=(time_t)0; ++ gmz=(*gmtime(&z)); ++ t=mktime(date)-mktime(&gmz); ++ return(t); ++ } + #endif + + #define TEXT_ANNOT_SIZE (25.0f) +diff --git a/source/pdf/pdf-link.c b/source/pdf/pdf-link.c +index ae5beaa35..b5a52a000 100644 +--- a/source/pdf/pdf-link.c ++++ b/source/pdf/pdf-link.c +@@ -351,6 +351,9 @@ pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp, + } + return page; + } ++/* willus mod -- be quiet */ ++/* + fz_warn(ctx, "unknown link uri '%s'", uri); ++*/ + return -1; + } +diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c +index 501c5626a..927ba6cd5 100644 +--- a/source/pdf/pdf-parse.c ++++ b/source/pdf/pdf-parse.c +@@ -586,9 +586,14 @@ pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, + if (c == '\r') + { + c = fz_peek_byte(ctx, file); ++/* willus mod -- no warning */ ++/* + if (c != '\n') + fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", 
num, gen); + else ++*/ ++if (c=='\n') ++/* willus mod -- end */ + fz_read_byte(ctx, file); + } + stm_ofs = fz_tell(ctx, file); +diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c +index 2475b6e86..bc163563a 100644 +--- a/source/pdf/pdf-xref.c ++++ b/source/pdf/pdf-xref.c +@@ -707,8 +707,11 @@ pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *b + if (!s) + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing"); + len = fz_atoi(fz_strsep(&s, " ")); ++/* willus mod -- no warning */ ++/* + if (len < 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive"); ++*/ + + /* broken pdfs where the section is not on a separate line */ + if (s && *s != '\0') +@@ -1372,7 +1375,10 @@ pdf_init_document(fz_context *ctx, pdf_document *doc) + { + pdf_drop_xref_sections(ctx, doc); + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); ++/* willus mod -- be quiet */ ++/* + fz_warn(ctx, "trying to repair broken xref"); ++*/ + repaired = 1; + } + +@@ -1496,7 +1502,10 @@ pdf_drop_document_imp(fz_context *ctx, pdf_document *doc) + /* Swallow error, but continue dropping */ + } + ++/* willus mod -- no pdf_drop_js */ ++/* + pdf_drop_js(ctx, doc->js); ++*/ + + pdf_drop_xref_sections(ctx, doc); + fz_free(ctx, doc->xref_index); +-- +2.22.0 + diff --git a/pkgs/applications/misc/k2pdfopt/tesseract.patch b/pkgs/applications/misc/k2pdfopt/tesseract.patch index b882f5b949c3..adfee9ae282f 100644 --- a/pkgs/applications/misc/k2pdfopt/tesseract.patch +++ b/pkgs/applications/misc/k2pdfopt/tesseract.patch @@ -1,13 +1,675 @@ +From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001 +From: Daniel Fullmer +Date: Fri, 13 Sep 2019 13:45:05 -0400 +Subject: [PATCH] Willus mod changes from k2pdfopt + +--- + src/api/Makefile.am | 1 + + src/api/baseapi.cpp | 87 +++++++++++ + src/api/baseapi.h | 3 + + src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++ + src/api/tesseract.h | 29 ++++ + src/ccmain/tessedit.cpp | 5 +- + 
src/ccutil/ccutil.h | 7 + + src/ccutil/genericvector.h | 21 ++- + src/ccutil/mainblk.cpp | 17 +- + src/ccutil/params.cpp | 3 +- + src/ccutil/serialis.cpp | 3 + + src/ccutil/serialis.h | 2 + + src/lstm/input.cpp | 3 + + 13 files changed, 488 insertions(+), 4 deletions(-) + create mode 100644 src/api/tesscapi.cpp + create mode 100644 src/api/tesseract.h + diff --git a/src/api/Makefile.am b/src/api/Makefile.am -index d8c1e54..46ead13 100644 +index d9b76eb6..cd2dc30f 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am -@@ -42,7 +42,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS) - if VISIBILITY - libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS - endif --libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp -+libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp tesscapi.cpp +@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp + libtesseract_api_la_SOURCES += pdfrenderer.cpp + libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp + libtesseract_api_la_SOURCES += renderer.cpp ++libtesseract_api_la_SOURCES += tesscapi.cpp lib_LTLIBRARIES += libtesseract.la - libtesseract_la_LDFLAGS = + libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS) +diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp +index 9245d07c..ea964ee6 100644 +--- a/src/api/baseapi.cpp ++++ b/src/api/baseapi.cpp +@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI() + // Use the current locale if building debug code. 
+ std::locale::global(std::locale("")); + #endif ++ const char *locale; ++ locale = std::setlocale(LC_ALL, nullptr); ++/* willus mod Remove assertions--taken care of in tesscapi.cpp */ ++// ASSERT_HOST(!strcmp(locale, "C")); ++ locale = std::setlocale(LC_CTYPE, nullptr); ++// ASSERT_HOST(!strcmp(locale, "C")); ++ locale = std::setlocale(LC_NUMERIC, nullptr); ++// ASSERT_HOST(!strcmp(locale, "C")); + } + + TessBaseAPI::~TessBaseAPI() { +@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level, + text->add_str_int("\t", bottom - top); + } + ++/* willus mod */ ++int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0, ++ char **utf8words) ++ ++ { ++ int iword,nwords,totlen,it8; ++ int *x0,*y0,*x1,*y1,*ybaseline; ++ char *tutf8; ++ ++ ResultIterator *res_it = GetIterator(); ++ /* Count words */ ++ iword=0; ++ totlen=0; ++ while (!res_it->Empty(RIL_BLOCK)) ++ { ++ if (res_it->Empty(RIL_WORD)) ++ { ++ res_it->Next(RIL_WORD); ++ continue; ++ } ++ iword++; ++ STRING textstr=std::unique_ptr(res_it->GetUTF8Text(RIL_WORD)).get(); ++ totlen+=strlen(textstr.string())+1; ++ res_it->Next(RIL_WORD); ++ } ++ nwords=iword; ++/* ++printf("\nnwords=%d, totlen=%d\n",nwords,totlen); ++*/ ++ x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords); ++ y0=(*y00)=&x0[nwords]; ++ x1=(*x11)=&y0[nwords]; ++ y1=(*y11)=&x1[nwords]; ++ ybaseline=(*ybaseline0)=&y1[nwords]; ++ tutf8=(*utf8words)=(char *)malloc(totlen); ++ iword=0; ++ it8=0; ++ res_it->Begin(); ++ while (!res_it->Empty(RIL_BLOCK)) ++ { ++ if (res_it->Empty(RIL_WORD)) ++ { ++ res_it->Next(RIL_WORD); ++ continue; ++ } ++ STRING textstr=std::unique_ptr(res_it->GetUTF8Text(RIL_WORD)).get(); ++ strcpy(&tutf8[it8],textstr.string()); ++ it8 += strlen(&tutf8[it8])+1; ++ /* ++ STRING textstr(""); ++ textstr += std::unique_ptr(res_it->GetUTF8Text(RIL_WORD)).get(); ++ */ ++/* ++printf("Word %d: '%s'\n",iword,textstr.string()); ++*/ ++ int left, top, right, bottom; ++ int u1,v1,u2,v2; ++ 
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); ++ res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2); ++ x0[iword]=left; ++ x1[iword]=right; ++ y0[iword]=top; ++ y1[iword]=bottom; ++ ybaseline[iword]=(v1+v2)/2; ++ iword++; ++/* ++printf("BB: (%d,%d)-(%d,%d) BL: (%d,%d)-(%d,%d)\n",left,bottom,right,top,x1,y1,x2,y2); ++*/ ++ res_it->Next(RIL_WORD); ++ } ++/* ++printf("iword=%d\n",iword); ++*/ ++ return(iword); ++ } ++ ++/* willus mod */ ++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words); ++ + /** + * Make a TSV-formatted string from the internal data structures. + * page_number is 0-based but will appear in the output as 1-based. +diff --git a/src/api/baseapi.h b/src/api/baseapi.h +index 3724dd92..23be5920 100644 +--- a/src/api/baseapi.h ++++ b/src/api/baseapi.h +@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI { + */ + char* GetHOCRText(ETEXT_DESC* monitor, int page_number); + ++/* willus mod */ ++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words); ++ + /** + * Make a HTML-formatted string with hOCR markup from the internal + * data structures. +diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp +new file mode 100644 +index 00000000..1752fafe +--- /dev/null ++++ b/src/api/tesscapi.cpp +@@ -0,0 +1,311 @@ ++/* ++** tesscapi.cpp willus.com attempt at C wrapper for tesseract. ++** (Butchered from tesseractmain.cpp) ++** Last updated 9-1-12 ++** ++** Copyright (C) 2012 http://willus.com ++** ++** This program is free software: you can redistribute it and/or modify ++** it under the terms of the GNU Affero General Public License as ++** published by the Free Software Foundation, either version 3 of the ++** License, or (at your option) any later version. ++** ++** This program is distributed in the hope that it will be useful, ++** but WITHOUT ANY WARRANTY; without even the implied warranty of ++** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++** GNU Affero General Public License for more details. ++** ++** You should have received a copy of the GNU Affero General Public License ++** along with this program. If not, see <http://www.gnu.org/licenses/>. ++** ++*/ ++ ++/* ++#include "mfcpch.h" ++*/ ++// #define USE_VLD //Uncomment for Visual Leak Detector. ++#if (defined _MSC_VER && defined USE_VLD) ++#include ++#endif ++ ++// Include automatically generated configuration file if running autoconf ++#ifdef HAVE_CONFIG_H ++#include "config_auto.h" ++#endif ++#include ++#ifdef USING_GETTEXT ++#include ++#define _(x) gettext(x) ++#else ++#define _(x) (x) ++#endif ++ ++#include "allheaders.h" ++#include "baseapi.h" ++#include "strngs.h" ++#include "params.h" ++#include "blobs.h" ++#include "simddetect.h" ++#include "tesseractclass.h" ++/* ++#include "notdll.h" ++*/ ++ ++/* C Wrappers */ ++#include "tesseract.h" ++ ++// static tesseract::TessBaseAPI api[4]; ++ ++/* ++** ocr_type=0: OEM_DEFAULT ++** ocr_type=1: OEM_TESSERACT_ONLY ++** ocr_type=2: OEM_LSTM_ONLY ++** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED ++*/ ++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, ++ char *initstr,int maxlen,int *status) ++ ++ { ++ char original_locale[256]; ++ tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI; ++/* ++printf("@tess_capi_init\n"); ++printf(" datapath='%s'\n",datapath); ++printf(" language='%s'\n",language); ++printf(" ocr_type=%d\n",ocr_type); ++*/ ++#ifdef USE_NLS ++ setlocale (LC_ALL, ""); ++ bindtextdomain (PACKAGE, LOCALEDIR); ++ textdomain (PACKAGE); ++#endif ++ /* willus mod, 11-24-16 */ ++ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. 
*/ ++/* ++printf("locale='%s'\n",setlocale(LC_ALL,NULL)); ++printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL)); ++printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); ++*/ ++ strncpy(original_locale,setlocale(LC_ALL,NULL),255); ++ original_locale[255]='\0'; ++/* ++printf("original_locale='%s'\n",original_locale); ++*/ ++ setlocale(LC_ALL,"C"); ++/* ++printf("new locale='%s'\n",setlocale(LC_ALL,NULL)); ++printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL)); ++printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); ++*/ ++ // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); ++ // Make the order of args a bit more forgiving than it used to be. ++ const char* lang = "eng"; ++ tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK; ++ if (language!=NULL && language[0]!='\0') ++ lang = language; ++ /* ++ if (output == NULL) ++ { ++ fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] " ++ "[-psm pagesegmode] [configfile...]\n"), argv[0]); ++ fprintf(stderr, ++ _("pagesegmode values are:\n" ++ "0 = Orientation and script detection (OSD) only.\n" ++ "1 = Automatic page segmentation with OSD.\n" ++ "2 = Automatic page segmentation, but no OSD, or OCR\n" ++ "3 = Fully automatic page segmentation, but no OSD. (Default)\n" ++ "4 = Assume a single column of text of variable sizes.\n" ++ "5 = Assume a single uniform block of vertically aligned text.\n" ++ "6 = Assume a single uniform block of text.\n" ++ "7 = Treat the image as a single text line.\n" ++ "8 = Treat the image as a single word.\n" ++ "9 = Treat the image as a single word in a circle.\n" ++ "10 = Treat the image as a single character.\n")); ++ fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any" ++ "configfile.\n")); ++ exit(1); ++ } ++ */ ++/* ++printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE"); ++printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? 
"AVAILABLE" : "NOT AVAILABLE"); ++*/ ++/* ++v4.00 loads either TESSERACT enginer, LSTM engine, or both. No CUBE. ++*/ ++ ocr_type=0; /* Ignore specified and use default */ ++ api->SetOutputName(NULL); ++ (*status)=api->Init(datapath,lang, ++ ocr_type==0 ? tesseract::OEM_DEFAULT : ++ (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY : ++ (ocr_type==2 ? tesseract::OEM_LSTM_ONLY : ++ (tesseract::OEM_TESSERACT_LSTM_COMBINED)))); ++ if ((*status)!=0) ++ { ++ /* willus mod, 11-24-16 */ ++ setlocale(LC_ALL,original_locale); ++ api->End(); ++ delete api; ++ return(NULL); ++ } ++ /* ++ api.Init("tesscapi",lang,tesseract::OEM_DEFAULT, ++ &(argv[arg]), argc - arg, NULL, NULL, false); ++ */ ++ // We have 2 possible sources of pagesegmode: a config file and ++ // the command line. For backwards compatability reasons, the ++ // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the ++ // default for this program is tesseract::PSM_AUTO. We will let ++ // the config file take priority, so the command-line default ++ // can take priority over the tesseract default, so we use the ++ // value from the command line only if the retrieved mode ++ // is still tesseract::PSM_SINGLE_BLOCK, indicating no change ++ // in any config file. Therefore the only way to force ++ // tesseract::PSM_SINGLE_BLOCK is from the command line. ++ // It would be simpler if we could set the value before Init, ++ // but that doesn't work. 
++ if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) ++ api->SetPageSegMode(pagesegmode); ++ ++ /* ++ ** Initialization message ++ */ ++ { ++ char istr[1024]; ++ int sse,avx; ++ ++// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode); ++ sprintf(istr,"%s",api->Version()); ++ sse=tesseract::SIMDDetect::IsSSEAvailable(); ++ avx=tesseract::SIMDDetect::IsAVXAvailable(); ++ if (sse || avx) ++ sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX")); ++ sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath); ++ strcat(istr,"\n Tesseract languages: "); ++ GenericVector languages; ++ api->GetLoadedLanguagesAsVector(&languages); ++/* ++printf("OEM=%d\n",api->oem()); ++printf("Langs='%s'\n",api->GetInitLanguagesAsString()); ++printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang()); ++printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang()); ++printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs()); ++printf("languages.size()=%d\n",(int)languages.size()); ++*/ ++ ++ for (int i=0;i<=api->tesseract()->num_sub_langs();i++) ++ { ++ tesseract::Tesseract *lang1; ++ int eng; ++ lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1); ++ eng=(int)lang1->tessedit_ocr_engine_mode; ++ sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(), ++ eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess")); ++ } ++/* ++printf("%d. 
'%s'\n",i+1,languages[i].string()); ++printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode); ++*/ ++ ++ /* ++ if (ocr_type==0 || ocr_type==3) ++ sprintf(&istr[strlen(istr)],"[LSTM+] (lang="); ++ else if (ocr_type==2) ++ sprintf(&istr[strlen(istr)],"[LSTM] (lang="); ++ strncpy(&istr[strlen(istr)],language,253-strlen(istr)); ++ istr[253]='\0'; ++ strcat(istr,")"); ++ */ ++ if (out!=NULL) ++ fprintf(out,"%s\n",istr); ++ if (initstr!=NULL) ++ { ++ strncpy(initstr,istr,maxlen-1); ++ initstr[maxlen-1]='\0'; ++ } ++ } ++ ++ ++ /* Turn off LSTM debugging output */ ++ api->SetVariable("lstm_debug_level","0"); ++#if (WILLUSDEBUG & 1) ++ api->SetVariable("lstm_debug_level","9"); ++ api->SetVariable("paragraph_debug_level","9"); ++ api->SetVariable("tessdata_manager_debug_level","9"); ++ api->SetVariable("tosp_debug_level","9"); ++ api->SetVariable("wordrec_debug_level","9"); ++ api->SetVariable("segsearch_debug_level","9"); ++#endif ++ /* willus mod, 11-24-16 */ ++ setlocale(LC_ALL,original_locale); ++ return((void *)api); ++ } ++ ++ ++int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out) ++ ++ { ++ tesseract::TessBaseAPI *api; ++ static int old_segmode=-1; ++ ++ api=(tesseract::TessBaseAPI *)vapi; ++ if (old_segmode != segmode) ++ { ++ old_segmode=segmode; ++ api->SetPageSegMode((tesseract::PageSegMode)segmode); ++ } ++ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) ++ { ++ /* pixDestroy(&pix); */ ++ if (out!=NULL) ++ fprintf(out,"tesscapi: Error during bitmap processing.\n"); ++ api->Clear(); ++ return(-1); ++ } ++ strncpy(outstr,api->GetUTF8Text(),maxlen-1); ++ outstr[maxlen-1]='\0'; ++ api->Clear(); ++ return(0); ++ } ++ ++ ++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode, ++ int **left,int **top,int **right,int **bottom, ++ int **ybase,char **text,int *nw, ++ FILE *out) ++ ++ { ++ tesseract::TessBaseAPI *api; ++ static int old_segmode=-1; ++ ++ 
api=(tesseract::TessBaseAPI *)vapi; ++ if (old_segmode != segmode) ++ { ++ old_segmode=segmode; ++ api->SetPageSegMode((tesseract::PageSegMode)segmode); ++ } ++ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) ++ { ++ if (out!=NULL) ++ fprintf(out,"tesscapi: Error during bitmap processing.\n"); ++ api->Clear(); ++ (*nw)=0; ++ return(-1); ++ } ++ (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text); ++ api->Clear(); ++ return(0); ++ } ++ ++ ++void tess_capi_end(void *vapi) ++ ++ { ++ tesseract::TessBaseAPI *api; ++ ++ if (vapi==NULL) ++ return; ++ api=(tesseract::TessBaseAPI *)vapi; ++ api->End(); ++ delete api; ++ } +diff --git a/src/api/tesseract.h b/src/api/tesseract.h +new file mode 100644 +index 00000000..575948cc +--- /dev/null ++++ b/src/api/tesseract.h +@@ -0,0 +1,29 @@ ++/* ++** Willus.com's Tesseract C Wrappers ++** ++** 6-8-12 ++** ++*/ ++ ++#ifndef _TESSERACT_H_ ++#define _TESSERACT_H_ ++ ++//#include ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, ++ char *initstr,int maxlen,int *status); ++int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out); ++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode, ++ int **left,int **top,int **right,int **bottom, ++ int **ybase,char **text,int *nw, ++ FILE *out); ++void tess_capi_end(void *api); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp +index 17f0951b..7af94ee2 100644 +--- a/src/ccmain/tessedit.cpp ++++ b/src/ccmain/tessedit.cpp +@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data( + " to your \"tessdata\" directory.\n"); + return false; + } ++ /* willus mod */ ++ TFile fp; ++ strncpy(fp.tfile_filename,tessdata_path.string(),511); ++ fp.tfile_filename[511]='\0'; + #ifndef DISABLED_LEGACY_ENGINE + if (oem == OEM_DEFAULT) { + // Set the engine mode from availability, which can then be overridden by +@@ -116,7 +120,6 
@@ bool Tesseract::init_tesseract_lang_data( + #endif // ndef DISABLED_LEGACY_ENGINE + + // If a language specific config file (lang.config) exists, load it in. +- TFile fp; + if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { + ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, + this->params()); +diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h +index 71e89c60..bdeccc14 100644 +--- a/src/ccutil/ccutil.h ++++ b/src/ccutil/ccutil.h +@@ -80,6 +80,13 @@ class CCUtil { + // Member parameters. + // These have to be declared and initialized after params_ member, since + // params_ should be initialized before parameters are added to it. ++/* willus mod */ ++/* ++ #ifdef _WIN32 ++ STRING_VAR_H(tessedit_module_name, WINDLLNAME, ++ "Module colocated with tessdata dir"); ++ #endif ++*/ + INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities"); + BOOL_VAR_H(use_definite_ambigs_for_classifier, false, + "Use definite ambiguities when running character classifier"); +diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h +index 3556d153..3a5e8662 100644 +--- a/src/ccutil/genericvector.h ++++ b/src/ccutil/genericvector.h +@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) { + // reserve an extra byte in case caller wants to append a '\0' character + data->reserve(size + 1); + data->resize_no_init(size); +- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size; ++ /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */ ++ /* Can't read entire file at once -- read it in blocks of at most blocksize (1 MiB) bytes instead */ ++ { ++ int frs,n; ++ int blocksize; ++ blocksize=1024*1024; ++ for (n=0;1;) ++ { ++ int bs; ++ bs= size-n > blocksize ?
blocksize : size-n; ++ frs=(int)fread(&(*data)[n],1,bs,fp); ++ n+=frs; ++ if (frs<bs || n>=size) /* stop on a short read (EOF or error) or once size bytes are in */ ++ break; ++ } ++ result = ((long)n==size); /* success only if every byte was read */ ++ } ++ /* ++ result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size; ++ */ + } + fclose(fp); + } +diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp +index 52b04b04..80b26044 100644 +--- a/src/ccutil/mainblk.cpp ++++ b/src/ccutil/mainblk.cpp +@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { + #if defined(_WIN32) + } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) { + /* Look for tessdata in directory of executable. */ ++ /* ++ char drive[_MAX_DRIVE]; ++ char dir[_MAX_DIR]; ++ */ + char path[_MAX_PATH]; +- DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); ++ int i; ++ /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path)); ++ /* willus mod--avoid _splitpath_s -- not in XP */ ++ for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--); ++ if (i>=0) ++ { ++ path[i]='\0'; ++ datadir=path; ++ datadir += "/tessdata"; ++ } ++ /* + if (length > 0 && length < sizeof(path)) { + char* separator = std::strrchr(path, '\\'); + if (separator != nullptr) { +@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { + datadir += "/tessdata"; + } + } ++ */ + #endif /* _WIN32 */ + #if defined(TESSDATA_PREFIX) + } else { +diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp +index 00bf2563..486c5ce0 100644 +--- a/src/ccutil/params.cpp ++++ b/src/ccutil/params.cpp +@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, + + if (!foundit) { + anyerr = true; // had an error +- tprintf("Warning: Parameter not found: %s\n", line); ++ /* willus mod */ ++ tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename); + } + } + } +diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp +index 7def011f..6107a494 100644 +---
a/src/ccutil/serialis.cpp ++++ b/src/ccutil/serialis.cpp +@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) { + offset_ = 0; + is_writing_ = false; + swap_ = false; ++ /* willus mod: remember the file name (truncated to fit) for later diagnostics */ ++ strncpy(tfile_filename,filename.string(),sizeof(tfile_filename)-1); ++ tfile_filename[sizeof(tfile_filename)-1]='\0'; + if (reader == nullptr) + return LoadDataFromFile(filename, data_); + else +diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h +index 095b9227..4cc8251e 100644 +--- a/src/ccutil/serialis.h ++++ b/src/ccutil/serialis.h +@@ -77,6 +77,8 @@ class TFile { + public: + TFile(); + ~TFile(); ++ /* willus mod: file name recorded by Open(filename, reader); empty until one is stored */ ++ char tfile_filename[512] = ""; + + // All the Open methods load the whole file into memory for reading. + // Opens a file with a supplied reader, or nullptr to use the default. +diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp +index 73b584b3..0b0b54c3 100644 +--- a/src/lstm/input.cpp ++++ b/src/lstm/input.cpp +@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data, + return nullptr; + } + if (width < min_width || height < min_width) { ++ /* willus mod -- no warning */ ++ /* + tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width, + height, min_width); ++ */ + pixDestroy(&pix); + return nullptr; + } +-- +2.22.0 +