From dd83057b4849f58cefe877b1cbec56dcfc7cee9b Mon Sep 17 00:00:00 2001 From: Ben Olden-Cooligan Date: Fri, 29 Dec 2023 14:15:58 -0800 Subject: [PATCH] Sdk: Make OCR sdk-friendly --- NAPS2.Lib/Modules/CommonModule.cs | 7 +-- NAPS2.Sdk.Samples/OcrSample.cs | 43 +++++++++++++++++ NAPS2.Sdk.Tests/ContextualTests.cs | 3 +- NAPS2.Sdk/Ocr/OcrParams.cs | 2 +- NAPS2.Sdk/Ocr/TesseractOcrEngine.cs | 56 +++++++++++++++++++++-- NAPS2.Sdk/Platform/ISystemCompat.cs | 2 - NAPS2.Sdk/Platform/LinuxSystemCompat.cs | 2 - NAPS2.Sdk/Platform/MacSystemCompat.cs | 2 - NAPS2.Sdk/Platform/WindowsSystemCompat.cs | 2 - NAPS2.Sdk/README.md | 5 +- 10 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 NAPS2.Sdk.Samples/OcrSample.cs diff --git a/NAPS2.Lib/Modules/CommonModule.cs b/NAPS2.Lib/Modules/CommonModule.cs index 2c6037577..cf5ec6cf6 100644 --- a/NAPS2.Lib/Modules/CommonModule.cs +++ b/NAPS2.Lib/Modules/CommonModule.cs @@ -112,12 +112,7 @@ public class CommonModule : Module }).SingleInstance(); builder.Register(ctx => { - var tesseractPath = PlatformCompat.System.UseSystemTesseract - ? 
"tesseract" - : NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName); - var engine = new TesseractOcrEngine( - tesseractPath, - ctx.Resolve().TessdataBasePath); + var engine = TesseractOcrEngine.BundledWithModes(ctx.Resolve().TessdataBasePath); var errorOutput = ctx.Resolve(); engine.OcrError += (_, args) => errorOutput.DisplayError(SdkResources.OcrError, args.Exception); engine.OcrTimeout += (_, _) => errorOutput.DisplayError(SdkResources.OcrTimeout); diff --git a/NAPS2.Sdk.Samples/OcrSample.cs b/NAPS2.Sdk.Samples/OcrSample.cs new file mode 100644 index 000000000..08a3dc7ab --- /dev/null +++ b/NAPS2.Sdk.Samples/OcrSample.cs @@ -0,0 +1,43 @@ +using NAPS2.Images.Gdi; +using NAPS2.Ocr; +using NAPS2.Pdf; +using NAPS2.Scan; + +namespace NAPS2.Sdk.Samples; + +public class OcrSample +{ + public static async Task OcrAndExportPdf() + { + // Exporting PDFs with OCR requires the optional NAPS2.Tesseract.Binaries Nuget package to be installed. + // Or, alternatively, you can use the system-installed Tesseract or provide a custom path to a Tesseract EXE. + + using var scanningContext = new ScanningContext(new GdiImageContext()); + + // The NAPS2.Tesseract.Binaries package doesn't include all the actual language data (1GB+ for 100+ languages). + // You can download .traineddata files from one of these repos: + // - https://github.com/tesseract-ocr/tessdata_fast + // - https://github.com/tesseract-ocr/tessdata_best + // Then specify the folder where those .traineddata files are stored. + scanningContext.OcrEngine = TesseractOcrEngine.Bundled(@"C:\path\to\my\traineddata\files\"); + + // Or if you know Tesseract is installed on the system PATH you can just do this without needing any extra + // packages or downloads. + scanningContext.OcrEngine = TesseractOcrEngine.System(); + + // Or if you have a custom path to the tesseract EXE you can do this. 
+ scanningContext.OcrEngine = TesseractOcrEngine.Custom(@"C:\path\to\tesseract.exe"); + + // Scan some images + var controller = new ScanController(scanningContext); + var devices = await controller.GetDeviceList(); + var options = new ScanOptions { Device = devices.First() }; + var images = await controller.Scan(options).ToListAsync(); + + // Export to PDF with OCR + var pdfExporter = new PdfExporter(scanningContext); + // We specify the language code for OCR. This is based on the name of the .traineddata file, and is found here: + // https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016 + await pdfExporter.Export("doc.pdf", images, ocrParams: new OcrParams("eng")); + } +} \ No newline at end of file diff --git a/NAPS2.Sdk.Tests/ContextualTests.cs b/NAPS2.Sdk.Tests/ContextualTests.cs index ad576330a..041f0dd89 100644 --- a/NAPS2.Sdk.Tests/ContextualTests.cs +++ b/NAPS2.Sdk.Tests/ContextualTests.cs @@ -74,8 +74,7 @@ public class ContextualTests : IDisposable var tesseractPath = NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName, depsRoot); CopyResourceToFile(BinaryResources.eng_traineddata, fast, "eng.traineddata"); CopyResourceToFile(BinaryResources.heb_traineddata, fast, "heb.traineddata"); - ScanningContext.OcrEngine = - new TesseractOcrEngine(tesseractPath, FolderPath); + ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath); } public void SetUpFakeOcr() => SetUpFakeOcr(new()); diff --git a/NAPS2.Sdk/Ocr/OcrParams.cs b/NAPS2.Sdk/Ocr/OcrParams.cs index 163c4001e..32201a935 100644 --- a/NAPS2.Sdk/Ocr/OcrParams.cs +++ b/NAPS2.Sdk/Ocr/OcrParams.cs @@ -7,7 +7,7 @@ /// For language codes, see /// https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016 /// -public record OcrParams(string? LanguageCode, OcrMode Mode, double TimeoutInSeconds) +public record OcrParams(string? 
LanguageCode, OcrMode Mode = OcrMode.Default, double TimeoutInSeconds = 0) { private OcrParams() : this(null, OcrMode.Default, 0) diff --git a/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs b/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs index 3ce8b1101..7bb34d5ed 100644 --- a/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs +++ b/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs @@ -3,6 +3,7 @@ using System.Threading; using System.Xml; using Microsoft.Extensions.Logging; using NAPS2.Scan; +using NAPS2.Unmanaged; namespace NAPS2.Ocr; @@ -10,13 +11,54 @@ public class TesseractOcrEngine : IOcrEngine { private readonly string _tesseractPath; private readonly string? _languageDataBasePath; + private readonly bool _withModes; - public TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null) + /// + /// Gets a TesseractOcrEngine instance configured to use the Tesseract executable on the system PATH with the + /// system-installed language data. + /// + public static TesseractOcrEngine System() => + new("tesseract"); + + /// + /// Gets a TesseractOcrEngine instance configured to use the Tesseract executable from the NAPS2.Tesseract.Binaries + /// nuget package using language data .traineddata files in the specified folder. + /// + public static TesseractOcrEngine Bundled(string languageDataPath) => + new(BundlePath, languageDataPath, false); + + /// + /// Gets a TesseractOcrEngine instance configured to use the Tesseract executable from the NAPS2.Tesseract.Binaries + /// nuget package using language data .traineddata files in the specified folder. The folder is expected to have + /// subfolders named "best" and "fast" with the actual .traineddata files that will be used based on the OcrMode. 
+ /// + public static TesseractOcrEngine BundledWithModes(string languageDataBasePath) => + new(BundlePath, languageDataBasePath, true); + + /// + /// Gets a TesseractOcrEngine instance configured to use the specified Tesseract executable, optionally looking for + /// .traineddata files in the specified folder. + /// + public static TesseractOcrEngine Custom(string tesseractExePath, string? languageDataPath = null) => + new(tesseractExePath, languageDataPath, false); + + /// + /// Gets a TesseractOcrEngine instance configured to use the specified Tesseract executable using language data + /// .traineddata files in the specified folder. The folder is expected to have subfolders named "best" and "fast" + /// with the actual .traineddata files that will be used based on the OcrMode. + /// + public static TesseractOcrEngine CustomWithModes(string tesseractExePath, string languageDataBasePath) => + new(tesseractExePath, languageDataBasePath, true); + + private static string BundlePath => NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName); + + private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null, bool withModes = true) { _tesseractPath = tesseractPath; _languageDataBasePath = languageDataBasePath; + _withModes = withModes; } - + public async Task ProcessImage(ScanningContext scanningContext, string imagePath, OcrParams ocrParams, CancellationToken cancelToken) { @@ -36,8 +78,12 @@ public class TesseractOcrEngine : IOcrEngine }; if (_languageDataBasePath != null) { - string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast"; - string languageDataPath = Path.Combine(_languageDataBasePath, subfolder); + string languageDataPath = _languageDataBasePath; + if (_withModes) + { + string subfolder = ocrParams.Mode == OcrMode.Best ? 
"best" : "fast"; + languageDataPath = Path.Combine(languageDataPath, subfolder); + } startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath; var tessdata = new DirectoryInfo(languageDataPath); EnsureHocrConfigExists(tessdata); @@ -193,7 +239,7 @@ public class TesseractOcrEngine : IOcrEngine } return bounds; } - + // TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine // private void CheckIfInstalled() // { diff --git a/NAPS2.Sdk/Platform/ISystemCompat.cs b/NAPS2.Sdk/Platform/ISystemCompat.cs index 80ef8e091..3ec41698e 100644 --- a/NAPS2.Sdk/Platform/ISystemCompat.cs +++ b/NAPS2.Sdk/Platform/ISystemCompat.cs @@ -20,8 +20,6 @@ internal interface ISystemCompat bool ShouldRememberBackgroundOperations { get; } - bool UseSystemTesseract { get; } - bool RenderInWorker { get; } bool SupportsWinX86Worker { get; } diff --git a/NAPS2.Sdk/Platform/LinuxSystemCompat.cs b/NAPS2.Sdk/Platform/LinuxSystemCompat.cs index 272c46b8d..cfafd34e8 100644 --- a/NAPS2.Sdk/Platform/LinuxSystemCompat.cs +++ b/NAPS2.Sdk/Platform/LinuxSystemCompat.cs @@ -26,8 +26,6 @@ internal class LinuxSystemCompat : ISystemCompat public bool ShouldRememberBackgroundOperations => true; - public bool UseSystemTesseract => false; - public bool RenderInWorker => false; public bool SupportsWinX86Worker => false; diff --git a/NAPS2.Sdk/Platform/MacSystemCompat.cs b/NAPS2.Sdk/Platform/MacSystemCompat.cs index e0f26c76f..7f7fedbea 100644 --- a/NAPS2.Sdk/Platform/MacSystemCompat.cs +++ b/NAPS2.Sdk/Platform/MacSystemCompat.cs @@ -26,8 +26,6 @@ internal class MacSystemCompat : ISystemCompat public bool ShouldRememberBackgroundOperations => true; - public bool UseSystemTesseract => false; - public bool RenderInWorker => false; public bool SupportsWinX86Worker => false; diff --git a/NAPS2.Sdk/Platform/WindowsSystemCompat.cs b/NAPS2.Sdk/Platform/WindowsSystemCompat.cs index ab81c804e..8923cc0f8 100644 --- a/NAPS2.Sdk/Platform/WindowsSystemCompat.cs 
+++ b/NAPS2.Sdk/Platform/WindowsSystemCompat.cs @@ -23,8 +23,6 @@ internal abstract class WindowsSystemCompat : ISystemCompat public bool ShouldRememberBackgroundOperations => true; - public bool UseSystemTesseract => false; - public bool RenderInWorker => true; public bool SupportsWinX86Worker => true; diff --git a/NAPS2.Sdk/README.md b/NAPS2.Sdk/README.md index eca69d492..1e22f3982 100644 --- a/NAPS2.Sdk/README.md +++ b/NAPS2.Sdk/README.md @@ -27,11 +27,11 @@ NAPS2.Sdk is modular, and depending on your needs you may have to reference a di - **[NAPS2.Sdk.Worker.Win32](https://www.nuget.org/packages/NAPS2.Sdk.Worker.Win32/)** - For scanning with [TWAIN on Windows](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs). - **[NAPS2.Pdfium.Binaries](https://www.nuget.org/packages/NAPS2.Pdfium.Binaries/)** - - For [importing PDFs](). + - For [importing PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs). - **[NAPS2.Sane.Binaries](https://www.nuget.org/packages/NAPS2.Sane.Binaries/)** - For [using SANE drivers]() on Mac. (Linux has them pre-installed, and Windows isn't supported.) - **[NAPS2.Tesseract.Binaries](https://www.nuget.org/packages/NAPS2.Tesseract.Binaries/)** - - For [running OCR](). (You can also use a separate Tesseract installation if you like.) + - For [running OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs). (You can also use a separate Tesseract installation if you like.) - **[NAPS2.Escl.Server](https://www.nuget.org/packages/NAPS2.Escl.Server/)** - For [sharing scanners](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs) across the local network. 
@@ -73,6 +73,7 @@ More [samples](https://github.com/cyanfish/naps2/tree/master/NAPS2.Sdk.Samples): - [Scan with TWAIN drivers](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs) - [Scan to System.Drawing.Bitmap](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/ScanToBitmapSample.cs) - [Import and export PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs) +- [Export PDFs with OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs) - [Store image data on the filesystem](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/FileStorageSample.cs) - [Share scanners on the local network](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs)