Sdk: Make OCR sdk-friendly

This commit is contained in:
Ben Olden-Cooligan 2023-12-29 14:15:58 -08:00
parent 9a6fb36a8c
commit dd83057b48
10 changed files with 100 additions and 24 deletions

View File

@ -112,12 +112,7 @@ public class CommonModule : Module
}).SingleInstance();
builder.Register<IOcrEngine>(ctx =>
{
var tesseractPath = PlatformCompat.System.UseSystemTesseract
? "tesseract"
: NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName);
var engine = new TesseractOcrEngine(
tesseractPath,
ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
var engine = TesseractOcrEngine.BundledWithModes(ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
var errorOutput = ctx.Resolve<ErrorOutput>();
engine.OcrError += (_, args) => errorOutput.DisplayError(SdkResources.OcrError, args.Exception);
engine.OcrTimeout += (_, _) => errorOutput.DisplayError(SdkResources.OcrTimeout);

View File

@ -0,0 +1,43 @@
using NAPS2.Images.Gdi;
using NAPS2.Ocr;
using NAPS2.Pdf;
using NAPS2.Scan;
namespace NAPS2.Sdk.Samples;
/// <summary>
/// Sample: configure a Tesseract-based OCR engine on a <c>ScanningContext</c>, scan from the first available
/// device, and export the scanned images to a PDF with OCR.
/// </summary>
public class OcrSample
{
    /// <summary>
    /// Scans with the first device returned by the controller and writes "doc.pdf" with OCR using the "eng"
    /// language code.
    /// </summary>
    public static async Task OcrAndExportPdf()
    {
        // OCR export needs a Tesseract executable. It can come from the optional NAPS2.Tesseract.Binaries Nuget
        // package, from a system-wide Tesseract installation, or from a custom path to a Tesseract EXE.
        using var scanningContext = new ScanningContext(new GdiImageContext());

        // The three assignments below are alternatives; each one replaces the previous engine, so keep only the
        // one that matches your setup.

        // Alternative 1: the executable from NAPS2.Tesseract.Binaries. That package doesn't ship the actual
        // language data (1GB+ for 100+ languages), so download the .traineddata files you need from either
        // - https://github.com/tesseract-ocr/tessdata_fast
        // - https://github.com/tesseract-ocr/tessdata_best
        // and pass the folder that contains them.
        scanningContext.OcrEngine = TesseractOcrEngine.Bundled(@"C:\path\to\my\traineddata\files\");

        // Alternative 2: Tesseract is already installed on the system PATH, so no extra packages or downloads
        // are needed.
        scanningContext.OcrEngine = TesseractOcrEngine.System();

        // Alternative 3: a tesseract EXE at a custom location.
        scanningContext.OcrEngine = TesseractOcrEngine.Custom(@"C:\path\to\tesseract.exe");

        // Acquire some images from the first device in the list.
        var scanController = new ScanController(scanningContext);
        var availableDevices = await scanController.GetDeviceList();
        var scanOptions = new ScanOptions { Device = availableDevices.First() };
        var scannedImages = await scanController.Scan(scanOptions).ToListAsync();

        // Write the PDF with OCR. The language code is based on the name of the .traineddata file; the list of
        // codes is at:
        // https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
        var exporter = new PdfExporter(scanningContext);
        var englishOcr = new OcrParams("eng");
        await exporter.Export("doc.pdf", scannedImages, ocrParams: englishOcr);
    }
}

View File

@ -74,8 +74,7 @@ public class ContextualTests : IDisposable
var tesseractPath = NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName, depsRoot);
CopyResourceToFile(BinaryResources.eng_traineddata, fast, "eng.traineddata");
CopyResourceToFile(BinaryResources.heb_traineddata, fast, "heb.traineddata");
ScanningContext.OcrEngine =
new TesseractOcrEngine(tesseractPath, FolderPath);
ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath);
}
public void SetUpFakeOcr() => SetUpFakeOcr(new());

View File

@ -7,7 +7,7 @@
/// For language codes, see
/// https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
/// </summary>
public record OcrParams(string? LanguageCode, OcrMode Mode, double TimeoutInSeconds)
public record OcrParams(string? LanguageCode, OcrMode Mode = OcrMode.Default, double TimeoutInSeconds = 0)
{
private OcrParams()
: this(null, OcrMode.Default, 0)

View File

@ -3,6 +3,7 @@ using System.Threading;
using System.Xml;
using Microsoft.Extensions.Logging;
using NAPS2.Scan;
using NAPS2.Unmanaged;
namespace NAPS2.Ocr;
@ -10,13 +11,54 @@ public class TesseractOcrEngine : IOcrEngine
{
private readonly string _tesseractPath;
private readonly string? _languageDataBasePath;
private readonly bool _withModes;
public TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null)
/// <summary>
/// Creates a TesseractOcrEngine that runs the "tesseract" executable found on the system PATH and relies on the
/// system-installed language data (no language data folder is configured).
/// </summary>
public static TesseractOcrEngine System()
{
    return new TesseractOcrEngine("tesseract");
}
/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable shipped in the NAPS2.Tesseract.Binaries nuget
/// package and reads .traineddata language files directly from the given folder.
/// </summary>
public static TesseractOcrEngine Bundled(string languageDataPath)
{
    return new TesseractOcrEngine(BundlePath, languageDataPath, withModes: false);
}
/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable shipped in the NAPS2.Tesseract.Binaries nuget
/// package and reads language data from a mode-specific subfolder of the given base folder. The base folder is
/// expected to contain subfolders named "best" and "fast" holding the actual .traineddata files; which one is
/// used depends on the OcrMode.
/// </summary>
public static TesseractOcrEngine BundledWithModes(string languageDataBasePath)
{
    return new TesseractOcrEngine(BundlePath, languageDataBasePath, withModes: true);
}
/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable at the given path. If a language data folder
/// is supplied, .traineddata files are read directly from it; otherwise no language data folder is configured.
/// </summary>
public static TesseractOcrEngine Custom(string tesseractExePath, string? languageDataPath = null)
{
    return new TesseractOcrEngine(tesseractExePath, languageDataPath, withModes: false);
}
/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable at the given path and reads language data
/// from a mode-specific subfolder of the given base folder. The base folder is expected to contain subfolders
/// named "best" and "fast" holding the actual .traineddata files; which one is used depends on the OcrMode.
/// </summary>
public static TesseractOcrEngine CustomWithModes(string tesseractExePath, string languageDataBasePath)
{
    return new TesseractOcrEngine(tesseractExePath, languageDataBasePath, withModes: true);
}
/// <summary>
/// Path of the bundled Tesseract executable, looked up through NativeLibrary.FindExePath using the platform's
/// Tesseract executable name. The lookup runs on every read of the property.
/// </summary>
private static string BundlePath
{
    get { return NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName); }
}
/// <summary>
/// Stores the engine configuration. Instances are created through the static factory methods.
/// </summary>
/// <param name="tesseractPath">Path (or command name) of the Tesseract executable to run.</param>
/// <param name="languageDataBasePath">Folder with language data, or null to configure none.</param>
/// <param name="withModes">
/// When true, language data is read from a "best" or "fast" subfolder of
/// <paramref name="languageDataBasePath"/>; when false, from that folder itself.
/// </param>
private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null, bool withModes = true)
{
    (_tesseractPath, _languageDataBasePath, _withModes) = (tesseractPath, languageDataBasePath, withModes);
}
public async Task<OcrResult?> ProcessImage(ScanningContext scanningContext, string imagePath, OcrParams ocrParams,
CancellationToken cancelToken)
{
@ -36,8 +78,12 @@ public class TesseractOcrEngine : IOcrEngine
};
if (_languageDataBasePath != null)
{
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
string languageDataPath = Path.Combine(_languageDataBasePath, subfolder);
string languageDataPath = _languageDataBasePath;
if (_withModes)
{
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
languageDataPath = Path.Combine(languageDataPath, subfolder);
}
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
var tessdata = new DirectoryInfo(languageDataPath);
EnsureHocrConfigExists(tessdata);
@ -193,7 +239,7 @@ public class TesseractOcrEngine : IOcrEngine
}
return bounds;
}
// TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
// private void CheckIfInstalled()
// {

View File

@ -20,8 +20,6 @@ internal interface ISystemCompat
bool ShouldRememberBackgroundOperations { get; }
bool UseSystemTesseract { get; }
bool RenderInWorker { get; }
bool SupportsWinX86Worker { get; }

View File

@ -26,8 +26,6 @@ internal class LinuxSystemCompat : ISystemCompat
public bool ShouldRememberBackgroundOperations => true;
public bool UseSystemTesseract => false;
public bool RenderInWorker => false;
public bool SupportsWinX86Worker => false;

View File

@ -26,8 +26,6 @@ internal class MacSystemCompat : ISystemCompat
public bool ShouldRememberBackgroundOperations => true;
public bool UseSystemTesseract => false;
public bool RenderInWorker => false;
public bool SupportsWinX86Worker => false;

View File

@ -23,8 +23,6 @@ internal abstract class WindowsSystemCompat : ISystemCompat
public bool ShouldRememberBackgroundOperations => true;
public bool UseSystemTesseract => false;
public bool RenderInWorker => true;
public bool SupportsWinX86Worker => true;

View File

@ -27,11 +27,11 @@ NAPS2.Sdk is modular, and depending on your needs you may have to reference a di
- **[NAPS2.Sdk.Worker.Win32](https://www.nuget.org/packages/NAPS2.Sdk.Worker.Win32/)**
- For scanning with [TWAIN on Windows](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs).
- **[NAPS2.Pdfium.Binaries](https://www.nuget.org/packages/NAPS2.Pdfium.Binaries/)**
- For [importing PDFs]().
- For [importing PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs).
- **[NAPS2.Sane.Binaries](https://www.nuget.org/packages/NAPS2.Sane.Binaries/)**
- For [using SANE drivers]() on Mac. (Linux has them pre-installed, and Windows isn't supported.)
- **[NAPS2.Tesseract.Binaries](https://www.nuget.org/packages/NAPS2.Tesseract.Binaries/)**
- For [running OCR](). (You can also use a separate Tesseract installation if you like.)
- For [running OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs). (You can also use a separate Tesseract installation if you like.)
- **[NAPS2.Escl.Server](https://www.nuget.org/packages/NAPS2.Escl.Server/)**
- For [sharing scanners](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs) across the local network.
@ -73,6 +73,7 @@ More [samples](https://github.com/cyanfish/naps2/tree/master/NAPS2.Sdk.Samples):
- [Scan with TWAIN drivers](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs)
- [Scan to System.Drawing.Bitmap](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/ScanToBitmapSample.cs)
- [Import and export PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs)
- [Export PDFs with OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs)
- [Store image data on the filesystem](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/FileStorageSample.cs)
- [Share scanners on the local network](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs)