mirror of
https://github.com/cyanfish/naps2.git
synced 2024-07-14 18:40:39 +03:00
Sdk: Make OCR sdk-friendly
This commit is contained in:
parent
9a6fb36a8c
commit
dd83057b48
@ -112,12 +112,7 @@ public class CommonModule : Module
|
||||
}).SingleInstance();
|
||||
builder.Register<IOcrEngine>(ctx =>
|
||||
{
|
||||
var tesseractPath = PlatformCompat.System.UseSystemTesseract
|
||||
? "tesseract"
|
||||
: NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName);
|
||||
var engine = new TesseractOcrEngine(
|
||||
tesseractPath,
|
||||
ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
|
||||
var engine = TesseractOcrEngine.BundledWithModes(ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
|
||||
var errorOutput = ctx.Resolve<ErrorOutput>();
|
||||
engine.OcrError += (_, args) => errorOutput.DisplayError(SdkResources.OcrError, args.Exception);
|
||||
engine.OcrTimeout += (_, _) => errorOutput.DisplayError(SdkResources.OcrTimeout);
|
||||
|
43
NAPS2.Sdk.Samples/OcrSample.cs
Normal file
43
NAPS2.Sdk.Samples/OcrSample.cs
Normal file
@ -0,0 +1,43 @@
|
||||
using NAPS2.Images.Gdi;
|
||||
using NAPS2.Ocr;
|
||||
using NAPS2.Pdf;
|
||||
using NAPS2.Scan;
|
||||
|
||||
namespace NAPS2.Sdk.Samples;
|
||||
|
||||
public class OcrSample
|
||||
{
|
||||
public static async Task OcrAndExportPdf()
|
||||
{
|
||||
// Exporting PDFs with OCR requires the optional NAPS2.Tesseract.Binaries Nuget package to be installed.
|
||||
// Or, alternatively, you can use the system-installed Tesseract or provide a custom path to a Tesseract EXE.
|
||||
|
||||
using var scanningContext = new ScanningContext(new GdiImageContext());
|
||||
|
||||
// The NAPS2.Tesseract.Binaries package doesn't include all the actual language data (1GB+ for 100+ languages).
|
||||
// You can download .traineddata files from one of these repos:
|
||||
// - https://github.com/tesseract-ocr/tessdata_fast
|
||||
// - https://github.com/tesseract-ocr/tessdata_best
|
||||
// Then specify the folder where those .traineddata files are stored.
|
||||
scanningContext.OcrEngine = TesseractOcrEngine.Bundled(@"C:\path\to\my\traineddata\files\");
|
||||
|
||||
// Or if you know Tesseract is installed on the system PATH you can just do this without needing any extra
|
||||
// packages or downloads.
|
||||
scanningContext.OcrEngine = TesseractOcrEngine.System();
|
||||
|
||||
// Or if you have a custom path to the tesseract EXE you can do this.
|
||||
scanningContext.OcrEngine = TesseractOcrEngine.Custom(@"C:\path\to\tesseract.exe");
|
||||
|
||||
// Scan some images
|
||||
var controller = new ScanController(scanningContext);
|
||||
var devices = await controller.GetDeviceList();
|
||||
var options = new ScanOptions { Device = devices.First() };
|
||||
var images = await controller.Scan(options).ToListAsync();
|
||||
|
||||
// Export to PDF with OCR
|
||||
var pdfExporter = new PdfExporter(scanningContext);
|
||||
// We specify the language code for OCR. This is based on the name of the .traineddata file, and is found here:
|
||||
// https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
|
||||
await pdfExporter.Export("doc.pdf", images, ocrParams: new OcrParams("eng"));
|
||||
}
|
||||
}
|
@ -74,8 +74,7 @@ public class ContextualTests : IDisposable
|
||||
var tesseractPath = NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName, depsRoot);
|
||||
CopyResourceToFile(BinaryResources.eng_traineddata, fast, "eng.traineddata");
|
||||
CopyResourceToFile(BinaryResources.heb_traineddata, fast, "heb.traineddata");
|
||||
ScanningContext.OcrEngine =
|
||||
new TesseractOcrEngine(tesseractPath, FolderPath);
|
||||
ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath);
|
||||
}
|
||||
|
||||
public void SetUpFakeOcr() => SetUpFakeOcr(new());
|
||||
|
@ -7,7 +7,7 @@
|
||||
/// For language codes, see
|
||||
/// https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
|
||||
/// </summary>
|
||||
public record OcrParams(string? LanguageCode, OcrMode Mode, double TimeoutInSeconds)
|
||||
public record OcrParams(string? LanguageCode, OcrMode Mode = OcrMode.Default, double TimeoutInSeconds = 0)
|
||||
{
|
||||
private OcrParams()
|
||||
: this(null, OcrMode.Default, 0)
|
||||
|
@ -3,6 +3,7 @@ using System.Threading;
|
||||
using System.Xml;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using NAPS2.Scan;
|
||||
using NAPS2.Unmanaged;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
@ -10,13 +11,54 @@ public class TesseractOcrEngine : IOcrEngine
|
||||
{
|
||||
private readonly string _tesseractPath;
|
||||
private readonly string? _languageDataBasePath;
|
||||
private readonly bool _withModes;
|
||||
|
||||
public TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null)
|
||||
/// <summary>
|
||||
/// Gets a TesseractOcrEngine instance configured to use the Tesseract executable on the system PATH with the
|
||||
/// system-installed language data.
|
||||
/// </summary>
|
||||
public static TesseractOcrEngine System() =>
|
||||
new("tesseract");
|
||||
|
||||
/// <summary>
|
||||
/// Gets a TesseractOcrEngine instance configured to use the Tesseract executable from the NAPS2.Tesseract.Binaries
|
||||
/// nuget package using language data .traineddata files in the specified folder.
|
||||
/// </summary>
|
||||
public static TesseractOcrEngine Bundled(string languageDataPath) =>
|
||||
new(BundlePath, languageDataPath, false);
|
||||
|
||||
/// <summary>
|
||||
/// Gets a TesseractOcrEngine instance configured to use the Tesseract executable from the NAPS2.Tesseract.Binaries
|
||||
/// nuget package using language data .traineddata files in the specified folder. The folder is expected to have
|
||||
/// subfolders named "best" and "fast" with the actual .traineddata files that will be used based on the OcrMode.
|
||||
/// </summary>
|
||||
public static TesseractOcrEngine BundledWithModes(string languageDataBasePath) =>
|
||||
new(BundlePath, languageDataBasePath, true);
|
||||
|
||||
/// <summary>
|
||||
/// Gets a TesseractOcrEngine instance configured to use the specified Tesseract executable, optionally looking for
|
||||
/// .traineddata files in the specified folder.
|
||||
/// </summary>
|
||||
public static TesseractOcrEngine Custom(string tesseractExePath, string? languageDataPath = null) =>
|
||||
new(tesseractExePath, languageDataPath, false);
|
||||
|
||||
/// <summary>
|
||||
/// Gets a TesseractOcrEngine instance configured to use the specified Tesseract executable using language data
|
||||
/// .traineddata files in the specified folder. The folder is expected to have subfolders named "best" and "fast"
|
||||
/// with the actual .traineddata files that will be used based on the OcrMode.
|
||||
/// </summary>
|
||||
public static TesseractOcrEngine CustomWithModes(string tesseractExePath, string languageDataBasePath) =>
|
||||
new(tesseractExePath, languageDataBasePath, true);
|
||||
|
||||
private static string BundlePath => NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName);
|
||||
|
||||
private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null, bool withModes = true)
|
||||
{
|
||||
_tesseractPath = tesseractPath;
|
||||
_languageDataBasePath = languageDataBasePath;
|
||||
_withModes = withModes;
|
||||
}
|
||||
|
||||
|
||||
public async Task<OcrResult?> ProcessImage(ScanningContext scanningContext, string imagePath, OcrParams ocrParams,
|
||||
CancellationToken cancelToken)
|
||||
{
|
||||
@ -36,8 +78,12 @@ public class TesseractOcrEngine : IOcrEngine
|
||||
};
|
||||
if (_languageDataBasePath != null)
|
||||
{
|
||||
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
|
||||
string languageDataPath = Path.Combine(_languageDataBasePath, subfolder);
|
||||
string languageDataPath = _languageDataBasePath;
|
||||
if (_withModes)
|
||||
{
|
||||
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
|
||||
languageDataPath = Path.Combine(languageDataPath, subfolder);
|
||||
}
|
||||
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
|
||||
var tessdata = new DirectoryInfo(languageDataPath);
|
||||
EnsureHocrConfigExists(tessdata);
|
||||
@ -193,7 +239,7 @@ public class TesseractOcrEngine : IOcrEngine
|
||||
}
|
||||
return bounds;
|
||||
}
|
||||
|
||||
|
||||
// TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
|
||||
// private void CheckIfInstalled()
|
||||
// {
|
||||
|
@ -20,8 +20,6 @@ internal interface ISystemCompat
|
||||
|
||||
bool ShouldRememberBackgroundOperations { get; }
|
||||
|
||||
bool UseSystemTesseract { get; }
|
||||
|
||||
bool RenderInWorker { get; }
|
||||
|
||||
bool SupportsWinX86Worker { get; }
|
||||
|
@ -26,8 +26,6 @@ internal class LinuxSystemCompat : ISystemCompat
|
||||
|
||||
public bool ShouldRememberBackgroundOperations => true;
|
||||
|
||||
public bool UseSystemTesseract => false;
|
||||
|
||||
public bool RenderInWorker => false;
|
||||
|
||||
public bool SupportsWinX86Worker => false;
|
||||
|
@ -26,8 +26,6 @@ internal class MacSystemCompat : ISystemCompat
|
||||
|
||||
public bool ShouldRememberBackgroundOperations => true;
|
||||
|
||||
public bool UseSystemTesseract => false;
|
||||
|
||||
public bool RenderInWorker => false;
|
||||
|
||||
public bool SupportsWinX86Worker => false;
|
||||
|
@ -23,8 +23,6 @@ internal abstract class WindowsSystemCompat : ISystemCompat
|
||||
|
||||
public bool ShouldRememberBackgroundOperations => true;
|
||||
|
||||
public bool UseSystemTesseract => false;
|
||||
|
||||
public bool RenderInWorker => true;
|
||||
|
||||
public bool SupportsWinX86Worker => true;
|
||||
|
@ -27,11 +27,11 @@ NAPS2.Sdk is modular, and depending on your needs you may have to reference a di
|
||||
- **[NAPS2.Sdk.Worker.Win32](https://www.nuget.org/packages/NAPS2.Sdk.Worker.Win32/)**
|
||||
- For scanning with [TWAIN on Windows](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs).
|
||||
- **[NAPS2.Pdfium.Binaries](https://www.nuget.org/packages/NAPS2.Pdfium.Binaries/)**
|
||||
- For [importing PDFs]().
|
||||
- For [importing PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs).
|
||||
- **[NAPS2.Sane.Binaries](https://www.nuget.org/packages/NAPS2.Sane.Binaries/)**
|
||||
- For using SANE drivers on Mac. (Linux has them pre-installed, and Windows isn't supported.)
|
||||
- **[NAPS2.Tesseract.Binaries](https://www.nuget.org/packages/NAPS2.Tesseract.Binaries/)**
|
||||
- For [running OCR](). (You can also use a separate Tesseract installation if you like.)
|
||||
- For [running OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs). (You can also use a separate Tesseract installation if you like.)
|
||||
- **[NAPS2.Escl.Server](https://www.nuget.org/packages/NAPS2.Escl.Server/)**
|
||||
- For [sharing scanners](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs) across the local network.
|
||||
|
||||
@ -73,6 +73,7 @@ More [samples](https://github.com/cyanfish/naps2/tree/master/NAPS2.Sdk.Samples):
|
||||
- [Scan with TWAIN drivers](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs)
|
||||
- [Scan to System.Drawing.Bitmap](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/ScanToBitmapSample.cs)
|
||||
- [Import and export PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs)
|
||||
- [Export PDFs with OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs)
|
||||
- [Store image data on the filesystem](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/FileStorageSample.cs)
|
||||
- [Share scanners on the local network](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user