WIP: Rewrite tesseract code and strip out all old tesseract configs. Going forward we'll bundle the exe and only support the latest traineddata.

This commit is contained in:
Ben Olden-Cooligan 2022-06-17 21:04:35 -07:00
parent 9ab618acee
commit f4dde8ee14
31 changed files with 485 additions and 598 deletions

View File

@ -31,7 +31,7 @@ public class CommonModule : NinjectModule
Bind<IScannedImagePrinter>().To<PrintDocumentPrinter>();
Bind<IEmailProviderFactory>().To<NinjectEmailProviderFactory>();
Bind<IMapiWrapper>().To<MapiWrapper>();
Bind<OcrEngineManager>().ToMethod(ctx => OcrEngineManager.Default);
// TODO: Bind TesseractLanguageManager
Bind<OcrRequestQueue>().ToSelf().InSingletonScope();
// Scan

View File

@ -19,11 +19,10 @@ public static class StaticConfiguration
Debug.Listeners.Add(new NLogTraceListener());
#endif
// TODO: Initialize TesseractLanguageManager here?
var customPath = config.Get(c => c.ComponentsPath);
var basePath = string.IsNullOrWhiteSpace(customPath)
? Paths.Components
: Environment.ExpandEnvironmentVariables(customPath);
OcrEngineManager.Default = new OcrEngineManager(basePath);
}
}

View File

@ -20,7 +20,7 @@ public class AutomatedScanning
private readonly ErrorOutput _errorOutput;
private readonly IScannedImageImporter _scannedImageImporter;
private readonly IOperationFactory _operationFactory;
private readonly OcrEngineManager _ocrEngineManager;
private readonly TesseractLanguageManager _tesseractLanguageManager;
private readonly IFormFactory _formFactory;
private readonly ScopedConfig _config;
private readonly TransactionConfigScope<CommonConfig> _userTransact;
@ -43,7 +43,7 @@ public class AutomatedScanning
public AutomatedScanning(ConsoleOutput output, AutomatedScanningOptions options, ImageContext imageContext,
IScanPerformer scanPerformer, ErrorOutput errorOutput, IEmailProviderFactory emailProviderFactory,
IScannedImageImporter scannedImageImporter, IOperationFactory operationFactory,
OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config,
TesseractLanguageManager tesseractLanguageManager, IFormFactory formFactory, ScopedConfig config,
IProfileManager profileManager, RecoveryStorageManager recoveryStorageManager, ScanningContext scanningContext)
{
_output = output;
@ -54,7 +54,7 @@ public class AutomatedScanning
_emailProviderFactory = emailProviderFactory;
_scannedImageImporter = scannedImageImporter;
_operationFactory = operationFactory;
_ocrEngineManager = ocrEngineManager;
_tesseractLanguageManager = tesseractLanguageManager;
_formFactory = formFactory;
_config = config;
_profileManager = profileManager;
@ -173,12 +173,7 @@ public class AutomatedScanning
private void InstallComponents()
{
var availableComponents = new List<IExternalComponent>();
var ocrEngine = _ocrEngineManager.EngineToInstall;
if (ocrEngine != null)
{
availableComponents.Add(ocrEngine.Component);
availableComponents.AddRange(ocrEngine.LanguageComponents);
}
availableComponents.AddRange(_tesseractLanguageManager.LanguageComponents);
var componentDict = availableComponents.ToDictionary(x => x.Id.ToLowerInvariant());
var installId = _options.Install.ToLowerInvariant();

View File

@ -100,7 +100,7 @@ public class CommandLineIntegrationTests : ContextualTexts
public override void Load()
{
Rebind<ImageContext>().ToConstant(_imageContext);
Rebind<OcrEngineManager>().ToConstant(new OcrEngineManager());
// TODO: Bind TesseractLanguageManager
Rebind<IScanDriverFactory>().ToConstant(_scanDriverFactory);
Rebind<IScanBridgeFactory>().To<InProcScanBridgeFactory>();
Rebind<ConsoleOutput>().ToSelf().WithConstructorArgument("writer", new TestOutputTextWriter(_testOutputHelper));

View File

@ -38,6 +38,10 @@
<Generator>ResXFileCodeGenerator</Generator>
<LastGenOutput>ImageImporterTestsData.Designer.cs</LastGenOutput>
</EmbeddedResource>
<EmbeddedResource Update="Ocr\TesseractResources.resx">
<Generator>ResXFileCodeGenerator</Generator>
<LastGenOutput>TesseractResources.Designer.cs</LastGenOutput>
</EmbeddedResource>
</ItemGroup>
<ItemGroup>
@ -46,6 +50,20 @@
<AutoGen>True</AutoGen>
<DependentUpon>ImageImporterTestsData.resx</DependentUpon>
</Compile>
<!-- Designer file generated from Ocr\TesseractResources.resx; DependentUpon must name that .resx file. -->
<Compile Update="Ocr\TesseractResources.Designer.cs">
<DesignTime>True</DesignTime>
<AutoGen>True</AutoGen>
<DependentUpon>TesseractResources.resx</DependentUpon>
</Compile>
</ItemGroup>
<ItemGroup>
<None Update="Resources\tesseract-5.1.0-x64.exe">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Resources\eng.traineddata">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@ -1,10 +0,0 @@
namespace NAPS2.Sdk.Tests.Ocr;
public class TesseractEngineTests
{
// TODO: Ocr tests
// (1) Integration test that runs a real tesseract, produces a real pdf, and parses the pdf
// (2) Unit test for hocr parsing
// (3) Unit tests for OcrEngineManager
// (4) ?
}

View File

@ -0,0 +1,34 @@
using System.Threading;
using NAPS2.Ocr;
using Xunit;
namespace NAPS2.Sdk.Tests.Ocr;
/// <summary>
/// Runs <see cref="TesseractOcrEngine"/> against files written out from the embedded
/// <c>TesseractResources</c> (executable, English trained data, sample image).
/// </summary>
public class TesseractOcrEngineTests : ContextualTexts
{
    private readonly TesseractOcrEngine _engine;
    private readonly string _testImagePath;

    /// <summary>
    /// Writes the tesseract executable, the English trained data (inside a "fast" subfolder)
    /// and the sample image under <c>FolderPath</c>, then creates the engine pointing at them.
    /// </summary>
    public TesseractOcrEngineTests()
    {
        var tesseractExe = Path.Combine(FolderPath, "tesseract.exe");
        File.WriteAllBytes(tesseractExe, TesseractResources.tesseract_x64);

        // NOTE(review): the "fast" folder name presumably pairs with OcrMode.Fast used below - confirm in TesseractOcrEngine.
        var fastDataFolder = Path.Combine(FolderPath, "fast");
        Directory.CreateDirectory(fastDataFolder);
        File.WriteAllBytes(Path.Combine(fastDataFolder, "eng.traineddata"), TesseractResources.eng_traineddata);

        _testImagePath = Path.Combine(FolderPath, "ocr_test.jpg");
        File.WriteAllBytes(_testImagePath, TesseractResources.ocr_test);

        _engine = new TesseractOcrEngine(tesseractExe, FolderPath);
    }

    /// <summary>
    /// Processes the sample image with English / fast-mode parameters and checks that a result comes back.
    /// </summary>
    [Fact]
    public async Task RunTesseract()
    {
        var ocrParams = new OcrParams("eng", OcrMode.Fast, 0);

        var ocrResult = await _engine.ProcessImage(_testImagePath, ocrParams, CancellationToken.None);

        Assert.NotNull(ocrResult);
    }
}

View File

@ -0,0 +1,93 @@
//------------------------------------------------------------------------------
// <auto-generated>
// This code was generated by a tool.
// Runtime Version:4.0.30319.42000
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
// </auto-generated>
//------------------------------------------------------------------------------
namespace NAPS2.Sdk.Tests.Ocr {
using System;
/// <summary>
/// A strongly-typed resource class, for looking up localized strings, etc.
/// </summary>
// This class was auto-generated by the StronglyTypedResourceBuilder
// class via a tool like ResGen or Visual Studio.
// To add or remove a member, edit your .ResX file then rerun ResGen
// with the /str option, or rebuild your VS project.
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
internal class TesseractResources {
private static global::System.Resources.ResourceManager resourceMan;
private static global::System.Globalization.CultureInfo resourceCulture;
[global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
internal TesseractResources() {
}
/// <summary>
/// Returns the cached ResourceManager instance used by this class.
/// The instance is created on first access for the resource base name
/// "NAPS2.Sdk.Tests.Ocr.TesseractResources" in this type's assembly.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
internal static global::System.Resources.ResourceManager ResourceManager {
get {
if (object.ReferenceEquals(resourceMan, null)) {
global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("NAPS2.Sdk.Tests.Ocr.TesseractResources", typeof(TesseractResources).Assembly);
resourceMan = temp;
}
return resourceMan;
}
}
/// <summary>
/// Overrides the current thread's CurrentUICulture property for all
/// resource lookups using this strongly typed resource class.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
internal static global::System.Globalization.CultureInfo Culture {
get {
return resourceCulture;
}
set {
resourceCulture = value;
}
}
/// <summary>
/// Looks up a localized resource of type System.Byte[].
/// In TesseractResources.resx the "eng_traineddata" entry is a file reference to ..\Resources\eng.traineddata.
/// </summary>
internal static byte[] eng_traineddata {
get {
object obj = ResourceManager.GetObject("eng_traineddata", resourceCulture);
return ((byte[])(obj));
}
}
/// <summary>
/// Looks up a localized resource of type System.Byte[].
/// In TesseractResources.resx the "ocr_test" entry is a file reference to ..\Resources\ocr_test.jpg.
/// </summary>
internal static byte[] ocr_test {
get {
object obj = ResourceManager.GetObject("ocr_test", resourceCulture);
return ((byte[])(obj));
}
}
/// <summary>
/// Looks up a localized resource of type System.Byte[].
/// In TesseractResources.resx the "tesseract_x64" entry is a file reference to ..\Resources\tesseract-5.1.0-x64.exe.
/// </summary>
internal static byte[] tesseract_x64 {
get {
object obj = ResourceManager.GetObject("tesseract_x64", resourceCulture);
return ((byte[])(obj));
}
}
}
}

View File

@ -0,0 +1,130 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goals of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that support
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<assembly alias="System.Windows.Forms" name="System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" />
<data name="tesseract_x64" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>..\Resources\tesseract-5.1.0-x64.exe;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>
<data name="eng_traineddata" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>..\Resources\eng.traineddata;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>
<data name="ocr_test" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>..\Resources\ocr_test.jpg;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>
</root>

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

View File

@ -20,7 +20,7 @@ public class AutoSaver
private readonly TiffHelper _tiffHelper;
private readonly ImageContext _imageContext;
public AutoSaver(IConfigProvider<PdfSettings> pdfSettingsProvider, IConfigProvider<ImageSettings> imageSettingsProvider, OcrEngineManager ocrEngineManager, OcrRequestQueue ocrRequestQueue, ErrorOutput errorOutput, DialogHelper dialogHelper, OperationProgress operationProgress, ISaveNotify notify, PdfExporter pdfExporter, OverwritePrompt overwritePrompt, ScopedConfig config, TiffHelper tiffHelper, ImageContext imageContext)
public AutoSaver(IConfigProvider<PdfSettings> pdfSettingsProvider, IConfigProvider<ImageSettings> imageSettingsProvider, ErrorOutput errorOutput, DialogHelper dialogHelper, OperationProgress operationProgress, ISaveNotify notify, PdfExporter pdfExporter, OverwritePrompt overwritePrompt, ScopedConfig config, TiffHelper tiffHelper, ImageContext imageContext)
{
_pdfSettingsProvider = pdfSettingsProvider;
_imageSettingsProvider = imageSettingsProvider;

View File

@ -74,10 +74,6 @@ public class PdfSharpExporter : PdfExporter
{
Log.Error("Supported OCR engine not installed.", ocrParams.LanguageCode);
}
else if (!activeEngine.CanProcess(ocrParams.LanguageCode))
{
Log.Error("OCR files not available for '{0}'.", ocrParams.LanguageCode);
}
else
{
ocrEngine = activeEngine;

View File

@ -1,27 +1,8 @@
using System.Threading;
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
public interface IOcrEngine
{
bool CanProcess(string langCode);
Task<OcrResult?> ProcessImage(string imagePath, OcrParams ocrParams, CancellationToken cancelToken);
bool IsSupported { get; }
bool IsInstalled { get; }
bool CanInstall { get; }
IEnumerable<Language> InstalledLanguages { get; }
IEnumerable<Language> NotInstalledLanguages { get; }
IExternalComponent Component { get; }
IEnumerable<IExternalComponent> LanguageComponents { get; }
IEnumerable<OcrMode> SupportedModes { get; }
}

View File

@ -50,10 +50,6 @@ public class OcrController
{
throw new InvalidOperationException("OCR is enabled but no language code is specified.");
}
if (!Engine.CanProcess(OcrParams.LanguageCode))
{
throw new InvalidOperationException("OCR is enabled but the engine can't handle the specified language.");
}
CancellationTokenSource cts = new CancellationTokenSource();
_cancellationTokenSources.Add(image.GetWeakReference(), cts);

View File

@ -1,82 +0,0 @@
using NAPS2.Scan;
namespace NAPS2.Ocr;
/// <summary>
/// Keeps an ordered list of OCR engines (preferred/newer first) and reports which of them
/// are supported, installed, and ready to use.
/// </summary>
public class OcrEngineManager : IOcrEngineProvider
{
    private static OcrEngineManager _default = new OcrEngineManager();

    private readonly List<IOcrEngine> _engines;

    /// <summary>
    /// Creates a new instance of OcrEngineManager that only looks for Tesseract on the system path.
    /// </summary>
    public OcrEngineManager()
    {
        _engines = new List<IOcrEngine> { new TesseractSystemEngine() };
    }

    /// <summary>
    /// Creates a new instance of OcrEngineManager with the specified engines.
    /// </summary>
    /// <param name="orderedEngineList">Engines to manage. Order matters: preferred/newer engines come first.</param>
    public OcrEngineManager(IEnumerable<IOcrEngine> orderedEngineList)
    {
        // Copy into a private list so the manager owns its own snapshot.
        _engines = orderedEngineList.ToList();
    }

    /// <summary>
    /// Creates a new instance of OcrEngineManager with the default set of engines.
    /// </summary>
    /// <param name="basePath">The base path for installed engines.</param>
    public OcrEngineManager(string basePath)
    {
        _engines = new List<IOcrEngine>
        {
            new Tesseract400Beta4Engine(basePath),
            new Tesseract304Engine(basePath),
            new Tesseract304XpEngine(basePath),
            new Tesseract302Engine(basePath),
            new TesseractSystemEngine()
        };
    }

    /// <summary>
    /// The shared default manager. Reading it calls <c>TestingContext.NoStaticDefaults()</c> first.
    /// </summary>
    /// <exception cref="ArgumentNullException">Thrown when the property is set to null.</exception>
    public static OcrEngineManager Default
    {
        get
        {
            TestingContext.NoStaticDefaults();
            return _default;
        }
        set
        {
            if (value == null)
            {
                throw new ArgumentNullException(nameof(value));
            }
            _default = value;
        }
    }

    /// <summary>All managed engines, in priority order.</summary>
    public IEnumerable<IOcrEngine> Engines => _engines;

    /// <summary>True when at least one engine is supported, installed, and has an installed language.</summary>
    public bool IsReady => _engines.Any(engine => engine.IsSupported && IsUsable(engine));

    /// <summary>True when the first supported engine in the list is installed and has an installed language.</summary>
    public bool IsNewestReady =>
        _engines.FirstOrDefault(engine => engine.IsSupported) is { } newest && IsUsable(newest);

    /// <summary>True when the newest supported engine is not ready but some engine is installed.</summary>
    public bool CanUpgrade => !IsNewestReady && AnyEngineInstalled();

    /// <summary>True when no engine is ready but some engine is installed.</summary>
    public bool MustUpgrade => !IsReady && AnyEngineInstalled();

    /// <summary>
    /// True when every engine is not installed and is also either unsupported or not installable.
    /// </summary>
    public bool MustInstallPackage =>
        _engines.All(engine => !(engine.IsSupported && engine.CanInstall) && !engine.IsInstalled);

    /// <summary>The first engine that is supported, installed, and has an installed language; otherwise null.</summary>
    public IOcrEngine? ActiveEngine => _engines.FirstOrDefault(engine => engine.IsSupported && IsUsable(engine));

    /// <summary>The first engine that is installed and has an installed language (supported or not); otherwise null.</summary>
    public IOcrEngine? InstalledEngine => _engines.FirstOrDefault(IsUsable);

    /// <summary>The first engine that is supported and can be installed; otherwise null.</summary>
    public IOcrEngine? EngineToInstall => _engines.FirstOrDefault(engine => engine.IsSupported && engine.CanInstall);

    // An engine counts as usable once it is installed and reports at least one installed language.
    private static bool IsUsable(IOcrEngine engine) => engine.IsInstalled && engine.InstalledLanguages.Any();

    // Whether any managed engine reports itself as installed.
    private bool AnyEngineInstalled() => _engines.Any(engine => engine.IsInstalled);
}

View File

@ -1,22 +0,0 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Engine definition for a Tesseract 3.0.2 installation located in the "tesseract-3.0.2"
/// folder under the given base path. It is flagged as not installable (CanInstall = false).
/// </summary>
public class Tesseract302Engine : TesseractBaseEngine
{
/// <summary>
/// Configures the inherited engine members for Tesseract 3.0.2.
/// </summary>
/// <param name="basePath">Directory that contains the "tesseract-3.0.2" folder.</param>
public Tesseract302Engine(string basePath)
{
// Using the newer data since we just need the 302 engine for backwards compatibility
LanguageData = TesseractLanguageData.V304;
TesseractBasePath = Path.Combine(basePath, "tesseract-3.0.2");
TesseractExePath = "tesseract.exe";
// NOTE(review): presumably this version writes its hOCR output with an .html extension - confirm in TesseractBaseEngine.
TesseractHocrExtension = ".html";
PlatformSupport = PlatformSupport.Windows;
CanInstall = false;
// The third constructor argument is null here; presumably it is the download info (none, since this engine is not installable) - confirm against ExternalComponent.
Component = new ExternalComponent("ocr", Path.Combine(TesseractBasePath, TesseractExePath), null);
// One component per language, with id "ocr-<code>" and a path under "tessdata"; the ".gz" suffix is removed from the data filename.
// Select is lazy: unless the LanguageComponents setter materializes it, TesseractBasePath is read when the sequence is enumerated.
LanguageComponents = LanguageData.Data.Select(x =>
new ExternalComponent($"ocr-{x.Code}", Path.Combine(TesseractBasePath, "tessdata", x.Filename.Replace(".gz", "")), null));
}
}

View File

@ -1,29 +0,0 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Engine definition for Tesseract 3.0.4, located in the "tesseract-3.0.4" folder under the
/// given base path. The executable and each language file are declared as gzip downloads.
/// </summary>
public class Tesseract304Engine : TesseractBaseEngine
{
// Download locations for the 3.04 files: GitHub and SourceForge for ModernWindows or Linux, plus a separate mirror for WindowsXp.
// NOTE(review): the "{0}" placeholder is presumably replaced with the download filename - confirm in DownloadMirror.
protected static readonly List<DownloadMirror> Mirrors = new List<DownloadMirror>
{
new DownloadMirror(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-3.04/{0}"),
new DownloadMirror(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://sourceforge.net/projects/naps2/files/components/tesseract-3.04/{0}/download"),
new DownloadMirror(PlatformSupport.WindowsXp, @"http://xp-mirror.naps2.com/tesseract-3.04/{0}")
};
/// <summary>
/// Configures the inherited engine members for Tesseract 3.0.4.
/// </summary>
/// <param name="basePath">Directory that contains the "tesseract-3.0.4" folder.</param>
public Tesseract304Engine(string basePath)
{
LanguageData = TesseractLanguageData.V304;
TesseractBasePath = Path.Combine(basePath, "tesseract-3.0.4");
TesseractExePath = "tesseract.exe";
PlatformSupport = PlatformSupport.ModernWindows;
CanInstall = true;
// NOTE(review): 1.32 is presumably the download size and the hex string its SHA-1 (the language entries below pass x.Size and x.Sha1 in the same positions) - confirm against DownloadInfo.
Component = new ExternalComponent("ocr", Path.Combine(TesseractBasePath, TesseractExePath),
new DownloadInfo("tesseract.exe.gz", Mirrors, 1.32, "0b0fd21cd886c04c60ed5c3f38b9120b408139b3", DownloadFormat.Gzip));
// One component per language, with id "ocr-<code>" and a path under "tessdata"; the ".gz" suffix is removed from the data filename.
// Select is lazy: unless the LanguageComponents setter materializes it, TesseractBasePath is read when the sequence is enumerated.
LanguageComponents = LanguageData.Data.Select(x =>
new ExternalComponent($"ocr-{x.Code}", Path.Combine(TesseractBasePath, "tessdata", x.Filename.Replace(".gz", "")),
new DownloadInfo(x.Filename, Mirrors, x.Size, x.Sha1, DownloadFormat.Gzip)));
}
}

View File

@ -1,15 +0,0 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Variant of <see cref="Tesseract304Engine"/> that uses the "tesseract_xp.exe" executable
/// and sets PlatformSupport to PlatformSupport.Windows.
/// </summary>
public class Tesseract304XpEngine : Tesseract304Engine
{
/// <summary>
/// Runs the base 3.0.4 configuration, then replaces the executable path, platform support and engine component.
/// </summary>
/// <param name="basePath">Base path passed through to the <see cref="Tesseract304Engine"/> constructor.</param>
public Tesseract304XpEngine(string basePath) : base(basePath)
{
// These assignments run after the base constructor, so they overwrite the values it set.
TesseractExePath = "tesseract_xp.exe";
PlatformSupport = PlatformSupport.Windows;
// Component is rebuilt so that its path and download point at the XP executable.
Component = new ExternalComponent("ocr", Path.Combine(TesseractBasePath, TesseractExePath),
new DownloadInfo("tesseract_xp.exe.gz", Mirrors, 1.32, "98d15e4765caae864f16fa2ab106e3fd6adbe8c3", DownloadFormat.Gzip));
}
}

View File

@ -1,51 +0,0 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Engine definition for Tesseract 4.0.0 beta 4, located in the "tesseract-4.0.0b4" folder
/// under the given base path. The executable sits in a "w64" or "w32" subfolder depending on
/// the bitness of the current process, and language data lives in "best" and "fast" folders.
/// </summary>
public class Tesseract400Beta4Engine : TesseractBaseEngine
{
    // Download locations for the 4.0.0b4 files (GitHub and SourceForge).
    protected static readonly List<DownloadMirror> Mirrors = new List<DownloadMirror>
    {
        new DownloadMirror(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-4.0.0b4/{0}"),
        new DownloadMirror(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://sourceforge.net/projects/naps2/files/components/tesseract-4.0.0b4/{0}/download")
    };

    /// <summary>
    /// Configures the inherited engine members for Tesseract 4.0.0 beta 4.
    /// </summary>
    /// <param name="basePath">Directory that contains the "tesseract-4.0.0b4" folder.</param>
    public Tesseract400Beta4Engine(string basePath)
    {
        bool is64Bit = Environment.Is64BitProcess;
        string archFolder = is64Bit ? "w64" : "w32";

        LanguageData = TesseractLanguageData.V400B4;
        TesseractBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");
        TesseractExePath = Path.Combine(archFolder, "tesseract.exe");
        PlatformSupport = PlatformSupport.ModernWindows;
        CanInstall = true;
        SupportedModes = new[] { OcrMode.Fast, OcrMode.Best, OcrMode.Legacy };

        // Pick the executable archive that matches the process architecture.
        DownloadInfo exeDownload;
        if (is64Bit)
        {
            exeDownload = new DownloadInfo("tesseract.exe.w64.zip", Mirrors, 1.83, "4eba9aaf8800a100ef059c512be572e39ae72f4d", DownloadFormat.Zip);
        }
        else
        {
            exeDownload = new DownloadInfo("tesseract.exe.w32.zip", Mirrors, 1.56, "300ad281a5fa1c734dbb4a8a4dd49e3a8ab921a4", DownloadFormat.Zip);
        }
        Component = new MultiFileExternalComponent("ocr", Path.Combine(TesseractBasePath, archFolder), new[] { "tesseract.exe" }, exeDownload);

        // Each language component covers both the "best" and the "fast" trained data file.
        LanguageComponents = LanguageData.Data.Select(lang =>
            new MultiFileExternalComponent(
                $"ocr-{lang.Code}",
                TesseractBasePath,
                new[] { $"best/{lang.Code}.traineddata", $"fast/{lang.Code}.traineddata" },
                new DownloadInfo(lang.Filename, Mirrors, lang.Size, lang.Sha1, DownloadFormat.Zip)));
    }

    /// <summary>
    /// Chooses the data folder ("fast" or "best") and the engine-mode arguments for a run.
    /// </summary>
    /// <param name="ocrParams">Supplies the requested mode and the '+'-separated language codes.</param>
    /// <returns>
    /// A RunInfo whose DataPath and PrefixPath are both the chosen folder, and whose Arguments are
    /// "--oem 1" for Best, "--oem 0" for Legacy, and empty otherwise.
    /// </returns>
    protected override RunInfo TesseractRunInfo(OcrParams ocrParams)
    {
        var effectiveMode = ocrParams.Mode;

        string dataFolder;
        if (effectiveMode == OcrMode.Fast || effectiveMode == OcrMode.Default)
        {
            dataFolder = "fast";
        }
        else
        {
            dataFolder = "best";
        }

        bool anyLanguageFilePresent = ocrParams.LanguageCode
            .Split('+')
            .Any(code => File.Exists(Path.Combine(TesseractBasePath, dataFolder, $"{code.ToLowerInvariant()}.traineddata")));

        if (!anyLanguageFilePresent)
        {
            // None of the requested languages exist in the chosen folder: switch to the other
            // folder, and make the mode follow the folder that is actually used.
            if (dataFolder == "fast")
            {
                dataFolder = "best";
                effectiveMode = OcrMode.Best;
            }
            else
            {
                dataFolder = "fast";
                effectiveMode = OcrMode.Fast;
            }
        }

        string arguments;
        if (effectiveMode == OcrMode.Best)
        {
            arguments = "--oem 1";
        }
        else if (effectiveMode == OcrMode.Legacy)
        {
            arguments = "--oem 0";
        }
        else
        {
            arguments = "";
        }

        return new RunInfo
        {
            Arguments = arguments,
            DataPath = dataFolder,
            PrefixPath = dataFolder
        };
    }
}

View File

@ -37,124 +37,9 @@ public class TesseractLanguageData
public TesseractLanguage[] Data { get; set; }
#region Tesseract 3.04 Language Data (auto-generated)
#region Tesseract Language Data (auto-generated)
public static readonly TesseractLanguageData V304 = new TesseractLanguageData(new []
{
new TesseractLanguage("afr.traineddata.gz", "afr", "Afrikaans", 1.93, "a669186130bf1fc6c78226ac868c82b70a44c70b"),
new TesseractLanguage("amh.traineddata.gz", "amh", "Amharic", 1.03, "1153cbbac7306d42e72ca639ff3f36f45dcb15a2"),
new TesseractLanguage("ara.traineddata.gz", "ara", "Arabic", 1.62, "87b76c73fdcc4c54ec1f03d83b6df665430c2b06", true),
new TesseractLanguage("asm.traineddata.gz", "asm", "Assamese", 6.56, "223900790d10f638b7dca2a8b8e8a15295d1f19c"),
new TesseractLanguage("aze.traineddata.gz", "aze", "Azerbaijani", 2.54, "01607e49fe6ba6604f65d9b57c77b403ab74040a", true),
new TesseractLanguage("aze_cyrl.traineddata.gz", "aze_cyrl", "Azerbaijani (Cyrillic)", 0.97, "f9c9b153e8825bb92d9c8005342ac3d5ea81d0bc"),
new TesseractLanguage("bel.traineddata.gz", "bel", "Belarusian", 2.43, "3ac0935dd22f4f2730286d5cb127324d27718410"),
new TesseractLanguage("ben.traineddata.gz", "ben", "Bengali", 6.45, "479674b283db6e84fdfb17386056f2e9a5b41b9c"),
new TesseractLanguage("bod.traineddata.gz", "bod", "Tibetan", 10.74, "3ff199544dc9e7994658231cbc999878e23463db"),
new TesseractLanguage("bos.traineddata.gz", "bos", "Bosnian", 1.87, "9d0bb89c53251789bba06de1452cf1a74d978f35"),
new TesseractLanguage("bul.traineddata.gz", "bul", "Bulgarian", 2.20, "ac0481cc1fe62c3af5a34d57fa1571dfd2a95865"),
new TesseractLanguage("cat.traineddata.gz", "cat", "Catalan", 1.97, "e1e1dc2e37f6b085bdefdb9d0d63d3ad086ef1f4"),
new TesseractLanguage("ceb.traineddata.gz", "ceb", "Cebuano", 0.58, "f867102f828b6495996370eea6ed8688af219b17"),
new TesseractLanguage("ces.traineddata.gz", "ces", "Czech", 4.65, "155f60a0994f1590d3d3ba29ec1a5bca3f16efdd"),
new TesseractLanguage("chi_sim.traineddata.gz", "chi_sim", "Chinese (Simplified)", 17.60, "9bd65dcecd2581e8f588cec11cd1e2f754885fcb"),
new TesseractLanguage("chi_tra.traineddata.gz", "chi_tra", "Chinese (Traditional)", 24.11, "5abef9af8a4fd83a0d156ee2e1d5234c80bb836b"),
new TesseractLanguage("chr.traineddata.gz", "chr", "Cherokee", 0.36, "d3677cb6c57ec1b14625a5594dad159a1ad9ec93"),
new TesseractLanguage("cym.traineddata.gz", "cym", "Welsh", 1.36, "a5d5733d45710f6da1c4b19f0903bf5edb10a484"),
new TesseractLanguage("dan.traineddata.gz", "dan", "Danish", 2.76, "eb813b0c299261b9535a2c684e51f159f05ae8ea"),
new TesseractLanguage("dan_frak.traineddata.gz", "dan_frak", "Danish (Fraktur)", 0.65, "dcb540024688da096399e52ff9826aad1d71479c"),
new TesseractLanguage("deu.traineddata.gz", "deu", "German", 5.48, "f575f3fcb554077b906aaaac8850d5bd56967cbd"),
new TesseractLanguage("deu_frak.traineddata.gz", "deu_frak", "German (Fraktur)", 0.78, "28ac257129f881b3a09c099004048bf6de4bc952"),
new TesseractLanguage("dzo.traineddata.gz", "dzo", "Dzongkha", 1.32, "6eb0c943242e4d906cbebec2cf43b2ca63979424"),
new TesseractLanguage("ell.traineddata.gz", "ell", "Greek", 2.00, "e54ab7455c1d4715652253321f693e221b61ac8b"),
new TesseractLanguage("eng.traineddata.gz", "eng", "English", 9.02, "36bfd5953540b3c294c62402e303f381cee156f3"),
new TesseractLanguage("enm.traineddata.gz", "enm", "Middle English (1100-1500)", 0.77, "02486b802f4f83b5d9198309955cbf4aa38e5e05"),
new TesseractLanguage("epo.traineddata.gz", "epo", "Esperanto", 2.42, "465dfb934eb45116ebe7f3c4e3adf28826e49dca"),
new TesseractLanguage("equ.traineddata.gz", "equ", "Math / equation detection", 0.78, "c9bc582875cf7c7903b529a9cdb0b9f4669b840d"),
new TesseractLanguage("est.traineddata.gz", "est", "Estonian", 3.62, "d743f2456fa32ce7bbbb80cb40951eb742692596"),
new TesseractLanguage("eus.traineddata.gz", "eus", "Basque", 1.83, "d991552b861e5ea1dca59ffca7e295b323e62bbf"),
new TesseractLanguage("fas.traineddata.gz", "fas", "Persian", 1.75, "c8a7a6b11c3f455b07a397af2e51705a68ff5f77", true),
new TesseractLanguage("fin.traineddata.gz", "fin", "Finnish", 4.98, "90232ad3572901a35bd4bbc736d47184171fa0fd"),
new TesseractLanguage("fra.traineddata.gz", "fra", "French", 5.65, "2bebc5a4c981443c1cbff254e0ca3120004a6c7b"),
new TesseractLanguage("frk.traineddata.gz", "frk", "Frankish", 6.64, "1a6984f8b5768ae663f293ea04594fca229bdb16"),
new TesseractLanguage("frm.traineddata.gz", "frm", "Middle French (ca. 1400-1600)", 6.34, "64e0c6e00352833b206f8b26b6410d0d544b798d"),
new TesseractLanguage("gle.traineddata.gz", "gle", "Irish", 1.25, "994c111e9c24e74bf7105f42a3e39d87ea24f258"),
new TesseractLanguage("glg.traineddata.gz", "glg", "Galician", 2.04, "201c627e518099c15dbbecd72e6e4782e389f619"),
new TesseractLanguage("grc.traineddata.gz", "grc", "Ancient Greek", 1.88, "ae58a943620c485d33ba95b3fcaca79314105d56"),
new TesseractLanguage("guj.traineddata.gz", "guj", "Gujarati", 4.39, "f469d7257f39dcdd0668d768886f19084816b10e"),
new TesseractLanguage("hat.traineddata.gz", "hat", "Haitian", 0.49, "1667e25ebfe6dc74695af413f291e20f1eec552a"),
new TesseractLanguage("heb.traineddata.gz", "heb", "Hebrew", 1.51, "64401c999ef08d6190a11a4347c8f9acf40a8e50", true),
new TesseractLanguage("hin.traineddata.gz", "hin", "Hindi", 6.28, "dae6a9a729ad84eded87fef69004d89249170d44"),
new TesseractLanguage("hrv.traineddata.gz", "hrv", "Croatian", 3.33, "b05db705553607afe3d3f2385dc7f272f348a59c"),
new TesseractLanguage("hun.traineddata.gz", "hun", "Hungarian", 4.62, "250f8b5ad6464e3f0ad8694c0b54392cf6c9d73b"),
new TesseractLanguage("iku.traineddata.gz", "iku", "Inuktitut", 0.30, "119af8b174547aa9cb00f04512d4960d523863ad"),
new TesseractLanguage("ind.traineddata.gz", "ind", "Indonesian", 2.51, "f46f56473ba850408499678c349bdb6dc544dc67"),
new TesseractLanguage("isl.traineddata.gz", "isl", "Icelandic", 2.28, "54004c851361c36ddf48b4443caf79188fa757b6"),
new TesseractLanguage("ita.traineddata.gz", "ita", "Italian", 5.40, "1730f0e32cad3bd76a4f58de67d7c8e2cde17b51"),
new TesseractLanguage("ita_old.traineddata.gz", "ita_old", "Italian (Old)", 5.35, "b7a4293b464cbcce08fd5dc15a9831cff888cdf0"),
new TesseractLanguage("jav.traineddata.gz", "jav", "Javanese", 1.60, "3caa600f063705a2649be289038f381ecdaa8989"),
new TesseractLanguage("jpn.traineddata.gz", "jpn", "Japanese", 13.65, "7545927e6c60888a61556af4247e81c7a08cc17d"),
new TesseractLanguage("kan.traineddata.gz", "kan", "Kannada", 15.12, "53d26da4fde19b5663f4e7748809ba4baf12fe96"),
new TesseractLanguage("kat.traineddata.gz", "kat", "Georgian", 2.23, "8c48267883781ad2278f052259fe4094c64ef9bb"),
new TesseractLanguage("kat_old.traineddata.gz", "kat_old", "Georgian (Old)", 0.19, "88e8312c3fc30ba03811d5d571e44158bc0ab5bf"),
new TesseractLanguage("kaz.traineddata.gz", "kaz", "Kazakh", 1.65, "45c6603afcfe4d81990439df3bed13dd1b4c654b"),
new TesseractLanguage("khm.traineddata.gz", "khm", "Central Khmer", 20.96, "d5a542959114b154db4db61419cd57aba1e3cf5a"),
new TesseractLanguage("kir.traineddata.gz", "kir", "Kirghiz", 2.02, "ee9ba20cde7597688140fc43b14e49417d1052b7"),
new TesseractLanguage("kor.traineddata.gz", "kor", "Korean", 5.11, "39b452ede31b196c66442ea580b5664377eabdab"),
new TesseractLanguage("kur.traineddata.gz", "kur", "Kurdish", 0.73, "a36683c3f62415e1d12529b7642b9463c880db0c", true),
new TesseractLanguage("lao.traineddata.gz", "lao", "Lao", 8.70, "95dbad397571d2d2c13ed63ddc16a51fca343cfb"),
new TesseractLanguage("lat.traineddata.gz", "lat", "Latin", 2.04, "43dc27088ecce88915f6de15c7f6ec9037eebfee"),
new TesseractLanguage("lav.traineddata.gz", "lav", "Latvian", 2.91, "db4e13d875a4c88bd6d8873a7db95fcbd7f9114b"),
new TesseractLanguage("lit.traineddata.gz", "lit", "Lithuanian", 3.28, "fae20b8933a2c49fb9d98539299c7452d530514a"),
new TesseractLanguage("mal.traineddata.gz", "mal", "Malayalam", 3.49, "77a6553e0a37ddf5935a4e81b918850b8babb379"),
new TesseractLanguage("mar.traineddata.gz", "mar", "Marathi", 5.85, "36297ba7adad4e476815a1ab962b556994e85196"),
new TesseractLanguage("mkd.traineddata.gz", "mkd", "Macedonian", 1.36, "63a9ce25d9e2ce9e169ac17e422564809be21fb2"),
new TesseractLanguage("mlt.traineddata.gz", "mlt", "Maltese", 1.96, "18cb93ee612c4c7989c005cdf3a228c4e524db67"),
new TesseractLanguage("msa.traineddata.gz", "msa", "Malay", 2.47, "a40a2af1a06db7cbf4ecef903bff645d7ee3cfc3"),
new TesseractLanguage("mya.traineddata.gz", "mya", "Burmese", 29.36, "f5875d22dc164da4176856ced8521790dfa986a8"),
new TesseractLanguage("nep.traineddata.gz", "nep", "Nepali", 6.53, "55940992c6269123a49c0f0f616d766f9cb3aa4c"),
new TesseractLanguage("nld.traineddata.gz", "nld", "Dutch", 6.83, "7a19402e128c97ffb5044780c055344e4b92cceb"),
new TesseractLanguage("nor.traineddata.gz", "nor", "Norwegian", 3.14, "33fd288a93a5260954b0fca37894ce50d8872971"),
new TesseractLanguage("ori.traineddata.gz", "ori", "Oriya", 3.06, "cc4951bf162f3e06f83a7f63868dc0ba2a86c83c"),
// new TesseractLanguage { Filename = "osd.traineddata.gz", Code = "osd", LangName = "", Size = 4.08, Sha1 = "d8c10c1fca9b954ca2500e6abeee94b50329f486" },
new TesseractLanguage("pan.traineddata.gz", "pan", "Panjabi", 4.06, "ec846c1a93576f85878de4b06fa82241782cf2a4"),
new TesseractLanguage("pol.traineddata.gz", "pol", "Polish", 5.41, "55a31b8724722219ce80f0a75685f267ae221d3d"),
new TesseractLanguage("por.traineddata.gz", "por", "Portuguese", 5.06, "c486d3ba8ad2d7555f894352313f4c5cfb287dca"),
new TesseractLanguage("pus.traineddata.gz", "pus", "Pushto", 0.88, "c45f471412ae0a7b4ed92141c828963911fa5f15"),
new TesseractLanguage("ron.traineddata.gz", "ron", "Romanian", 2.99, "e21ef667ff7bb90904cf0d731ebe184854cde616"),
new TesseractLanguage("rus.traineddata.gz", "rus", "Russian", 6.05, "96d7897ddecc7f944b5c1751e9ff44416cc3ee21"),
new TesseractLanguage("san.traineddata.gz", "san", "Sanskrit", 9.52, "c324b96fc4f1dcd2295329081f18be98e1c71053"),
new TesseractLanguage("sin.traineddata.gz", "sin", "Sinhala", 2.60, "145f8b7da56fe12340d4a0ce3f0c1385e437398c"),
new TesseractLanguage("slk.traineddata.gz", "slk", "Slovakian", 3.45, "abe9737fb49c9284a10cbb87b9efa773234af5c3"),
new TesseractLanguage("slk_frak.traineddata.gz", "slk_frak", "Slovakian (Fraktur)", 0.28, "e12b4fd2b4d2739656ed28142ba5db081d49fce2"),
new TesseractLanguage("slv.traineddata.gz", "slv", "Slovenian", 2.47, "d94468d01fec2bbcb8be23e97ec5329ef58c541f"),
new TesseractLanguage("spa.traineddata.gz", "spa", "Spanish", 6.31, "89160dbb92dbb5bcd6c48237315f6aa892450ef1"),
new TesseractLanguage("spa_old.traineddata.gz", "spa_old", "Spanish (Old)", 6.57, "9d13656da6a91ca4717f9235340f0304c7f77110"),
new TesseractLanguage("sqi.traineddata.gz", "sqi", "Albanian", 2.40, "30957e11c55610634dfdd2704ff0d6036c2e4ca5"),
new TesseractLanguage("srp.traineddata.gz", "srp", "Serbian", 1.56, "5a7ef0c3c37d7f1891bde5a96b92b2fd3e48783a"),
new TesseractLanguage("srp_latn.traineddata.gz", "srp_latn", "Serbian (Latin)", 2.27, "2aa8ff0e22440d3aab1a59e47b416bcd7ab2e7ae"),
new TesseractLanguage("swa.traineddata.gz", "swa", "Swahili", 1.43, "6010b9255c1cd98c8bda39cd18904bf7782942e1"),
new TesseractLanguage("swe.traineddata.gz", "swe", "Swedish", 3.64, "1bd6fd11f36b3ca04342a521773179269c5410e3"),
new TesseractLanguage("syr.traineddata.gz", "syr", "Syriac", 1.06, "01aa53fd62897bcbfc053401405485d6f6aa9df9"),
new TesseractLanguage("tam.traineddata.gz", "tam", "Tamil", 1.99, "eaca5e8c91d7995894ff2dafc4b824f305d6fff0"),
new TesseractLanguage("tel.traineddata.gz", "tel", "Telugu", 16.81, "1f5b1e2f3d8a772b406e4a2b9d8ec38f1eec4cc6"),
new TesseractLanguage("tgk.traineddata.gz", "tgk", "Tajik", 0.40, "b839d70a88e1dc2a019d1b7e76b83e5dcb0df440"),
new TesseractLanguage("tgl.traineddata.gz", "tgl", "Tagalog", 1.56, "0bdbb9e5f763ebfeef8fc9cd0ba1913bd7309755"),
new TesseractLanguage("tha.traineddata.gz", "tha", "Thai", 5.61, "7a171182716c99c19c1cc9b934a70ef5bee7893a"),
new TesseractLanguage("tir.traineddata.gz", "tir", "Tigrinya", 0.60, "4292700b180a505c4a45666a13eac6e144b48615"),
new TesseractLanguage("tur.traineddata.gz", "tur", "Turkish", 5.61, "8d72dc5ec5f22073f6b3ae2f79534e36aa8f63e8"),
new TesseractLanguage("uig.traineddata.gz", "uig", "Uighur", 0.72, "d20262f24476229539b4b87efa9327428052b241"),
new TesseractLanguage("ukr.traineddata.gz", "ukr", "Ukrainian", 2.92, "0871744dfacfa446e212e5c7e671c790b5fdd2f0"),
new TesseractLanguage("urd.traineddata.gz", "urd", "Urdu", 1.83, "be2964ca83114ee04b3a258e71525b8a1a670c97", true),
new TesseractLanguage("uzb.traineddata.gz", "uzb", "Uzbek", 1.55, "8de3127c90628514d61c0ded9510d4b2728f4b69"),
new TesseractLanguage("uzb_cyrl.traineddata.gz", "uzb_cyrl", "Uzbek (Cyrillic)", 1.19, "e1190d147d6ce3770d768724c82e103b06c93061"),
new TesseractLanguage("vie.traineddata.gz", "vie", "Vietnamese", 2.27, "571e132cd3ed26f5c33943efe7aa17835d277a15"),
new TesseractLanguage("yid.traineddata.gz", "yid", "Yiddish", 1.60, "0dbb6e19b660b57283f954eb5183cc2f3677fdda"),
});
#endregion
#region Tesseract 4.00Beta4 Language Data (auto-generated)
public static readonly TesseractLanguageData V400B4 = new TesseractLanguageData(new[]
public static readonly TesseractLanguageData Latest = new(new[]
{
new TesseractLanguage("afr.traineddata.zip", "afr", "Afrikaans", 5.44, "4278120a18e3464194df302f55417afc35415af7"),
new TesseractLanguage("amh.traineddata.zip", "amh", "Amharic", 5.55, "166219c79a3c92775ac8cc987fba91899dc63f7d"),

View File

@ -0,0 +1,34 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Describes the downloadable Tesseract language data files and reports which of them are currently installed.
/// </summary>
public class TesseractLanguageManager
{
    // Download locations tried for each language file; "{0}" is the placeholder for the file name.
    private static readonly List<DownloadMirror> Mirrors = new()
    {
        new(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-4.0.0b4/{0}"),
        new(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://sourceforge.net/projects/naps2/files/components/tesseract-4.0.0b4/{0}/download")
    };

    /// <summary>
    /// Creates a manager whose language files live in the "tesseract-4.0.0b4" folder under <paramref name="basePath"/>.
    /// </summary>
    /// <param name="basePath">The components folder that contains (or will contain) the Tesseract data folder.</param>
    public TesseractLanguageManager(string basePath)
    {
        LanguageData = TesseractLanguageData.Latest;
        TesseractBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");
        // Materialize the component list once. Leaving this as a deferred Select would construct a fresh set of
        // component and download-info objects on every enumeration (each Installed/NotInstalled query, each form).
        LanguageComponents = LanguageData.Data.Select(x =>
                new MultiFileExternalComponent(
                    $"ocr-{x.Code}",
                    TesseractBasePath,
                    new[] { $"best/{x.Code}.traineddata", $"fast/{x.Code}.traineddata" },
                    new DownloadInfo(x.Filename, Mirrors, x.Size, x.Sha1, DownloadFormat.Zip)))
            .ToList();
    }

    /// <summary>
    /// Languages whose component currently reports <c>IsInstalled</c>. The filter runs each time the
    /// sequence is enumerated, so the result reflects the install state at enumeration time.
    /// </summary>
    public virtual IEnumerable<Language> InstalledLanguages =>
        LanguageComponents.Where(x => x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);

    /// <summary>
    /// Languages whose component does not currently report <c>IsInstalled</c>; evaluated on enumeration.
    /// </summary>
    public virtual IEnumerable<Language> NotInstalledLanguages =>
        LanguageComponents.Where(x => !x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);

    /// <summary>The folder that holds the "best" and "fast" language data subfolders.</summary>
    public string TesseractBasePath { get; protected init; }

    /// <summary>The language metadata the components are built from.</summary>
    public TesseractLanguageData LanguageData { get; protected init; }

    /// <summary>One component per known language, with an id of the form "ocr-{code}".</summary>
    public IEnumerable<IExternalComponent> LanguageComponents { get; protected init; }
}

View File

@ -1,45 +1,39 @@
using System.Threading;
using NAPS2.Dependencies;
using System.Threading;
namespace NAPS2.Ocr;
public abstract class TesseractBaseEngine : IOcrEngine
public class TesseractOcrEngine : IOcrEngine
{
private const int CHECK_INTERVAL = 500;
private readonly string _exePath;
private readonly string? _languageDataBasePath;
public bool CanProcess(string langCode)
public TesseractOcrEngine(string exePath, string? languageDataBasePath)
{
if (string.IsNullOrEmpty(langCode) || !IsInstalled || !IsSupported)
{
return false;
}
// Support multiple specified languages (e.g. "eng+fra")
return langCode.Split('+').All(code => InstalledLanguages.Any(x => x.Code == code));
_exePath = exePath;
_languageDataBasePath = languageDataBasePath;
}
public async Task<OcrResult?> ProcessImage(string imagePath, OcrParams ocrParams, CancellationToken cancelToken)
{
string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
string tempHocrFilePathWithExt = tempHocrFilePath + TesseractHocrExtension;
string tempHocrFilePathWithExt = tempHocrFilePath + ".hocr";
try
{
var runInfo = TesseractRunInfo(ocrParams);
var startInfo = new ProcessStartInfo
{
FileName = Path.Combine(TesseractBasePath, TesseractExePath),
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} {runInfo.Arguments} hocr",
FileName = _exePath,
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} hocr",
UseShellExecute = false,
CreateNoWindow = true,
RedirectStandardOutput = true,
RedirectStandardError = true
};
if (runInfo.PrefixPath != null)
if (_languageDataBasePath != null)
{
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = Path.Combine(TesseractBasePath, runInfo.PrefixPath);
}
if (runInfo.DataPath != null)
{
var tessdata = new DirectoryInfo(Path.Combine(TesseractBasePath, runInfo.DataPath));
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
string languageDataPath = Path.Combine(_languageDataBasePath, subfolder);
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
var tessdata = new DirectoryInfo(languageDataPath);
EnsureHocrConfigExists(tessdata);
}
var tesseractProcess = Process.Start(startInfo);
@ -107,8 +101,10 @@ public abstract class TesseractBaseEngine : IOcrEngine
var elements = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
.Select(x => new OcrResultElement(x.Value, GetBounds(x.Attribute("title"))));
var rtl = InstalledLanguages.Where(x => x.Code == ocrParams.LanguageCode).Select(x => x.RTL)
.FirstOrDefault();
var rtl = false;
// TODO: Can we detect rtl from the hocr file?
// var rtl = _data.InstalledLanguages.Where(x => x.Code == ocrParams.LanguageCode).Select(x => x.RTL)
// .FirstOrDefault();
return new OcrResult(pageBounds, elements, rtl);
}
catch (Exception e)
@ -162,51 +158,32 @@ public abstract class TesseractBaseEngine : IOcrEngine
}
return bounds;
}
public bool CanInstall { get; protected set; }
public IEnumerable<OcrMode> SupportedModes { get; protected set; }
public IExternalComponent Component { get; protected set; }
public IEnumerable<IExternalComponent> LanguageComponents { get; protected set; }
public virtual bool IsSupported => PlatformSupport.Validate();
public virtual bool IsInstalled => Component.IsInstalled;
public virtual IEnumerable<Language> InstalledLanguages =>
LanguageComponents.Where(x => x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);
public virtual IEnumerable<Language> NotInstalledLanguages =>
LanguageComponents.Where(x => !x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);
protected string TesseractBasePath { get; set; }
protected string TesseractExePath { get; set; }
protected string TesseractHocrExtension { get; set; } = ".hocr";
protected DownloadInfo DownloadInfo { get; set; }
protected PlatformSupport PlatformSupport { get; set; }
protected TesseractLanguageData LanguageData { get; set; }
protected virtual RunInfo TesseractRunInfo(OcrParams ocrParams) => new RunInfo
{
Arguments = "",
DataPath = "tessdata",
PrefixPath = ""
};
protected class RunInfo
{
public string Arguments { get; set; }
public string? PrefixPath { get; set; }
public string? DataPath { get; set; }
}
// TODO: For local engine (where we need to manually direct to the language data)
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams)
// {
// OcrMode mode = ocrParams.Mode;
// string folder = mode == OcrMode.Fast || mode == OcrMode.Default ? "fast" : "best";
// if (ocrParams.LanguageCode.Split('+').All(code => !File.Exists(Path.Combine(TesseractBasePath, folder, $"{code.ToLowerInvariant()}.traineddata"))))
// {
// // Use the other source if the selected one doesn't exist
// folder = folder == "fast" ? "best" : "fast";
// mode = folder == "fast" ? OcrMode.Fast : OcrMode.Best;
// }
//
// return new()
// {
// Arguments = mode == OcrMode.Best ? "--oem 1" : mode == OcrMode.Legacy ? "--oem 0" : "",
// DataPath = folder,
// PrefixPath = folder
// };
// TODO: For system engine (where the language data is externally managed)
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams) => new()
// {
// Arguments = "",
// DataPath = null,
// PrefixPath = null
// };
}

View File

@ -0,0 +1,10 @@
namespace NAPS2.Ocr;
/// <summary>
/// Options describing how a Tesseract process should be launched: extra command-line arguments and
/// optional locations for its language data.
/// </summary>
public class TesseractRunInfo
{
    /// <summary>
    /// Extra command-line arguments. Defaults to the empty string so the non-nullable
    /// annotation holds even when the property is never assigned.
    /// </summary>
    public string Arguments { get; set; } = "";

    /// <summary>
    /// Optional path used for the TESSDATA_PREFIX environment variable; null leaves it unset.
    /// NOTE(review): inferred from the engine code that consumes the equivalent RunInfo type - confirm.
    /// </summary>
    public string? PrefixPath { get; set; }

    /// <summary>
    /// Optional path of the language data folder; null when the data is managed externally.
    /// NOTE(review): inferred from the engine code that consumes the equivalent RunInfo type - confirm.
    /// </summary>
    public string? DataPath { get; set; }
}

View File

@ -1,77 +1,77 @@
using NAPS2.Dependencies;
namespace NAPS2.Ocr;
/// <summary>
/// Tesseract engine that runs the "tesseract" executable found on the system path instead of a
/// downloaded component. Installation state and the language list are discovered by running
/// "tesseract --list-langs", and the result is reused for two seconds between probes.
/// </summary>
public class TesseractSystemEngine : TesseractBaseEngine
{
    private bool _isInstalled;
    private DateTime? _installCheckTime;
    private List<Language>? _installedLanguages;

    public TesseractSystemEngine()
    {
        // Use the most complete set of language mappings
        LanguageData = TesseractLanguageData.V400B4;
        TesseractBasePath = "";
        TesseractExePath = "tesseract";
        PlatformSupport = PlatformSupport.Linux;
        CanInstall = false;
    }

    /// <summary>
    /// Language data is managed by the system installation, so no data or prefix path is supplied.
    /// </summary>
    protected override RunInfo TesseractRunInfo(OcrParams ocrParams) => new RunInfo
    {
        Arguments = "",
        DataPath = null,
        PrefixPath = null
    };

    /// <summary>True once a probe of the system executable has succeeded.</summary>
    public override bool IsInstalled
    {
        get
        {
            CheckIfInstalled();
            return _isInstalled;
        }
    }

    /// <summary>The languages reported by the last successful probe, or an empty sequence.</summary>
    public override IEnumerable<Language> InstalledLanguages
    {
        get
        {
            CheckIfInstalled();
            return _installedLanguages ?? Enumerable.Empty<Language>();
        }
    }

    /// <summary>Always empty: this engine cannot install languages itself.</summary>
    public override IEnumerable<Language> NotInstalledLanguages => Enumerable.Empty<Language>();

    /// <summary>
    /// Runs "tesseract --list-langs" (at most once every two seconds) and records the result.
    /// A failure to start or read the process leaves the previously recorded state untouched.
    /// </summary>
    private void CheckIfInstalled()
    {
        if (IsSupported && (_installCheckTime == null || _installCheckTime < DateTime.Now - TimeSpan.FromSeconds(2)))
        {
            try
            {
                // Dispose the Process object so each probe doesn't leak an OS handle.
                using var process = Process.Start(new ProcessStartInfo
                {
                    FileName = TesseractExePath,
                    Arguments = "--list-langs",
                    UseShellExecute = false,
                    RedirectStandardOutput = true,
                    RedirectStandardError = true
                });
                if (process != null && process.Id != 0)
                {
                    // Only three-character lines are treated as language codes, which also skips the header line.
                    // NOTE(review): this also drops longer codes (e.g. "chi_sim") and reads stderr only - confirm intended.
                    var codes = process.StandardError.ReadToEnd().Split(new[] {'\r', '\n'}, StringSplitOptions.RemoveEmptyEntries).Where(x => x.Length == 3);
                    _installedLanguages = codes.Select(code => LanguageData.LanguageMap.Get($"ocr-{code}")).WhereNotNull().ToList();
                    _isInstalled = true;
                    // The process has normally exited once stderr is drained; only kill a straggler.
                    if (!process.HasExited)
                    {
                        process.Kill();
                    }
                }
            }
            catch (Exception)
            {
                // Component is not installed on the system path (or had an error)
            }
            _installCheckTime = DateTime.Now;
        }
    }
}
// using NAPS2.Dependencies;
//
// namespace NAPS2.Ocr;
//
// public class TesseractSystemData : TesseractOcrEngineData
// {
// private bool _isInstalled;
// private DateTime? _installCheckTime;
// private List<Language>? _installedLanguages;
//
// public TesseractSystemData()
// {
// // Use the most complete set of language mappings
// LanguageData = TesseractLanguageData.V400B4;
// TesseractBasePath = "";
// TesseractExePath = "tesseract";
// PlatformSupport = PlatformSupport.Linux;
// CanInstall = false;
// }
//
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams) => new()
// {
// Arguments = "",
// DataPath = null,
// PrefixPath = null
// };
//
// public override bool IsInstalled
// {
// get
// {
// CheckIfInstalled();
// return _isInstalled;
// }
// }
//
// public override IEnumerable<Language> InstalledLanguages
// {
// get
// {
// CheckIfInstalled();
// return _installedLanguages ?? Enumerable.Empty<Language>();
// }
// }
//
// public override IEnumerable<Language> NotInstalledLanguages => Enumerable.Empty<Language>();
//
// private void CheckIfInstalled()
// {
// if (IsSupported && (_installCheckTime == null || _installCheckTime < DateTime.Now - TimeSpan.FromSeconds(2)))
// {
// try
// {
// var process = Process.Start(new ProcessStartInfo
// {
// FileName = TesseractExePath,
// Arguments = "--list-langs",
// UseShellExecute = false,
// RedirectStandardOutput = true,
// RedirectStandardError = true
// });
// if (process != null && process.Id != 0)
// {
// var codes = process.StandardError.ReadToEnd().Split(new[] {'\r', '\n'}, StringSplitOptions.RemoveEmptyEntries).Where(x => x.Length == 3);
// _installedLanguages = codes.Select(code => LanguageData.LanguageMap.Get($"ocr-{code}")).WhereNotNull().ToList();
// _isInstalled = true;
// process.Kill();
// }
// }
// catch (Exception)
// {
// // Component is not installed on the system path (or had an error)
// }
// _installCheckTime = DateTime.Now;
// }
// }
// }

View File

@ -15,18 +15,16 @@ public class BatchScanPerformer : IBatchScanPerformer
private readonly PdfExporter _pdfExporter;
private readonly IOperationFactory _operationFactory;
private readonly IConfigProvider<PdfSettings> _pdfSettingsProvider;
private readonly OcrEngineManager _ocrEngineManager;
private readonly IFormFactory _formFactory;
private readonly ScopedConfig _config;
private readonly IProfileManager _profileManager;
public BatchScanPerformer(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory, IConfigProvider<PdfSettings> pdfSettingsProvider, OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
public BatchScanPerformer(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory, IConfigProvider<PdfSettings> pdfSettingsProvider, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
{
_scanPerformer = scanPerformer;
_pdfExporter = pdfExporter;
_operationFactory = operationFactory;
_pdfSettingsProvider = pdfSettingsProvider;
_ocrEngineManager = ocrEngineManager;
_formFactory = formFactory;
_config = config;
_profileManager = profileManager;
@ -34,7 +32,7 @@ public class BatchScanPerformer : IBatchScanPerformer
public async Task PerformBatchScan(IConfigProvider<BatchSettings> settings, FormBase batchForm, Action<ProcessedImage> imageCallback, Action<string> progressCallback, CancellationToken cancelToken)
{
var state = new BatchState(_scanPerformer, _pdfExporter, _operationFactory, _pdfSettingsProvider, _ocrEngineManager, _formFactory, _config, _profileManager)
var state = new BatchState(_scanPerformer, _pdfExporter, _operationFactory, _pdfSettingsProvider, _formFactory, _config, _profileManager)
{
Settings = settings,
ProgressCallback = progressCallback,
@ -51,7 +49,6 @@ public class BatchScanPerformer : IBatchScanPerformer
private readonly PdfExporter _pdfExporter;
private readonly IOperationFactory _operationFactory;
private readonly IConfigProvider<PdfSettings> _pdfSettingsProvider;
private readonly OcrEngineManager _ocrEngineManager;
private readonly IFormFactory _formFactory;
private readonly ScopedConfig _config;
private readonly IProfileManager _profileManager;
@ -61,13 +58,12 @@ public class BatchScanPerformer : IBatchScanPerformer
private List<List<ProcessedImage>> _scans;
public BatchState(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory,
IConfigProvider<PdfSettings> pdfSettingsProvider, OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
IConfigProvider<PdfSettings> pdfSettingsProvider, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
{
_scanPerformer = scanPerformer;
_pdfExporter = pdfExporter;
_operationFactory = operationFactory;
_pdfSettingsProvider = pdfSettingsProvider;
_ocrEngineManager = ocrEngineManager;
_formFactory = formFactory;
_config = config;
_profileManager = profileManager;

View File

@ -20,12 +20,12 @@ internal class ScanPerformer : IScanPerformer
private readonly ErrorOutput _errorOutput;
private readonly ScanOptionsValidator _scanOptionsValidator;
private readonly IScanBridgeFactory _scanBridgeFactory;
private readonly OcrEngineManager _ocrEngineManager;
private readonly IOcrEngine _ocrEngine;
public ScanPerformer(IFormFactory formFactory, ScopedConfig config, OperationProgress operationProgress,
AutoSaver autoSaver, IProfileManager profileManager, ErrorOutput errorOutput,
ScanOptionsValidator scanOptionsValidator, IScanBridgeFactory scanBridgeFactory,
ScanningContext scanningContext, OcrEngineManager ocrEngineManager)
ScanningContext scanningContext, IOcrEngine ocrEngine)
{
_formFactory = formFactory;
_config = config;
@ -36,7 +36,7 @@ internal class ScanPerformer : IScanPerformer
_scanOptionsValidator = scanOptionsValidator;
_scanBridgeFactory = scanBridgeFactory;
_scanningContext = scanningContext;
_ocrEngineManager = ocrEngineManager;
_ocrEngine = ocrEngine;
}
public async Task<ScanDevice> PromptForDevice(ScanProfile scanProfile, IntPtr dialogParent = default)
@ -101,7 +101,7 @@ internal class ScanPerformer : IScanPerformer
OcrController ocrController = new OcrController(_scanningContext);
if (scanParams.DoOcr)
{
ocrController.Engine = _ocrEngineManager.ActiveEngine;
ocrController.Engine = _ocrEngine;
if (ocrController.Engine == null)
{
Log.Error("OCR is enabled but no OCR engine is available.");

View File

@ -34,7 +34,7 @@ namespace NAPS2.WinForms
private readonly ImageContext _imageContext;
private readonly StringWrapper _stringWrapper;
private readonly RecoveryManager _recoveryManager;
private readonly OcrEngineManager _ocrEngineManager;
private readonly TesseractLanguageManager _tesseractLanguageManager;
private readonly IScanPerformer _scanPerformer;
private readonly IScannedImagePrinter _scannedImagePrinter;
private readonly StillImage _stillImage;
@ -70,7 +70,7 @@ namespace NAPS2.WinForms
#region Initialization and Culture
public FDesktop(ImageContext imageContext, StringWrapper stringWrapper, RecoveryManager recoveryManager, OcrEngineManager ocrEngineManager,
public FDesktop(ImageContext imageContext, StringWrapper stringWrapper, RecoveryManager recoveryManager, TesseractLanguageManager tesseractLanguageManager,
IScanPerformer scanPerformer, IScannedImagePrinter scannedImagePrinter, StillImage stillImage, IOperationFactory operationFactory,
KeyboardShortcutManager ksm, ThumbnailRenderer thumbnailRenderer, WinFormsExportHelper exportHelper, ImageClipboard imageClipboard,
NotificationManager notify, CultureInitializer cultureInitializer, IWorkerFactory workerFactory, OperationProgress operationProgress,
@ -80,7 +80,7 @@ namespace NAPS2.WinForms
_imageContext = imageContext;
_stringWrapper = stringWrapper;
_recoveryManager = recoveryManager;
_ocrEngineManager = ocrEngineManager;
_tesseractLanguageManager = tesseractLanguageManager;
_scanPerformer = scanPerformer;
_scannedImagePrinter = scannedImagePrinter;
_stillImage = stillImage;
@ -1165,33 +1165,14 @@ namespace NAPS2.WinForms
private void tsOcr_Click(object sender, EventArgs e)
{
if (_ocrEngineManager.MustUpgrade && !Config.Get(c => c.NoUpdatePrompt))
if (_tesseractLanguageManager.InstalledLanguages.Any())
{
// Re-download a fixed version on Windows XP if needed
MessageBox.Show(MiscResources.OcrUpdateAvailable, "", MessageBoxButtons.OK, MessageBoxIcon.Information);
var progressForm = FormFactory.Create<FDownloadProgress>();
progressForm.QueueFile(_ocrEngineManager.EngineToInstall.Component);
progressForm.ShowDialog();
}
if (_ocrEngineManager.MustInstallPackage)
{
const string packages = "\ntesseract-ocr";
MessageBox.Show(MiscResources.TesseractNotAvailable + packages, MiscResources.Error, MessageBoxButtons.OK, MessageBoxIcon.Error);
}
else if (_ocrEngineManager.IsReady)
{
if (_ocrEngineManager.CanUpgrade && !Config.Get(c => c.NoUpdatePrompt))
{
MessageBox.Show(MiscResources.OcrUpdateAvailable, "", MessageBoxButtons.OK, MessageBoxIcon.Information);
FormFactory.Create<FOcrLanguageDownload>().ShowDialog();
}
FormFactory.Create<FOcrSetup>().ShowDialog();
}
else
{
FormFactory.Create<FOcrLanguageDownload>().ShowDialog();
if (_ocrEngineManager.IsReady)
if (_tesseractLanguageManager.InstalledLanguages.Any())
{
FormFactory.Create<FOcrSetup>().ShowDialog();
}

View File

@ -5,26 +5,16 @@ namespace NAPS2.WinForms
{
public partial class FOcrLanguageDownload : FormBase
{
private readonly OcrEngineManager _ocrEngineManager;
private readonly IOcrEngine _engineToInstall;
private readonly TesseractLanguageManager _tesseractLanguageManager;
public FOcrLanguageDownload(OcrEngineManager ocrEngineManager)
public FOcrLanguageDownload(TesseractLanguageManager tesseractLanguageManager)
{
_ocrEngineManager = ocrEngineManager;
_engineToInstall = ocrEngineManager.EngineToInstall;
_tesseractLanguageManager = tesseractLanguageManager;
InitializeComponent();
var initialSelection = new HashSet<string>();
if (ocrEngineManager.InstalledEngine != null && ocrEngineManager.InstalledEngine != _engineToInstall)
{
// Upgrading from an old version, so pre-select previously used languages
foreach (var lang in ocrEngineManager.InstalledEngine.LanguageComponents.Where(x => x.IsInstalled))
{
initialSelection.Add(lang.Id);
}
}
if (!_engineToInstall.InstalledLanguages.Any())
// TODO: We used to select old installed languages here, maybe we could do it again if we get new lang data
if (!_tesseractLanguageManager.InstalledLanguages.Any())
{
// Fresh install, so pre-select English as a sensible default
initialSelection.Add("ocr-eng");
@ -32,7 +22,7 @@ namespace NAPS2.WinForms
// Populate the list of language options
// Special case for English: sorted to the top of the list
var languageOptions = _engineToInstall.NotInstalledLanguages
var languageOptions = _tesseractLanguageManager.NotInstalledLanguages
.OrderBy(x => x.Code == "eng" ? "AAA" : x.Name);
foreach (var languageOption in languageOptions)
{
@ -63,16 +53,11 @@ namespace NAPS2.WinForms
private void UpdateView()
{
var selectedLanguages = SelectedLanguages;
double downloadSize = _engineToInstall.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)).Select(x => x.DownloadInfo.Size).Sum();
if (!_engineToInstall.IsInstalled)
{
downloadSize += _engineToInstall.Component.DownloadInfo.Size;
}
double downloadSize = _tesseractLanguageManager.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)).Select(x => x.DownloadInfo.Size).Sum();
labelSizeEstimate.Text = string.Format(MiscResources.EstimatedDownloadSize, downloadSize.ToString("f1"));
btnDownload.Enabled = lvLanguages.Items.Cast<ListViewItem>().Any(x => x.Checked) || _engineToInstall.InstalledLanguages.Any() && !_engineToInstall.IsInstalled;
btnDownload.Enabled = lvLanguages.Items.Cast<ListViewItem>().Any(x => x.Checked);
}
private HashSet<string> SelectedLanguages
@ -94,13 +79,8 @@ namespace NAPS2.WinForms
{
var progressForm = FormFactory.Create<FDownloadProgress>();
if (!_engineToInstall.IsInstalled)
{
progressForm.QueueFile(_engineToInstall.Component);
}
var selectedLanguages = SelectedLanguages;
foreach (var langComponent in _engineToInstall.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)))
foreach (var langComponent in _tesseractLanguageManager.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)))
{
progressForm.QueueFile(langComponent);
}

View File

@ -6,22 +6,18 @@ namespace NAPS2.WinForms;
public partial class FOcrSetup : FormBase
{
private readonly OcrEngineManager _ocrEngineManager;
private readonly List<OcrMode> _availableModes;
private readonly TesseractLanguageManager _tesseractLanguageManager;
private readonly List<OcrMode> _availableModes = new() { OcrMode.Fast, OcrMode.Best };
public FOcrSetup(OcrEngineManager ocrEngineManager)
public FOcrSetup(TesseractLanguageManager tesseractLanguageManager)
{
_ocrEngineManager = ocrEngineManager;
_tesseractLanguageManager = tesseractLanguageManager;
InitializeComponent();
comboOcrMode.Format += (sender, e) => e.Value = ((Enum)e.ListItem).Description();
_availableModes = ocrEngineManager.ActiveEngine?.SupportedModes?.ToList();
if (_availableModes != null)
foreach (var mode in _availableModes)
{
foreach (var mode in _availableModes)
{
comboOcrMode.Items.Add(mode);
}
comboOcrMode.Items.Add(mode);
}
}
@ -38,11 +34,6 @@ public partial class FOcrSetup : FormBase
comboLanguages.DisplayMember = "Name";
comboLanguages.ValueMember = "Code";
ConditionalControls.UnlockHeight(this);
ConditionalControls.SetVisible(comboOcrMode, _availableModes != null, 8);
labelOcrMode.Visible = _availableModes != null;
ConditionalControls.LockHeight(this);
checkBoxEnableOcr.Checked = Config.Get(c => c.EnableOcr);
SetSelectedValue(comboLanguages, Config.Get(c => c.OcrLanguageCode));
SetSelectedItem(comboOcrMode, Config.Get(c => c.OcrMode));
@ -73,12 +64,10 @@ public partial class FOcrSetup : FormBase
private void LoadLanguages()
{
var languages = _ocrEngineManager.ActiveEngine?.InstalledLanguages
var languages = _tesseractLanguageManager.InstalledLanguages
.OrderBy(x => x.Name)
.ToList();
comboLanguages.DataSource = languages;
linkGetLanguages.Visible = _ocrEngineManager.EngineToInstall != null;
}
private void UpdateView()

View File

@ -17,4 +17,6 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=mapi/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Pdfium/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Pdfs/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=rmse/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
<s:Boolean x:Key="/Default/UserDictionary/Words/=rmse/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=TESSDATA/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Tesseract/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>