mirror of
https://github.com/cyanfish/naps2.git
synced 2024-08-16 10:40:35 +03:00
WIP: Rewrite tesseract code and strip out all old tesseract configs. Going forward we'll bundle the exe and only support the latest traineddata.
This commit is contained in:
parent
9ab618acee
commit
f4dde8ee14
@ -31,7 +31,7 @@ public class CommonModule : NinjectModule
|
||||
Bind<IScannedImagePrinter>().To<PrintDocumentPrinter>();
|
||||
Bind<IEmailProviderFactory>().To<NinjectEmailProviderFactory>();
|
||||
Bind<IMapiWrapper>().To<MapiWrapper>();
|
||||
Bind<OcrEngineManager>().ToMethod(ctx => OcrEngineManager.Default);
|
||||
// TODO: Bind TesseractLanguageManager
|
||||
Bind<OcrRequestQueue>().ToSelf().InSingletonScope();
|
||||
|
||||
// Scan
|
||||
|
@ -19,11 +19,10 @@ public static class StaticConfiguration
|
||||
Debug.Listeners.Add(new NLogTraceListener());
|
||||
#endif
|
||||
|
||||
// TODO: Initialize TesseractLanguageManager here?
|
||||
var customPath = config.Get(c => c.ComponentsPath);
|
||||
var basePath = string.IsNullOrWhiteSpace(customPath)
|
||||
? Paths.Components
|
||||
: Environment.ExpandEnvironmentVariables(customPath);
|
||||
|
||||
OcrEngineManager.Default = new OcrEngineManager(basePath);
|
||||
}
|
||||
}
|
@ -20,7 +20,7 @@ public class AutomatedScanning
|
||||
private readonly ErrorOutput _errorOutput;
|
||||
private readonly IScannedImageImporter _scannedImageImporter;
|
||||
private readonly IOperationFactory _operationFactory;
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly TesseractLanguageManager _tesseractLanguageManager;
|
||||
private readonly IFormFactory _formFactory;
|
||||
private readonly ScopedConfig _config;
|
||||
private readonly TransactionConfigScope<CommonConfig> _userTransact;
|
||||
@ -43,7 +43,7 @@ public class AutomatedScanning
|
||||
public AutomatedScanning(ConsoleOutput output, AutomatedScanningOptions options, ImageContext imageContext,
|
||||
IScanPerformer scanPerformer, ErrorOutput errorOutput, IEmailProviderFactory emailProviderFactory,
|
||||
IScannedImageImporter scannedImageImporter, IOperationFactory operationFactory,
|
||||
OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config,
|
||||
TesseractLanguageManager tesseractLanguageManager, IFormFactory formFactory, ScopedConfig config,
|
||||
IProfileManager profileManager, RecoveryStorageManager recoveryStorageManager, ScanningContext scanningContext)
|
||||
{
|
||||
_output = output;
|
||||
@ -54,7 +54,7 @@ public class AutomatedScanning
|
||||
_emailProviderFactory = emailProviderFactory;
|
||||
_scannedImageImporter = scannedImageImporter;
|
||||
_operationFactory = operationFactory;
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_tesseractLanguageManager = tesseractLanguageManager;
|
||||
_formFactory = formFactory;
|
||||
_config = config;
|
||||
_profileManager = profileManager;
|
||||
@ -173,12 +173,7 @@ public class AutomatedScanning
|
||||
private void InstallComponents()
|
||||
{
|
||||
var availableComponents = new List<IExternalComponent>();
|
||||
var ocrEngine = _ocrEngineManager.EngineToInstall;
|
||||
if (ocrEngine != null)
|
||||
{
|
||||
availableComponents.Add(ocrEngine.Component);
|
||||
availableComponents.AddRange(ocrEngine.LanguageComponents);
|
||||
}
|
||||
availableComponents.AddRange(_tesseractLanguageManager.LanguageComponents);
|
||||
|
||||
var componentDict = availableComponents.ToDictionary(x => x.Id.ToLowerInvariant());
|
||||
var installId = _options.Install.ToLowerInvariant();
|
||||
|
@ -100,7 +100,7 @@ public class CommandLineIntegrationTests : ContextualTexts
|
||||
public override void Load()
|
||||
{
|
||||
Rebind<ImageContext>().ToConstant(_imageContext);
|
||||
Rebind<OcrEngineManager>().ToConstant(new OcrEngineManager());
|
||||
// TODO: Bind TesseractLanguageManager
|
||||
Rebind<IScanDriverFactory>().ToConstant(_scanDriverFactory);
|
||||
Rebind<IScanBridgeFactory>().To<InProcScanBridgeFactory>();
|
||||
Rebind<ConsoleOutput>().ToSelf().WithConstructorArgument("writer", new TestOutputTextWriter(_testOutputHelper));
|
||||
|
@ -38,6 +38,10 @@
|
||||
<Generator>ResXFileCodeGenerator</Generator>
|
||||
<LastGenOutput>ImageImporterTestsData.Designer.cs</LastGenOutput>
|
||||
</EmbeddedResource>
|
||||
<EmbeddedResource Update="Ocr\TesseractResources.resx">
|
||||
<Generator>ResXFileCodeGenerator</Generator>
|
||||
<LastGenOutput>TesseractResources.Designer.cs</LastGenOutput>
|
||||
</EmbeddedResource>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
@ -46,6 +50,20 @@
|
||||
<AutoGen>True</AutoGen>
|
||||
<DependentUpon>ImageImporterTestsData.resx</DependentUpon>
|
||||
</Compile>
|
||||
<Compile Update="Ocr\TesseractResources.Designer.cs">
  <DesignTime>True</DesignTime>
  <AutoGen>True</AutoGen>
  <!-- Must name the .resx that generates this designer file (Ocr\TesseractResources.resx). -->
  <DependentUpon>TesseractResources.resx</DependentUpon>
</Compile>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<None Update="Resources\tesseract-5.1.0-x64.exe">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="Resources\eng.traineddata">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
@ -1,10 +0,0 @@
|
||||
namespace NAPS2.Sdk.Tests.Ocr;
|
||||
|
||||
/// <summary>
/// Placeholder for OCR tests. It declares no test methods; it only records the planned coverage below.
/// </summary>
public class TesseractEngineTests
|
||||
{
|
||||
// TODO: Add OCR tests covering the following:
|
||||
// (1) Integration test that runs a real tesseract, produces a real pdf, and parses the pdf
|
||||
// (2) Unit test for hOCR parsing
|
||||
// (3) Unit tests for OcrEngineManager
|
||||
// (4) Further cases still to be decided
|
||||
}
|
34
NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
Normal file
34
NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
Normal file
@ -0,0 +1,34 @@
|
||||
using System.Threading;
|
||||
using NAPS2.Ocr;
|
||||
using Xunit;
|
||||
|
||||
namespace NAPS2.Sdk.Tests.Ocr;
|
||||
|
||||
/// <summary>
/// Runs <see cref="TesseractOcrEngine"/> against a sample image using the executable and English
/// language data embedded in <c>TesseractResources</c>.
/// </summary>
public class TesseractOcrEngineTests : ContextualTexts
{
    private readonly TesseractOcrEngine _engine;
    private readonly string _testImagePath;

    /// <summary>
    /// Unpacks the executable, the English trained data (into a "fast" subfolder) and the sample
    /// image into <c>FolderPath</c>, then creates the engine under test.
    /// </summary>
    public TesseractOcrEngineTests()
    {
        // NOTE(review): FolderPath comes from the base class; presumably a per-test scratch folder - confirm.
        var exePath = WriteResource(FolderPath, "tesseract.exe", TesseractResources.tesseract_x64);

        var fastDataFolder = Path.Combine(FolderPath, "fast");
        Directory.CreateDirectory(fastDataFolder);
        WriteResource(fastDataFolder, "eng.traineddata", TesseractResources.eng_traineddata);

        _testImagePath = WriteResource(FolderPath, "ocr_test.jpg", TesseractResources.ocr_test);

        _engine = new TesseractOcrEngine(exePath, FolderPath);
    }

    /// <summary>
    /// Processes the sample image in fast mode with the "eng" language and expects a non-null result.
    /// </summary>
    [Fact]
    public async Task RunTesseract()
    {
        var ocrParams = new OcrParams("eng", OcrMode.Fast, 0);

        var result = await _engine.ProcessImage(_testImagePath, ocrParams, CancellationToken.None);

        Assert.NotNull(result);
    }

    /// <summary>
    /// Writes <paramref name="contents"/> to <paramref name="fileName"/> inside
    /// <paramref name="folder"/> and returns the full path of the written file.
    /// </summary>
    private static string WriteResource(string folder, string fileName, byte[] contents)
    {
        var fullPath = Path.Combine(folder, fileName);
        File.WriteAllBytes(fullPath, contents);
        return fullPath;
    }
}
|
93
NAPS2.Sdk.Tests/Ocr/TesseractResources.Designer.cs
generated
Normal file
93
NAPS2.Sdk.Tests/Ocr/TesseractResources.Designer.cs
generated
Normal file
@ -0,0 +1,93 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// <auto-generated>
|
||||
// This code was generated by a tool.
|
||||
// Runtime Version:4.0.30319.42000
|
||||
//
|
||||
// Changes to this file may cause incorrect behavior and will be lost if
|
||||
// the code is regenerated.
|
||||
// </auto-generated>
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace NAPS2.Sdk.Tests.Ocr {
    using System;


    /// <summary>
    ///   Strongly-typed accessors for the resources stored under the
    ///   "NAPS2.Sdk.Tests.Ocr.TesseractResources" resource base name.
    /// </summary>
    // This class is tool-generated (StronglyTypedResourceBuilder). To add or remove a member,
    // edit the .resx file and regenerate; manual changes are lost on regeneration.
    [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")]
    [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
    [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
    internal class TesseractResources {

        private static global::System.Resources.ResourceManager resourceMan;

        private static global::System.Globalization.CultureInfo resourceCulture;

        [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
        internal TesseractResources() {
        }

        /// <summary>
        ///   Gets the ResourceManager for this class, creating and caching it on first access.
        /// </summary>
        [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
        internal static global::System.Resources.ResourceManager ResourceManager {
            get {
                return resourceMan ?? (resourceMan = new global::System.Resources.ResourceManager(
                    "NAPS2.Sdk.Tests.Ocr.TesseractResources", typeof(TesseractResources).Assembly));
            }
        }

        /// <summary>
        ///   Gets or sets the culture passed to every resource lookup made through this class.
        /// </summary>
        [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
        internal static global::System.Globalization.CultureInfo Culture {
            get => resourceCulture;
            set => resourceCulture = value;
        }

        /// <summary>
        ///   Looks up the "eng_traineddata" resource as a byte array.
        /// </summary>
        internal static byte[] eng_traineddata => LoadBytes("eng_traineddata");

        /// <summary>
        ///   Looks up the "ocr_test" resource as a byte array.
        /// </summary>
        internal static byte[] ocr_test => LoadBytes("ocr_test");

        /// <summary>
        ///   Looks up the "tesseract_x64" resource as a byte array.
        /// </summary>
        internal static byte[] tesseract_x64 => LoadBytes("tesseract_x64");

        // Shared lookup used by the byte-array accessors above.
        private static byte[] LoadBytes(string resourceName) {
            return (byte[])ResourceManager.GetObject(resourceName, resourceCulture);
        }
    }
}
|
130
NAPS2.Sdk.Tests/Ocr/TesseractResources.resx
Normal file
130
NAPS2.Sdk.Tests/Ocr/TesseractResources.resx
Normal file
@ -0,0 +1,130 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<root>
|
||||
<!--
|
||||
Microsoft ResX Schema
|
||||
|
||||
Version 2.0
|
||||
|
||||
The primary goal of this format is to allow a simple XML format
|
||||
that is mostly human readable. The generation and parsing of the
|
||||
various data types are done through the TypeConverter classes
|
||||
associated with the data types.
|
||||
|
||||
Example:
|
||||
|
||||
... ado.net/XML headers & schema ...
|
||||
<resheader name="resmimetype">text/microsoft-resx</resheader>
|
||||
<resheader name="version">2.0</resheader>
|
||||
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
|
||||
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
|
||||
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
|
||||
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
|
||||
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
|
||||
<value>[base64 mime encoded serialized .NET Framework object]</value>
|
||||
</data>
|
||||
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
|
||||
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
|
||||
<comment>This is a comment</comment>
|
||||
</data>
|
||||
|
||||
There are any number of "resheader" rows that contain simple
|
||||
name/value pairs.
|
||||
|
||||
Each data row contains a name, and value. The row also contains a
|
||||
type or mimetype. Type corresponds to a .NET class that supports
|
||||
text/value conversion through the TypeConverter architecture.
|
||||
Classes that don't support this are serialized and stored with the
|
||||
mimetype set.
|
||||
|
||||
The mimetype is used for serialized objects, and tells the
|
||||
ResXResourceReader how to depersist the object. This is currently not
|
||||
extensible. For a given mimetype the value must be set accordingly:
|
||||
|
||||
Note - application/x-microsoft.net.object.binary.base64 is the format
|
||||
that the ResXResourceWriter will generate, however the reader can
|
||||
read any of the formats listed below.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.binary.base64
|
||||
value : The object must be serialized with
|
||||
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.soap.base64
|
||||
value : The object must be serialized with
|
||||
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
|
||||
: and then encoded with base64 encoding.
|
||||
|
||||
mimetype: application/x-microsoft.net.object.bytearray.base64
|
||||
value : The object must be serialized into a byte array
|
||||
: using a System.ComponentModel.TypeConverter
|
||||
: and then encoded with base64 encoding.
|
||||
-->
|
||||
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
|
||||
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
|
||||
<xsd:element name="root" msdata:IsDataSet="true">
|
||||
<xsd:complexType>
|
||||
<xsd:choice maxOccurs="unbounded">
|
||||
<xsd:element name="metadata">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" use="required" type="xsd:string" />
|
||||
<xsd:attribute name="type" type="xsd:string" />
|
||||
<xsd:attribute name="mimetype" type="xsd:string" />
|
||||
<xsd:attribute ref="xml:space" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="assembly">
|
||||
<xsd:complexType>
|
||||
<xsd:attribute name="alias" type="xsd:string" />
|
||||
<xsd:attribute name="name" type="xsd:string" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="data">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
|
||||
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
|
||||
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
|
||||
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
|
||||
<xsd:attribute ref="xml:space" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
<xsd:element name="resheader">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="name" type="xsd:string" use="required" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
</xsd:choice>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
</xsd:schema>
|
||||
<resheader name="resmimetype">
|
||||
<value>text/microsoft-resx</value>
|
||||
</resheader>
|
||||
<resheader name="version">
|
||||
<value>2.0</value>
|
||||
</resheader>
|
||||
<resheader name="reader">
|
||||
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
<resheader name="writer">
|
||||
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
<assembly alias="System.Windows.Forms" name="System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" />
|
||||
<data name="tesseract_x64" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>..\Resources\tesseract-5.1.0-x64.exe;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
<data name="eng_traineddata" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>..\Resources\eng.traineddata;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
<data name="ocr_test" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>..\Resources\ocr_test.jpg;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
</root>
|
BIN
NAPS2.Sdk.Tests/Resources/eng.traineddata
Normal file
BIN
NAPS2.Sdk.Tests/Resources/eng.traineddata
Normal file
Binary file not shown.
BIN
NAPS2.Sdk.Tests/Resources/ocr_test.jpg
Normal file
BIN
NAPS2.Sdk.Tests/Resources/ocr_test.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 86 KiB |
@ -20,7 +20,7 @@ public class AutoSaver
|
||||
private readonly TiffHelper _tiffHelper;
|
||||
private readonly ImageContext _imageContext;
|
||||
|
||||
public AutoSaver(IConfigProvider<PdfSettings> pdfSettingsProvider, IConfigProvider<ImageSettings> imageSettingsProvider, OcrEngineManager ocrEngineManager, OcrRequestQueue ocrRequestQueue, ErrorOutput errorOutput, DialogHelper dialogHelper, OperationProgress operationProgress, ISaveNotify notify, PdfExporter pdfExporter, OverwritePrompt overwritePrompt, ScopedConfig config, TiffHelper tiffHelper, ImageContext imageContext)
|
||||
public AutoSaver(IConfigProvider<PdfSettings> pdfSettingsProvider, IConfigProvider<ImageSettings> imageSettingsProvider, ErrorOutput errorOutput, DialogHelper dialogHelper, OperationProgress operationProgress, ISaveNotify notify, PdfExporter pdfExporter, OverwritePrompt overwritePrompt, ScopedConfig config, TiffHelper tiffHelper, ImageContext imageContext)
|
||||
{
|
||||
_pdfSettingsProvider = pdfSettingsProvider;
|
||||
_imageSettingsProvider = imageSettingsProvider;
|
||||
|
@ -74,10 +74,6 @@ public class PdfSharpExporter : PdfExporter
|
||||
{
|
||||
Log.Error("Supported OCR engine not installed.", ocrParams.LanguageCode);
|
||||
}
|
||||
else if (!activeEngine.CanProcess(ocrParams.LanguageCode))
|
||||
{
|
||||
Log.Error("OCR files not available for '{0}'.", ocrParams.LanguageCode);
|
||||
}
|
||||
else
|
||||
{
|
||||
ocrEngine = activeEngine;
|
||||
|
@ -1,27 +1,8 @@
|
||||
using System.Threading;
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Contract for an OCR engine: it can process an image into an OCR result and reports its
/// support, installation state, languages and downloadable components.
/// </summary>
public interface IOcrEngine
|
||||
{
|
||||
/// <summary>Reports whether this engine can process the given language code.</summary>
// NOTE(review): whether langCode may be a '+'-joined list of codes is not visible here - confirm.
bool CanProcess(string langCode);
|
||||
|
||||
/// <summary>
/// Asynchronously runs OCR on the image file at <paramref name="imagePath"/>.
/// </summary>
/// <returns>The OCR result; the return type is nullable, so callers must handle null.</returns>
// NOTE(review): the conditions under which null is returned (failure vs. cancellation) are not visible here.
Task<OcrResult?> ProcessImage(string imagePath, OcrParams ocrParams, CancellationToken cancelToken);
|
||||
|
||||
/// <summary>Whether the engine is supported.</summary>
// NOTE(review): presumably means "supported on the current platform" - confirm against implementations.
bool IsSupported { get; }
|
||||
|
||||
/// <summary>Whether the engine is installed.</summary>
bool IsInstalled { get; }
|
||||
|
||||
/// <summary>Whether the engine can be installed.</summary>
bool CanInstall { get; }
|
||||
|
||||
/// <summary>The languages that are installed for this engine.</summary>
IEnumerable<Language> InstalledLanguages { get; }
|
||||
|
||||
/// <summary>The languages that are not installed for this engine.</summary>
IEnumerable<Language> NotInstalledLanguages { get; }
|
||||
|
||||
/// <summary>The external component for the engine itself.</summary>
IExternalComponent Component { get; }
|
||||
|
||||
/// <summary>The external components for the engine's languages.</summary>
IEnumerable<IExternalComponent> LanguageComponents { get; }
|
||||
|
||||
/// <summary>The OCR modes this engine supports.</summary>
IEnumerable<OcrMode> SupportedModes { get; }
|
||||
}
|
@ -50,10 +50,6 @@ public class OcrController
|
||||
{
|
||||
throw new InvalidOperationException("OCR is enabled but no language code is specified.");
|
||||
}
|
||||
if (!Engine.CanProcess(OcrParams.LanguageCode))
|
||||
{
|
||||
throw new InvalidOperationException("OCR is enabled but the engine can't handle the specified language.");
|
||||
}
|
||||
|
||||
CancellationTokenSource cts = new CancellationTokenSource();
|
||||
_cancellationTokenSources.Add(image.GetWeakReference(), cts);
|
||||
|
@ -1,82 +0,0 @@
|
||||
using NAPS2.Scan;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Holds an ordered list of OCR engines (preferred/newer first) and answers which engine is usable,
/// installable or in need of an upgrade.
/// </summary>
public class OcrEngineManager : IOcrEngineProvider
{
    private static OcrEngineManager _default = new OcrEngineManager();

    /// <summary>
    /// Gets or sets the shared default manager.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
    public static OcrEngineManager Default
    {
        get
        {
            // NOTE(review): presumably rejects use of static defaults while running under tests - confirm.
            TestingContext.NoStaticDefaults();
            return _default;
        }
        set => _default = value ?? throw new ArgumentNullException(nameof(value));
    }

    private readonly List<IOcrEngine> _engines;

    /// <summary>
    /// Creates a new instance of OcrEngineManager that only looks for Tesseract on the system path.
    /// </summary>
    public OcrEngineManager()
    {
        _engines = new List<IOcrEngine>
        {
            new TesseractSystemEngine()
        };
    }

    /// <summary>
    /// Creates a new instance of OcrEngineManager with the specified engines.
    /// </summary>
    /// <param name="orderedEngineList">
    /// The engines to manage. The order is important: preferred/newer engines first. The sequence is
    /// copied, so later changes to the source are not observed.
    /// </param>
    public OcrEngineManager(IEnumerable<IOcrEngine> orderedEngineList)
    {
        _engines = orderedEngineList.ToList();
    }

    /// <summary>
    /// Creates a new instance of OcrEngineManager with the default set of engines.
    /// </summary>
    /// <param name="basePath">The base path for installed engines.</param>
    public OcrEngineManager(string basePath)
    {
        _engines = new List<IOcrEngine>
        {
            new Tesseract400Beta4Engine(basePath),
            new Tesseract304Engine(basePath),
            new Tesseract304XpEngine(basePath),
            new Tesseract302Engine(basePath),
            new TesseractSystemEngine()
        };
    }

    /// <summary>The managed engines, in preference order.</summary>
    public IEnumerable<IOcrEngine> Engines => _engines;

    /// <summary>True when some supported engine is installed with at least one language.</summary>
    public bool IsReady => _engines.Any(IsSupportedAndUsable);

    /// <summary>
    /// True when the first supported engine in the list is installed with at least one language.
    /// False when no engine is supported.
    /// </summary>
    public bool IsNewestReady
    {
        get
        {
            var latest = _engines.FirstOrDefault(x => x.IsSupported);
            return latest != null && IsUsable(latest);
        }
    }

    /// <summary>True when some engine is installed but the newest supported engine is not ready.</summary>
    public bool CanUpgrade => !IsNewestReady && _engines.Any(x => x.IsInstalled);

    /// <summary>True when some engine is installed but no supported engine is ready.</summary>
    public bool MustUpgrade => !IsReady && _engines.Any(x => x.IsInstalled);

    /// <summary>
    /// True when no engine is installed and none is both supported and installable.
    /// </summary>
    public bool MustInstallPackage => _engines.All(x => (!x.IsSupported || !x.CanInstall) && !x.IsInstalled);

    /// <summary>
    /// The first supported engine that is installed with at least one language, or null if none.
    /// </summary>
    public IOcrEngine? ActiveEngine => _engines.FirstOrDefault(IsSupportedAndUsable);

    /// <summary>
    /// The first engine that is installed with at least one language, supported or not, or null if none.
    /// </summary>
    public IOcrEngine? InstalledEngine => _engines.FirstOrDefault(IsUsable);

    /// <summary>The first engine that is both supported and installable, or null if none.</summary>
    public IOcrEngine? EngineToInstall => _engines.FirstOrDefault(x => x.IsSupported && x.CanInstall);

    // An engine counts as usable only when it is installed and has at least one installed language.
    private static bool IsUsable(IOcrEngine engine) => engine.IsInstalled && engine.InstalledLanguages.Any();

    // Checks support first so unsupported engines are never asked about their installation state.
    private static bool IsSupportedAndUsable(IOcrEngine engine) => engine.IsSupported && IsUsable(engine);
}
|
@ -1,22 +0,0 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Engine definition for Tesseract 3.0.2, located in the "tesseract-3.0.2" folder under the base
/// path. It is marked as not installable and its components carry no download information.
/// </summary>
public class Tesseract302Engine : TesseractBaseEngine
{
    /// <param name="basePath">The folder that contains the "tesseract-3.0.2" folder.</param>
    public Tesseract302Engine(string basePath)
    {
        // Using the newer data since we just need the 302 engine for backwards compatibility
        LanguageData = TesseractLanguageData.V304;
        TesseractBasePath = Path.Combine(basePath, "tesseract-3.0.2");
        TesseractExePath = "tesseract.exe";
        TesseractHocrExtension = ".html";
        PlatformSupport = PlatformSupport.Windows;
        CanInstall = false;

        var exeFullPath = Path.Combine(TesseractBasePath, TesseractExePath);
        Component = new ExternalComponent("ocr", exeFullPath, null);

        // One component per language; the on-disk name is the download filename without ".gz".
        LanguageComponents =
            from language in LanguageData.Data
            select new ExternalComponent(
                $"ocr-{language.Code}",
                Path.Combine(TesseractBasePath, "tessdata", language.Filename.Replace(".gz", "")),
                null);
    }
}
|
@ -1,29 +0,0 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Engine definition for Tesseract 3.04, located in the "tesseract-3.0.4" folder under the base
/// path. The executable and each language file are gzip downloads from <see cref="Mirrors"/>.
/// </summary>
public class Tesseract304Engine : TesseractBaseEngine
{
    /// <summary>Download mirrors for the 3.04 files; "{0}" is the placeholder in each URL.</summary>
    protected static readonly List<DownloadMirror> Mirrors = new List<DownloadMirror>
    {
        new DownloadMirror(
            PlatformSupport.ModernWindows.Or(PlatformSupport.Linux),
            @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-3.04/{0}"),
        new DownloadMirror(
            PlatformSupport.ModernWindows.Or(PlatformSupport.Linux),
            @"https://sourceforge.net/projects/naps2/files/components/tesseract-3.04/{0}/download"),
        new DownloadMirror(
            PlatformSupport.WindowsXp,
            @"http://xp-mirror.naps2.com/tesseract-3.04/{0}")
    };

    /// <param name="basePath">The folder that contains the "tesseract-3.0.4" folder.</param>
    public Tesseract304Engine(string basePath)
    {
        LanguageData = TesseractLanguageData.V304;
        TesseractBasePath = Path.Combine(basePath, "tesseract-3.0.4");
        TesseractExePath = "tesseract.exe";
        PlatformSupport = PlatformSupport.ModernWindows;
        CanInstall = true;

        var exeFullPath = Path.Combine(TesseractBasePath, TesseractExePath);
        var exeDownload = new DownloadInfo(
            "tesseract.exe.gz", Mirrors, 1.32, "0b0fd21cd886c04c60ed5c3f38b9120b408139b3", DownloadFormat.Gzip);
        Component = new ExternalComponent("ocr", exeFullPath, exeDownload);

        // One component per language; the on-disk name is the download filename without ".gz".
        LanguageComponents =
            from language in LanguageData.Data
            select new ExternalComponent(
                $"ocr-{language.Code}",
                Path.Combine(TesseractBasePath, "tessdata", language.Filename.Replace(".gz", "")),
                new DownloadInfo(language.Filename, Mirrors, language.Size, language.Sha1, DownloadFormat.Gzip));
    }
}
|
@ -1,15 +0,0 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Variant of <see cref="Tesseract304Engine"/> that uses the "tesseract_xp.exe" executable and
/// declares <c>PlatformSupport.Windows</c> instead of the base class's platform setting.
/// </summary>
public class Tesseract304XpEngine : Tesseract304Engine
{
    /// <param name="basePath">Passed through to <see cref="Tesseract304Engine"/>.</param>
    public Tesseract304XpEngine(string basePath) : base(basePath)
    {
        // Replace the executable-related settings chosen by the base constructor.
        TesseractExePath = "tesseract_xp.exe";
        PlatformSupport = PlatformSupport.Windows;

        var exeFullPath = Path.Combine(TesseractBasePath, TesseractExePath);
        var exeDownload = new DownloadInfo(
            "tesseract_xp.exe.gz", Mirrors, 1.32, "98d15e4765caae864f16fa2ab106e3fd6adbe8c3", DownloadFormat.Gzip);
        Component = new ExternalComponent("ocr", exeFullPath, exeDownload);
    }
}
|
@ -1,51 +0,0 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Engine definition for Tesseract 4.0.0 beta 4, located in the "tesseract-4.0.0b4" folder under
/// the base path. The executable lives in a "w64" or "w32" subfolder depending on process bitness,
/// and each language provides both "best" and "fast" trained data.
/// </summary>
public class Tesseract400Beta4Engine : TesseractBaseEngine
{
    /// <summary>Download mirrors for the 4.0.0b4 files; "{0}" is the placeholder in each URL.</summary>
    protected static readonly List<DownloadMirror> Mirrors = new List<DownloadMirror>
    {
        new DownloadMirror(
            PlatformSupport.ModernWindows.Or(PlatformSupport.Linux),
            @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-4.0.0b4/{0}"),
        new DownloadMirror(
            PlatformSupport.ModernWindows.Or(PlatformSupport.Linux),
            @"https://sourceforge.net/projects/naps2/files/components/tesseract-4.0.0b4/{0}/download")
    };

    /// <param name="basePath">The folder that contains the "tesseract-4.0.0b4" folder.</param>
    public Tesseract400Beta4Engine(string basePath)
    {
        bool is64Bit = Environment.Is64BitProcess;
        string exeFolder = is64Bit ? "w64" : "w32";

        LanguageData = TesseractLanguageData.V400B4;
        TesseractBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");
        TesseractExePath = Path.Combine(exeFolder, "tesseract.exe");
        PlatformSupport = PlatformSupport.ModernWindows;
        CanInstall = true;
        SupportedModes = new[] { OcrMode.Fast, OcrMode.Best, OcrMode.Legacy };

        DownloadInfo download;
        if (is64Bit)
        {
            download = new DownloadInfo("tesseract.exe.w64.zip", Mirrors, 1.83, "4eba9aaf8800a100ef059c512be572e39ae72f4d", DownloadFormat.Zip);
        }
        else
        {
            download = new DownloadInfo("tesseract.exe.w32.zip", Mirrors, 1.56, "300ad281a5fa1c734dbb4a8a4dd49e3a8ab921a4", DownloadFormat.Zip);
        }
        Component = new MultiFileExternalComponent("ocr", Path.Combine(TesseractBasePath, exeFolder), new[] { "tesseract.exe" }, download);

        // Each language component covers both the "best" and the "fast" trained data file.
        LanguageComponents =
            from language in LanguageData.Data
            select new MultiFileExternalComponent(
                $"ocr-{language.Code}",
                TesseractBasePath,
                new[] { $"best/{language.Code}.traineddata", $"fast/{language.Code}.traineddata" },
                new DownloadInfo(language.Filename, Mirrors, language.Size, language.Sha1, DownloadFormat.Zip));
    }

    /// <summary>
    /// Chooses the data folder ("fast" or "best") and the engine arguments for the requested mode.
    /// If none of the requested languages has trained data in the preferred folder, the other folder
    /// is used and the mode is switched to match it.
    /// </summary>
    protected override RunInfo TesseractRunInfo(OcrParams ocrParams)
    {
        OcrMode mode = ocrParams.Mode;
        string folder = mode is OcrMode.Fast or OcrMode.Default ? "fast" : "best";

        bool anyDataInFolder = ocrParams.LanguageCode
            .Split('+')
            .Any(code => File.Exists(Path.Combine(TesseractBasePath, folder, $"{code.ToLowerInvariant()}.traineddata")));
        if (!anyDataInFolder)
        {
            // Use the other source if the selected one doesn't exist
            if (folder == "fast")
            {
                folder = "best";
                mode = OcrMode.Best;
            }
            else
            {
                folder = "fast";
                mode = OcrMode.Fast;
            }
        }

        string arguments = mode switch
        {
            OcrMode.Best => "--oem 1",
            OcrMode.Legacy => "--oem 0",
            _ => ""
        };

        return new RunInfo
        {
            Arguments = arguments,
            DataPath = folder,
            PrefixPath = folder
        };
    }
}
|
@ -37,124 +37,9 @@ public class TesseractLanguageData
|
||||
|
||||
public TesseractLanguage[] Data { get; set; }
|
||||
|
||||
#region Tesseract 3.04 Language Data (auto-generated)
|
||||
#region Tesseract Language Data (auto-generated)
|
||||
|
||||
public static readonly TesseractLanguageData V304 = new TesseractLanguageData(new []
|
||||
{
|
||||
new TesseractLanguage("afr.traineddata.gz", "afr", "Afrikaans", 1.93, "a669186130bf1fc6c78226ac868c82b70a44c70b"),
|
||||
new TesseractLanguage("amh.traineddata.gz", "amh", "Amharic", 1.03, "1153cbbac7306d42e72ca639ff3f36f45dcb15a2"),
|
||||
new TesseractLanguage("ara.traineddata.gz", "ara", "Arabic", 1.62, "87b76c73fdcc4c54ec1f03d83b6df665430c2b06", true),
|
||||
new TesseractLanguage("asm.traineddata.gz", "asm", "Assamese", 6.56, "223900790d10f638b7dca2a8b8e8a15295d1f19c"),
|
||||
new TesseractLanguage("aze.traineddata.gz", "aze", "Azerbaijani", 2.54, "01607e49fe6ba6604f65d9b57c77b403ab74040a", true),
|
||||
new TesseractLanguage("aze_cyrl.traineddata.gz", "aze_cyrl", "Azerbaijani (Cyrillic)", 0.97, "f9c9b153e8825bb92d9c8005342ac3d5ea81d0bc"),
|
||||
new TesseractLanguage("bel.traineddata.gz", "bel", "Belarusian", 2.43, "3ac0935dd22f4f2730286d5cb127324d27718410"),
|
||||
new TesseractLanguage("ben.traineddata.gz", "ben", "Bengali", 6.45, "479674b283db6e84fdfb17386056f2e9a5b41b9c"),
|
||||
new TesseractLanguage("bod.traineddata.gz", "bod", "Tibetan", 10.74, "3ff199544dc9e7994658231cbc999878e23463db"),
|
||||
new TesseractLanguage("bos.traineddata.gz", "bos", "Bosnian", 1.87, "9d0bb89c53251789bba06de1452cf1a74d978f35"),
|
||||
new TesseractLanguage("bul.traineddata.gz", "bul", "Bulgarian", 2.20, "ac0481cc1fe62c3af5a34d57fa1571dfd2a95865"),
|
||||
new TesseractLanguage("cat.traineddata.gz", "cat", "Catalan", 1.97, "e1e1dc2e37f6b085bdefdb9d0d63d3ad086ef1f4"),
|
||||
new TesseractLanguage("ceb.traineddata.gz", "ceb", "Cebuano", 0.58, "f867102f828b6495996370eea6ed8688af219b17"),
|
||||
new TesseractLanguage("ces.traineddata.gz", "ces", "Czech", 4.65, "155f60a0994f1590d3d3ba29ec1a5bca3f16efdd"),
|
||||
new TesseractLanguage("chi_sim.traineddata.gz", "chi_sim", "Chinese (Simplified)", 17.60, "9bd65dcecd2581e8f588cec11cd1e2f754885fcb"),
|
||||
new TesseractLanguage("chi_tra.traineddata.gz", "chi_tra", "Chinese (Traditional)", 24.11, "5abef9af8a4fd83a0d156ee2e1d5234c80bb836b"),
|
||||
new TesseractLanguage("chr.traineddata.gz", "chr", "Cherokee", 0.36, "d3677cb6c57ec1b14625a5594dad159a1ad9ec93"),
|
||||
new TesseractLanguage("cym.traineddata.gz", "cym", "Welsh", 1.36, "a5d5733d45710f6da1c4b19f0903bf5edb10a484"),
|
||||
new TesseractLanguage("dan.traineddata.gz", "dan", "Danish", 2.76, "eb813b0c299261b9535a2c684e51f159f05ae8ea"),
|
||||
new TesseractLanguage("dan_frak.traineddata.gz", "dan_frak", "Danish (Fraktur)", 0.65, "dcb540024688da096399e52ff9826aad1d71479c"),
|
||||
new TesseractLanguage("deu.traineddata.gz", "deu", "German", 5.48, "f575f3fcb554077b906aaaac8850d5bd56967cbd"),
|
||||
new TesseractLanguage("deu_frak.traineddata.gz", "deu_frak", "German (Fraktur)", 0.78, "28ac257129f881b3a09c099004048bf6de4bc952"),
|
||||
new TesseractLanguage("dzo.traineddata.gz", "dzo", "Dzongkha", 1.32, "6eb0c943242e4d906cbebec2cf43b2ca63979424"),
|
||||
new TesseractLanguage("ell.traineddata.gz", "ell", "Greek", 2.00, "e54ab7455c1d4715652253321f693e221b61ac8b"),
|
||||
new TesseractLanguage("eng.traineddata.gz", "eng", "English", 9.02, "36bfd5953540b3c294c62402e303f381cee156f3"),
|
||||
new TesseractLanguage("enm.traineddata.gz", "enm", "Middle English (1100-1500)", 0.77, "02486b802f4f83b5d9198309955cbf4aa38e5e05"),
|
||||
new TesseractLanguage("epo.traineddata.gz", "epo", "Esperanto", 2.42, "465dfb934eb45116ebe7f3c4e3adf28826e49dca"),
|
||||
new TesseractLanguage("equ.traineddata.gz", "equ", "Math / equation detection", 0.78, "c9bc582875cf7c7903b529a9cdb0b9f4669b840d"),
|
||||
new TesseractLanguage("est.traineddata.gz", "est", "Estonian", 3.62, "d743f2456fa32ce7bbbb80cb40951eb742692596"),
|
||||
new TesseractLanguage("eus.traineddata.gz", "eus", "Basque", 1.83, "d991552b861e5ea1dca59ffca7e295b323e62bbf"),
|
||||
new TesseractLanguage("fas.traineddata.gz", "fas", "Persian", 1.75, "c8a7a6b11c3f455b07a397af2e51705a68ff5f77", true),
|
||||
new TesseractLanguage("fin.traineddata.gz", "fin", "Finnish", 4.98, "90232ad3572901a35bd4bbc736d47184171fa0fd"),
|
||||
new TesseractLanguage("fra.traineddata.gz", "fra", "French", 5.65, "2bebc5a4c981443c1cbff254e0ca3120004a6c7b"),
|
||||
new TesseractLanguage("frk.traineddata.gz", "frk", "Frankish", 6.64, "1a6984f8b5768ae663f293ea04594fca229bdb16"),
|
||||
new TesseractLanguage("frm.traineddata.gz", "frm", "Middle French (ca. 1400-1600)", 6.34, "64e0c6e00352833b206f8b26b6410d0d544b798d"),
|
||||
new TesseractLanguage("gle.traineddata.gz", "gle", "Irish", 1.25, "994c111e9c24e74bf7105f42a3e39d87ea24f258"),
|
||||
new TesseractLanguage("glg.traineddata.gz", "glg", "Galician", 2.04, "201c627e518099c15dbbecd72e6e4782e389f619"),
|
||||
new TesseractLanguage("grc.traineddata.gz", "grc", "Ancient Greek", 1.88, "ae58a943620c485d33ba95b3fcaca79314105d56"),
|
||||
new TesseractLanguage("guj.traineddata.gz", "guj", "Gujarati", 4.39, "f469d7257f39dcdd0668d768886f19084816b10e"),
|
||||
new TesseractLanguage("hat.traineddata.gz", "hat", "Haitian", 0.49, "1667e25ebfe6dc74695af413f291e20f1eec552a"),
|
||||
new TesseractLanguage("heb.traineddata.gz", "heb", "Hebrew", 1.51, "64401c999ef08d6190a11a4347c8f9acf40a8e50", true),
|
||||
new TesseractLanguage("hin.traineddata.gz", "hin", "Hindi", 6.28, "dae6a9a729ad84eded87fef69004d89249170d44"),
|
||||
new TesseractLanguage("hrv.traineddata.gz", "hrv", "Croatian", 3.33, "b05db705553607afe3d3f2385dc7f272f348a59c"),
|
||||
new TesseractLanguage("hun.traineddata.gz", "hun", "Hungarian", 4.62, "250f8b5ad6464e3f0ad8694c0b54392cf6c9d73b"),
|
||||
new TesseractLanguage("iku.traineddata.gz", "iku", "Inuktitut", 0.30, "119af8b174547aa9cb00f04512d4960d523863ad"),
|
||||
new TesseractLanguage("ind.traineddata.gz", "ind", "Indonesian", 2.51, "f46f56473ba850408499678c349bdb6dc544dc67"),
|
||||
new TesseractLanguage("isl.traineddata.gz", "isl", "Icelandic", 2.28, "54004c851361c36ddf48b4443caf79188fa757b6"),
|
||||
new TesseractLanguage("ita.traineddata.gz", "ita", "Italian", 5.40, "1730f0e32cad3bd76a4f58de67d7c8e2cde17b51"),
|
||||
new TesseractLanguage("ita_old.traineddata.gz", "ita_old", "Italian (Old)", 5.35, "b7a4293b464cbcce08fd5dc15a9831cff888cdf0"),
|
||||
new TesseractLanguage("jav.traineddata.gz", "jav", "Javanese", 1.60, "3caa600f063705a2649be289038f381ecdaa8989"),
|
||||
new TesseractLanguage("jpn.traineddata.gz", "jpn", "Japanese", 13.65, "7545927e6c60888a61556af4247e81c7a08cc17d"),
|
||||
new TesseractLanguage("kan.traineddata.gz", "kan", "Kannada", 15.12, "53d26da4fde19b5663f4e7748809ba4baf12fe96"),
|
||||
new TesseractLanguage("kat.traineddata.gz", "kat", "Georgian", 2.23, "8c48267883781ad2278f052259fe4094c64ef9bb"),
|
||||
new TesseractLanguage("kat_old.traineddata.gz", "kat_old", "Georgian (Old)", 0.19, "88e8312c3fc30ba03811d5d571e44158bc0ab5bf"),
|
||||
new TesseractLanguage("kaz.traineddata.gz", "kaz", "Kazakh", 1.65, "45c6603afcfe4d81990439df3bed13dd1b4c654b"),
|
||||
new TesseractLanguage("khm.traineddata.gz", "khm", "Central Khmer", 20.96, "d5a542959114b154db4db61419cd57aba1e3cf5a"),
|
||||
new TesseractLanguage("kir.traineddata.gz", "kir", "Kirghiz", 2.02, "ee9ba20cde7597688140fc43b14e49417d1052b7"),
|
||||
new TesseractLanguage("kor.traineddata.gz", "kor", "Korean", 5.11, "39b452ede31b196c66442ea580b5664377eabdab"),
|
||||
new TesseractLanguage("kur.traineddata.gz", "kur", "Kurdish", 0.73, "a36683c3f62415e1d12529b7642b9463c880db0c", true),
|
||||
new TesseractLanguage("lao.traineddata.gz", "lao", "Lao", 8.70, "95dbad397571d2d2c13ed63ddc16a51fca343cfb"),
|
||||
new TesseractLanguage("lat.traineddata.gz", "lat", "Latin", 2.04, "43dc27088ecce88915f6de15c7f6ec9037eebfee"),
|
||||
new TesseractLanguage("lav.traineddata.gz", "lav", "Latvian", 2.91, "db4e13d875a4c88bd6d8873a7db95fcbd7f9114b"),
|
||||
new TesseractLanguage("lit.traineddata.gz", "lit", "Lithuanian", 3.28, "fae20b8933a2c49fb9d98539299c7452d530514a"),
|
||||
new TesseractLanguage("mal.traineddata.gz", "mal", "Malayalam", 3.49, "77a6553e0a37ddf5935a4e81b918850b8babb379"),
|
||||
new TesseractLanguage("mar.traineddata.gz", "mar", "Marathi", 5.85, "36297ba7adad4e476815a1ab962b556994e85196"),
|
||||
new TesseractLanguage("mkd.traineddata.gz", "mkd", "Macedonian", 1.36, "63a9ce25d9e2ce9e169ac17e422564809be21fb2"),
|
||||
new TesseractLanguage("mlt.traineddata.gz", "mlt", "Maltese", 1.96, "18cb93ee612c4c7989c005cdf3a228c4e524db67"),
|
||||
new TesseractLanguage("msa.traineddata.gz", "msa", "Malay", 2.47, "a40a2af1a06db7cbf4ecef903bff645d7ee3cfc3"),
|
||||
new TesseractLanguage("mya.traineddata.gz", "mya", "Burmese", 29.36, "f5875d22dc164da4176856ced8521790dfa986a8"),
|
||||
new TesseractLanguage("nep.traineddata.gz", "nep", "Nepali", 6.53, "55940992c6269123a49c0f0f616d766f9cb3aa4c"),
|
||||
new TesseractLanguage("nld.traineddata.gz", "nld", "Dutch", 6.83, "7a19402e128c97ffb5044780c055344e4b92cceb"),
|
||||
new TesseractLanguage("nor.traineddata.gz", "nor", "Norwegian", 3.14, "33fd288a93a5260954b0fca37894ce50d8872971"),
|
||||
new TesseractLanguage("ori.traineddata.gz", "ori", "Oriya", 3.06, "cc4951bf162f3e06f83a7f63868dc0ba2a86c83c"),
|
||||
// new TesseractLanguage { Filename = "osd.traineddata.gz", Code = "osd", LangName = "", Size = 4.08, Sha1 = "d8c10c1fca9b954ca2500e6abeee94b50329f486" },
|
||||
new TesseractLanguage("pan.traineddata.gz", "pan", "Panjabi", 4.06, "ec846c1a93576f85878de4b06fa82241782cf2a4"),
|
||||
new TesseractLanguage("pol.traineddata.gz", "pol", "Polish", 5.41, "55a31b8724722219ce80f0a75685f267ae221d3d"),
|
||||
new TesseractLanguage("por.traineddata.gz", "por", "Portuguese", 5.06, "c486d3ba8ad2d7555f894352313f4c5cfb287dca"),
|
||||
new TesseractLanguage("pus.traineddata.gz", "pus", "Pushto", 0.88, "c45f471412ae0a7b4ed92141c828963911fa5f15"),
|
||||
new TesseractLanguage("ron.traineddata.gz", "ron", "Romanian", 2.99, "e21ef667ff7bb90904cf0d731ebe184854cde616"),
|
||||
new TesseractLanguage("rus.traineddata.gz", "rus", "Russian", 6.05, "96d7897ddecc7f944b5c1751e9ff44416cc3ee21"),
|
||||
new TesseractLanguage("san.traineddata.gz", "san", "Sanskrit", 9.52, "c324b96fc4f1dcd2295329081f18be98e1c71053"),
|
||||
new TesseractLanguage("sin.traineddata.gz", "sin", "Sinhala", 2.60, "145f8b7da56fe12340d4a0ce3f0c1385e437398c"),
|
||||
new TesseractLanguage("slk.traineddata.gz", "slk", "Slovakian", 3.45, "abe9737fb49c9284a10cbb87b9efa773234af5c3"),
|
||||
new TesseractLanguage("slk_frak.traineddata.gz", "slk_frak", "Slovakian (Fraktur)", 0.28, "e12b4fd2b4d2739656ed28142ba5db081d49fce2"),
|
||||
new TesseractLanguage("slv.traineddata.gz", "slv", "Slovenian", 2.47, "d94468d01fec2bbcb8be23e97ec5329ef58c541f"),
|
||||
new TesseractLanguage("spa.traineddata.gz", "spa", "Spanish", 6.31, "89160dbb92dbb5bcd6c48237315f6aa892450ef1"),
|
||||
new TesseractLanguage("spa_old.traineddata.gz", "spa_old", "Spanish (Old)", 6.57, "9d13656da6a91ca4717f9235340f0304c7f77110"),
|
||||
new TesseractLanguage("sqi.traineddata.gz", "sqi", "Albanian", 2.40, "30957e11c55610634dfdd2704ff0d6036c2e4ca5"),
|
||||
new TesseractLanguage("srp.traineddata.gz", "srp", "Serbian", 1.56, "5a7ef0c3c37d7f1891bde5a96b92b2fd3e48783a"),
|
||||
new TesseractLanguage("srp_latn.traineddata.gz", "srp_latn", "Serbian (Latin)", 2.27, "2aa8ff0e22440d3aab1a59e47b416bcd7ab2e7ae"),
|
||||
new TesseractLanguage("swa.traineddata.gz", "swa", "Swahili", 1.43, "6010b9255c1cd98c8bda39cd18904bf7782942e1"),
|
||||
new TesseractLanguage("swe.traineddata.gz", "swe", "Swedish", 3.64, "1bd6fd11f36b3ca04342a521773179269c5410e3"),
|
||||
new TesseractLanguage("syr.traineddata.gz", "syr", "Syriac", 1.06, "01aa53fd62897bcbfc053401405485d6f6aa9df9"),
|
||||
new TesseractLanguage("tam.traineddata.gz", "tam", "Tamil", 1.99, "eaca5e8c91d7995894ff2dafc4b824f305d6fff0"),
|
||||
new TesseractLanguage("tel.traineddata.gz", "tel", "Telugu", 16.81, "1f5b1e2f3d8a772b406e4a2b9d8ec38f1eec4cc6"),
|
||||
new TesseractLanguage("tgk.traineddata.gz", "tgk", "Tajik", 0.40, "b839d70a88e1dc2a019d1b7e76b83e5dcb0df440"),
|
||||
new TesseractLanguage("tgl.traineddata.gz", "tgl", "Tagalog", 1.56, "0bdbb9e5f763ebfeef8fc9cd0ba1913bd7309755"),
|
||||
new TesseractLanguage("tha.traineddata.gz", "tha", "Thai", 5.61, "7a171182716c99c19c1cc9b934a70ef5bee7893a"),
|
||||
new TesseractLanguage("tir.traineddata.gz", "tir", "Tigrinya", 0.60, "4292700b180a505c4a45666a13eac6e144b48615"),
|
||||
new TesseractLanguage("tur.traineddata.gz", "tur", "Turkish", 5.61, "8d72dc5ec5f22073f6b3ae2f79534e36aa8f63e8"),
|
||||
new TesseractLanguage("uig.traineddata.gz", "uig", "Uighur", 0.72, "d20262f24476229539b4b87efa9327428052b241"),
|
||||
new TesseractLanguage("ukr.traineddata.gz", "ukr", "Ukrainian", 2.92, "0871744dfacfa446e212e5c7e671c790b5fdd2f0"),
|
||||
new TesseractLanguage("urd.traineddata.gz", "urd", "Urdu", 1.83, "be2964ca83114ee04b3a258e71525b8a1a670c97", true),
|
||||
new TesseractLanguage("uzb.traineddata.gz", "uzb", "Uzbek", 1.55, "8de3127c90628514d61c0ded9510d4b2728f4b69"),
|
||||
new TesseractLanguage("uzb_cyrl.traineddata.gz", "uzb_cyrl", "Uzbek (Cyrillic)", 1.19, "e1190d147d6ce3770d768724c82e103b06c93061"),
|
||||
new TesseractLanguage("vie.traineddata.gz", "vie", "Vietnamese", 2.27, "571e132cd3ed26f5c33943efe7aa17835d277a15"),
|
||||
new TesseractLanguage("yid.traineddata.gz", "yid", "Yiddish", 1.60, "0dbb6e19b660b57283f954eb5183cc2f3677fdda"),
|
||||
});
|
||||
|
||||
#endregion
|
||||
|
||||
#region Tesseract 4.00Beta4 Language Data (auto-generated)
|
||||
|
||||
public static readonly TesseractLanguageData V400B4 = new TesseractLanguageData(new[]
|
||||
public static readonly TesseractLanguageData Latest = new(new[]
|
||||
{
|
||||
new TesseractLanguage("afr.traineddata.zip", "afr", "Afrikaans", 5.44, "4278120a18e3464194df302f55417afc35415af7"),
|
||||
new TesseractLanguage("amh.traineddata.zip", "amh", "Amharic", 5.55, "166219c79a3c92775ac8cc987fba91899dc63f7d"),
|
||||
|
34
NAPS2.Sdk/Ocr/TesseractLanguageManager.cs
Normal file
34
NAPS2.Sdk/Ocr/TesseractLanguageManager.cs
Normal file
@ -0,0 +1,34 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Describes the downloadable Tesseract language data: where it lives on disk,
/// which languages are known, and one external component per language.
/// </summary>
public class TesseractLanguageManager
{
    // Download locations; "{0}" is the placeholder in each URL template.
    private static readonly List<DownloadMirror> Mirrors = new()
    {
        new(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://github.com/cyanfish/naps2-components/releases/download/tesseract-4.0.0b4/{0}"),
        new(PlatformSupport.ModernWindows.Or(PlatformSupport.Linux), @"https://sourceforge.net/projects/naps2/files/components/tesseract-4.0.0b4/{0}/download")
    };

    /// <summary>
    /// Creates a manager whose language data is stored under
    /// <c>{basePath}/tesseract-4.0.0b4</c>.
    /// </summary>
    /// <param name="basePath">Root folder for downloadable components.</param>
    public TesseractLanguageManager(string basePath)
    {
        LanguageData = TesseractLanguageData.Latest;
        TesseractBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");

        // Materialize the components once. A bare Select() is a deferred query, so every
        // enumeration (including each read of InstalledLanguages / NotInstalledLanguages)
        // would otherwise construct a brand-new set of component and download objects.
        LanguageComponents = LanguageData.Data
            .Select(x => new MultiFileExternalComponent(
                $"ocr-{x.Code}",
                TesseractBasePath,
                new[] { $"best/{x.Code}.traineddata", $"fast/{x.Code}.traineddata" },
                new DownloadInfo(x.Filename, Mirrors, x.Size, x.Sha1, DownloadFormat.Zip)))
            .ToList();
    }

    /// <summary>
    /// Languages whose component currently reports itself as installed.
    /// Evaluated lazily on each enumeration.
    /// </summary>
    public virtual IEnumerable<Language> InstalledLanguages =>
        LanguageComponents.Where(x => x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);

    /// <summary>
    /// Languages whose component currently reports itself as not installed.
    /// Evaluated lazily on each enumeration.
    /// </summary>
    public virtual IEnumerable<Language> NotInstalledLanguages =>
        LanguageComponents.Where(x => !x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);

    /// <summary>Folder that holds the "best" and "fast" trained-data subfolders.</summary>
    public string TesseractBasePath { get; protected init; }

    /// <summary>The language table used to build the components and resolve their ids.</summary>
    public TesseractLanguageData LanguageData { get; protected init; }

    /// <summary>One external component per known language, with id <c>ocr-{code}</c>.</summary>
    public IEnumerable<IExternalComponent> LanguageComponents { get; protected init; }
}
|
@ -1,45 +1,39 @@
|
||||
using System.Threading;
|
||||
using NAPS2.Dependencies;
|
||||
using System.Threading;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
public abstract class TesseractBaseEngine : IOcrEngine
|
||||
public class TesseractOcrEngine : IOcrEngine
|
||||
{
|
||||
private const int CHECK_INTERVAL = 500;
|
||||
private readonly string _exePath;
|
||||
private readonly string? _languageDataBasePath;
|
||||
|
||||
public bool CanProcess(string langCode)
|
||||
public TesseractOcrEngine(string exePath, string? languageDataBasePath)
|
||||
{
|
||||
if (string.IsNullOrEmpty(langCode) || !IsInstalled || !IsSupported)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// Support multiple specified languages (e.g. "eng+fra")
|
||||
return langCode.Split('+').All(code => InstalledLanguages.Any(x => x.Code == code));
|
||||
_exePath = exePath;
|
||||
_languageDataBasePath = languageDataBasePath;
|
||||
}
|
||||
|
||||
|
||||
public async Task<OcrResult?> ProcessImage(string imagePath, OcrParams ocrParams, CancellationToken cancelToken)
|
||||
{
|
||||
string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
|
||||
string tempHocrFilePathWithExt = tempHocrFilePath + TesseractHocrExtension;
|
||||
string tempHocrFilePathWithExt = tempHocrFilePath + ".hocr";
|
||||
try
|
||||
{
|
||||
var runInfo = TesseractRunInfo(ocrParams);
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = Path.Combine(TesseractBasePath, TesseractExePath),
|
||||
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} {runInfo.Arguments} hocr",
|
||||
FileName = _exePath,
|
||||
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} hocr",
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true,
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true
|
||||
};
|
||||
if (runInfo.PrefixPath != null)
|
||||
if (_languageDataBasePath != null)
|
||||
{
|
||||
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = Path.Combine(TesseractBasePath, runInfo.PrefixPath);
|
||||
}
|
||||
if (runInfo.DataPath != null)
|
||||
{
|
||||
var tessdata = new DirectoryInfo(Path.Combine(TesseractBasePath, runInfo.DataPath));
|
||||
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
|
||||
string languageDataPath = Path.Combine(_languageDataBasePath, subfolder);
|
||||
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
|
||||
var tessdata = new DirectoryInfo(languageDataPath);
|
||||
EnsureHocrConfigExists(tessdata);
|
||||
}
|
||||
var tesseractProcess = Process.Start(startInfo);
|
||||
@ -107,8 +101,10 @@ public abstract class TesseractBaseEngine : IOcrEngine
|
||||
var elements = hocrDocument.Descendants()
|
||||
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
|
||||
.Select(x => new OcrResultElement(x.Value, GetBounds(x.Attribute("title"))));
|
||||
var rtl = InstalledLanguages.Where(x => x.Code == ocrParams.LanguageCode).Select(x => x.RTL)
|
||||
.FirstOrDefault();
|
||||
var rtl = false;
|
||||
// TODO: Can we detect rtl from the hocr file?
|
||||
// var rtl = _data.InstalledLanguages.Where(x => x.Code == ocrParams.LanguageCode).Select(x => x.RTL)
|
||||
// .FirstOrDefault();
|
||||
return new OcrResult(pageBounds, elements, rtl);
|
||||
}
|
||||
catch (Exception e)
|
||||
@ -162,51 +158,32 @@ public abstract class TesseractBaseEngine : IOcrEngine
|
||||
}
|
||||
return bounds;
|
||||
}
|
||||
|
||||
public bool CanInstall { get; protected set; }
|
||||
|
||||
public IEnumerable<OcrMode> SupportedModes { get; protected set; }
|
||||
|
||||
public IExternalComponent Component { get; protected set; }
|
||||
|
||||
public IEnumerable<IExternalComponent> LanguageComponents { get; protected set; }
|
||||
|
||||
public virtual bool IsSupported => PlatformSupport.Validate();
|
||||
|
||||
public virtual bool IsInstalled => Component.IsInstalled;
|
||||
|
||||
public virtual IEnumerable<Language> InstalledLanguages =>
|
||||
LanguageComponents.Where(x => x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);
|
||||
|
||||
public virtual IEnumerable<Language> NotInstalledLanguages =>
|
||||
LanguageComponents.Where(x => !x.IsInstalled).Select(x => LanguageData.LanguageMap[x.Id]);
|
||||
|
||||
protected string TesseractBasePath { get; set; }
|
||||
|
||||
protected string TesseractExePath { get; set; }
|
||||
|
||||
protected string TesseractHocrExtension { get; set; } = ".hocr";
|
||||
|
||||
protected DownloadInfo DownloadInfo { get; set; }
|
||||
|
||||
protected PlatformSupport PlatformSupport { get; set; }
|
||||
|
||||
protected TesseractLanguageData LanguageData { get; set; }
|
||||
|
||||
protected virtual RunInfo TesseractRunInfo(OcrParams ocrParams) => new RunInfo
|
||||
{
|
||||
Arguments = "",
|
||||
DataPath = "tessdata",
|
||||
PrefixPath = ""
|
||||
};
|
||||
|
||||
|
||||
protected class RunInfo
|
||||
{
|
||||
public string Arguments { get; set; }
|
||||
|
||||
public string? PrefixPath { get; set; }
|
||||
|
||||
public string? DataPath { get; set; }
|
||||
}
|
||||
|
||||
|
||||
// TODO: For local engine (where we need to manually direct to the language data)
|
||||
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams)
|
||||
// {
|
||||
// OcrMode mode = ocrParams.Mode;
|
||||
// string folder = mode == OcrMode.Fast || mode == OcrMode.Default ? "fast" : "best";
|
||||
// if (ocrParams.LanguageCode.Split('+').All(code => !File.Exists(Path.Combine(TesseractBasePath, folder, $"{code.ToLowerInvariant()}.traineddata"))))
|
||||
// {
|
||||
// // Use the other source if the selected one doesn't exist
|
||||
// folder = folder == "fast" ? "best" : "fast";
|
||||
// mode = folder == "fast" ? OcrMode.Fast : OcrMode.Best;
|
||||
// }
|
||||
//
|
||||
// return new()
|
||||
// {
|
||||
// Arguments = mode == OcrMode.Best ? "--oem 1" : mode == OcrMode.Legacy ? "--oem 0" : "",
|
||||
// DataPath = folder,
|
||||
// PrefixPath = folder
|
||||
// };
|
||||
|
||||
// TODO: For system engine (where the language data is externally managed)
|
||||
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams) => new()
|
||||
// {
|
||||
// Arguments = "",
|
||||
// DataPath = null,
|
||||
// PrefixPath = null
|
||||
// };
|
||||
}
|
10
NAPS2.Sdk/Ocr/TesseractRunInfo.cs
Normal file
10
NAPS2.Sdk/Ocr/TesseractRunInfo.cs
Normal file
@ -0,0 +1,10 @@
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Settings for a single Tesseract invocation: extra command-line arguments
/// plus optional prefix and data folders.
/// </summary>
public class TesseractRunInfo
{
    /// <summary>
    /// Extra command-line arguments. Defaults to an empty string so the
    /// non-nullable property never exposes null on a freshly created instance.
    /// </summary>
    public string Arguments { get; set; } = "";

    /// <summary>Optional prefix folder; null when none applies.</summary>
    public string? PrefixPath { get; set; }

    /// <summary>Optional data folder; null when none applies.</summary>
    public string? DataPath { get; set; }
}
|
@ -1,77 +1,77 @@
|
||||
using NAPS2.Dependencies;
|
||||
|
||||
namespace NAPS2.Ocr;
|
||||
|
||||
/// <summary>
/// Tesseract engine that runs a "tesseract" executable resolved from the system
/// path instead of a downloaded component. Installation state and the installed
/// languages are discovered by running <c>tesseract --list-langs</c>.
/// </summary>
public class TesseractSystemEngine : TesseractBaseEngine
{
    private bool _isInstalled;
    private DateTime? _installCheckTime;
    private List<Language>? _installedLanguages;

    public TesseractSystemEngine()
    {
        // Use the most complete set of language mappings
        LanguageData = TesseractLanguageData.V400B4;
        TesseractBasePath = "";
        TesseractExePath = "tesseract";
        PlatformSupport = PlatformSupport.Linux;
        CanInstall = false;
    }

    /// <summary>
    /// No extra arguments and no data/prefix folders are supplied for the system engine.
    /// </summary>
    protected override RunInfo TesseractRunInfo(OcrParams ocrParams) => new RunInfo
    {
        Arguments = "",
        DataPath = null,
        PrefixPath = null
    };

    /// <summary>True once a probe has successfully started the executable.</summary>
    public override bool IsInstalled
    {
        get
        {
            CheckIfInstalled();
            return _isInstalled;
        }
    }

    /// <summary>Languages reported by the most recent successful probe; empty if none.</summary>
    public override IEnumerable<Language> InstalledLanguages
    {
        get
        {
            CheckIfInstalled();
            return _installedLanguages ?? Enumerable.Empty<Language>();
        }
    }

    /// <summary>Always empty: this engine sets <c>CanInstall</c> to false.</summary>
    public override IEnumerable<Language> NotInstalledLanguages => Enumerable.Empty<Language>();

    /// <summary>
    /// Probes the executable with <c>--list-langs</c>, at most once every two seconds,
    /// and caches the result. Failures are deliberately ignored and leave the cached
    /// state unchanged.
    /// </summary>
    private void CheckIfInstalled()
    {
        if (IsSupported && (_installCheckTime == null || _installCheckTime < DateTime.Now - TimeSpan.FromSeconds(2)))
        {
            try
            {
                // Dispose the Process object: this probe reruns every couple of seconds and
                // would otherwise leave each handle to be released by the finalizer.
                using var process = Process.Start(new ProcessStartInfo
                {
                    FileName = TesseractExePath,
                    Arguments = "--list-langs",
                    UseShellExecute = false,
                    RedirectStandardOutput = true,
                    RedirectStandardError = true
                });
                if (process != null && process.Id != 0)
                {
                    // NOTE(review): only standard error is read, and only 3-character lines are
                    // kept, so longer codes (e.g. "chi_sim") are dropped - confirm this is intended.
                    var codes = process.StandardError.ReadToEnd().Split(new[] {'\r', '\n'}, StringSplitOptions.RemoveEmptyEntries).Where(x => x.Length == 3);
                    _installedLanguages = codes.Select(code => LanguageData.LanguageMap.Get($"ocr-{code}")).WhereNotNull().ToList();
                    _isInstalled = true;
                    // The process has normally finished once its output is drained; only kill
                    // it if it is still running, rather than relying on the catch below.
                    if (!process.HasExited)
                    {
                        process.Kill();
                    }
                }
            }
            catch (Exception)
            {
                // Component is not installed on the system path (or had an error)
            }
            _installCheckTime = DateTime.Now;
        }
    }
}
|
||||
// using NAPS2.Dependencies;
|
||||
//
|
||||
// namespace NAPS2.Ocr;
|
||||
//
|
||||
// public class TesseractSystemData : TesseractOcrEngineData
|
||||
// {
|
||||
// private bool _isInstalled;
|
||||
// private DateTime? _installCheckTime;
|
||||
// private List<Language>? _installedLanguages;
|
||||
//
|
||||
// public TesseractSystemData()
|
||||
// {
|
||||
// // Use the most complete set of language mappings
|
||||
// LanguageData = TesseractLanguageData.V400B4;
|
||||
// TesseractBasePath = "";
|
||||
// TesseractExePath = "tesseract";
|
||||
// PlatformSupport = PlatformSupport.Linux;
|
||||
// CanInstall = false;
|
||||
// }
|
||||
//
|
||||
// public override TesseractRunInfo TesseractRunInfo(OcrParams ocrParams) => new()
|
||||
// {
|
||||
// Arguments = "",
|
||||
// DataPath = null,
|
||||
// PrefixPath = null
|
||||
// };
|
||||
//
|
||||
// public override bool IsInstalled
|
||||
// {
|
||||
// get
|
||||
// {
|
||||
// CheckIfInstalled();
|
||||
// return _isInstalled;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public override IEnumerable<Language> InstalledLanguages
|
||||
// {
|
||||
// get
|
||||
// {
|
||||
// CheckIfInstalled();
|
||||
// return _installedLanguages ?? Enumerable.Empty<Language>();
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public override IEnumerable<Language> NotInstalledLanguages => Enumerable.Empty<Language>();
|
||||
//
|
||||
// private void CheckIfInstalled()
|
||||
// {
|
||||
// if (IsSupported && (_installCheckTime == null || _installCheckTime < DateTime.Now - TimeSpan.FromSeconds(2)))
|
||||
// {
|
||||
// try
|
||||
// {
|
||||
// var process = Process.Start(new ProcessStartInfo
|
||||
// {
|
||||
// FileName = TesseractExePath,
|
||||
// Arguments = "--list-langs",
|
||||
// UseShellExecute = false,
|
||||
// RedirectStandardOutput = true,
|
||||
// RedirectStandardError = true
|
||||
// });
|
||||
// if (process != null && process.Id != 0)
|
||||
// {
|
||||
// var codes = process.StandardError.ReadToEnd().Split(new[] {'\r', '\n'}, StringSplitOptions.RemoveEmptyEntries).Where(x => x.Length == 3);
|
||||
// _installedLanguages = codes.Select(code => LanguageData.LanguageMap.Get($"ocr-{code}")).WhereNotNull().ToList();
|
||||
// _isInstalled = true;
|
||||
// process.Kill();
|
||||
// }
|
||||
// }
|
||||
// catch (Exception)
|
||||
// {
|
||||
// // Component is not installed on the system path (or had an error)
|
||||
// }
|
||||
// _installCheckTime = DateTime.Now;
|
||||
// }
|
||||
// }
|
||||
// }
|
@ -15,18 +15,16 @@ public class BatchScanPerformer : IBatchScanPerformer
|
||||
private readonly PdfExporter _pdfExporter;
|
||||
private readonly IOperationFactory _operationFactory;
|
||||
private readonly IConfigProvider<PdfSettings> _pdfSettingsProvider;
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly IFormFactory _formFactory;
|
||||
private readonly ScopedConfig _config;
|
||||
private readonly IProfileManager _profileManager;
|
||||
|
||||
public BatchScanPerformer(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory, IConfigProvider<PdfSettings> pdfSettingsProvider, OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
|
||||
public BatchScanPerformer(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory, IConfigProvider<PdfSettings> pdfSettingsProvider, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
|
||||
{
|
||||
_scanPerformer = scanPerformer;
|
||||
_pdfExporter = pdfExporter;
|
||||
_operationFactory = operationFactory;
|
||||
_pdfSettingsProvider = pdfSettingsProvider;
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_formFactory = formFactory;
|
||||
_config = config;
|
||||
_profileManager = profileManager;
|
||||
@ -34,7 +32,7 @@ public class BatchScanPerformer : IBatchScanPerformer
|
||||
|
||||
public async Task PerformBatchScan(IConfigProvider<BatchSettings> settings, FormBase batchForm, Action<ProcessedImage> imageCallback, Action<string> progressCallback, CancellationToken cancelToken)
|
||||
{
|
||||
var state = new BatchState(_scanPerformer, _pdfExporter, _operationFactory, _pdfSettingsProvider, _ocrEngineManager, _formFactory, _config, _profileManager)
|
||||
var state = new BatchState(_scanPerformer, _pdfExporter, _operationFactory, _pdfSettingsProvider, _formFactory, _config, _profileManager)
|
||||
{
|
||||
Settings = settings,
|
||||
ProgressCallback = progressCallback,
|
||||
@ -51,7 +49,6 @@ public class BatchScanPerformer : IBatchScanPerformer
|
||||
private readonly PdfExporter _pdfExporter;
|
||||
private readonly IOperationFactory _operationFactory;
|
||||
private readonly IConfigProvider<PdfSettings> _pdfSettingsProvider;
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly IFormFactory _formFactory;
|
||||
private readonly ScopedConfig _config;
|
||||
private readonly IProfileManager _profileManager;
|
||||
@ -61,13 +58,12 @@ public class BatchScanPerformer : IBatchScanPerformer
|
||||
private List<List<ProcessedImage>> _scans;
|
||||
|
||||
public BatchState(IScanPerformer scanPerformer, PdfExporter pdfExporter, IOperationFactory operationFactory,
|
||||
IConfigProvider<PdfSettings> pdfSettingsProvider, OcrEngineManager ocrEngineManager, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
|
||||
IConfigProvider<PdfSettings> pdfSettingsProvider, IFormFactory formFactory, ScopedConfig config, IProfileManager profileManager)
|
||||
{
|
||||
_scanPerformer = scanPerformer;
|
||||
_pdfExporter = pdfExporter;
|
||||
_operationFactory = operationFactory;
|
||||
_pdfSettingsProvider = pdfSettingsProvider;
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_formFactory = formFactory;
|
||||
_config = config;
|
||||
_profileManager = profileManager;
|
||||
|
@ -20,12 +20,12 @@ internal class ScanPerformer : IScanPerformer
|
||||
private readonly ErrorOutput _errorOutput;
|
||||
private readonly ScanOptionsValidator _scanOptionsValidator;
|
||||
private readonly IScanBridgeFactory _scanBridgeFactory;
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly IOcrEngine _ocrEngine;
|
||||
|
||||
public ScanPerformer(IFormFactory formFactory, ScopedConfig config, OperationProgress operationProgress,
|
||||
AutoSaver autoSaver, IProfileManager profileManager, ErrorOutput errorOutput,
|
||||
ScanOptionsValidator scanOptionsValidator, IScanBridgeFactory scanBridgeFactory,
|
||||
ScanningContext scanningContext, OcrEngineManager ocrEngineManager)
|
||||
ScanningContext scanningContext, IOcrEngine ocrEngine)
|
||||
{
|
||||
_formFactory = formFactory;
|
||||
_config = config;
|
||||
@ -36,7 +36,7 @@ internal class ScanPerformer : IScanPerformer
|
||||
_scanOptionsValidator = scanOptionsValidator;
|
||||
_scanBridgeFactory = scanBridgeFactory;
|
||||
_scanningContext = scanningContext;
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_ocrEngine = ocrEngine;
|
||||
}
|
||||
|
||||
public async Task<ScanDevice> PromptForDevice(ScanProfile scanProfile, IntPtr dialogParent = default)
|
||||
@ -101,7 +101,7 @@ internal class ScanPerformer : IScanPerformer
|
||||
OcrController ocrController = new OcrController(_scanningContext);
|
||||
if (scanParams.DoOcr)
|
||||
{
|
||||
ocrController.Engine = _ocrEngineManager.ActiveEngine;
|
||||
ocrController.Engine = _ocrEngine;
|
||||
if (ocrController.Engine == null)
|
||||
{
|
||||
Log.Error("OCR is enabled but no OCR engine is available.");
|
||||
|
@ -34,7 +34,7 @@ namespace NAPS2.WinForms
|
||||
private readonly ImageContext _imageContext;
|
||||
private readonly StringWrapper _stringWrapper;
|
||||
private readonly RecoveryManager _recoveryManager;
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly TesseractLanguageManager _tesseractLanguageManager;
|
||||
private readonly IScanPerformer _scanPerformer;
|
||||
private readonly IScannedImagePrinter _scannedImagePrinter;
|
||||
private readonly StillImage _stillImage;
|
||||
@ -70,7 +70,7 @@ namespace NAPS2.WinForms
|
||||
|
||||
#region Initialization and Culture
|
||||
|
||||
public FDesktop(ImageContext imageContext, StringWrapper stringWrapper, RecoveryManager recoveryManager, OcrEngineManager ocrEngineManager,
|
||||
public FDesktop(ImageContext imageContext, StringWrapper stringWrapper, RecoveryManager recoveryManager, TesseractLanguageManager tesseractLanguageManager,
|
||||
IScanPerformer scanPerformer, IScannedImagePrinter scannedImagePrinter, StillImage stillImage, IOperationFactory operationFactory,
|
||||
KeyboardShortcutManager ksm, ThumbnailRenderer thumbnailRenderer, WinFormsExportHelper exportHelper, ImageClipboard imageClipboard,
|
||||
NotificationManager notify, CultureInitializer cultureInitializer, IWorkerFactory workerFactory, OperationProgress operationProgress,
|
||||
@ -80,7 +80,7 @@ namespace NAPS2.WinForms
|
||||
_imageContext = imageContext;
|
||||
_stringWrapper = stringWrapper;
|
||||
_recoveryManager = recoveryManager;
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_tesseractLanguageManager = tesseractLanguageManager;
|
||||
_scanPerformer = scanPerformer;
|
||||
_scannedImagePrinter = scannedImagePrinter;
|
||||
_stillImage = stillImage;
|
||||
@ -1165,33 +1165,14 @@ namespace NAPS2.WinForms
|
||||
|
||||
private void tsOcr_Click(object sender, EventArgs e)
|
||||
{
|
||||
if (_ocrEngineManager.MustUpgrade && !Config.Get(c => c.NoUpdatePrompt))
|
||||
if (_tesseractLanguageManager.InstalledLanguages.Any())
|
||||
{
|
||||
// Re-download a fixed version on Windows XP if needed
|
||||
MessageBox.Show(MiscResources.OcrUpdateAvailable, "", MessageBoxButtons.OK, MessageBoxIcon.Information);
|
||||
var progressForm = FormFactory.Create<FDownloadProgress>();
|
||||
progressForm.QueueFile(_ocrEngineManager.EngineToInstall.Component);
|
||||
progressForm.ShowDialog();
|
||||
}
|
||||
|
||||
if (_ocrEngineManager.MustInstallPackage)
|
||||
{
|
||||
const string packages = "\ntesseract-ocr";
|
||||
MessageBox.Show(MiscResources.TesseractNotAvailable + packages, MiscResources.Error, MessageBoxButtons.OK, MessageBoxIcon.Error);
|
||||
}
|
||||
else if (_ocrEngineManager.IsReady)
|
||||
{
|
||||
if (_ocrEngineManager.CanUpgrade && !Config.Get(c => c.NoUpdatePrompt))
|
||||
{
|
||||
MessageBox.Show(MiscResources.OcrUpdateAvailable, "", MessageBoxButtons.OK, MessageBoxIcon.Information);
|
||||
FormFactory.Create<FOcrLanguageDownload>().ShowDialog();
|
||||
}
|
||||
FormFactory.Create<FOcrSetup>().ShowDialog();
|
||||
}
|
||||
else
|
||||
{
|
||||
FormFactory.Create<FOcrLanguageDownload>().ShowDialog();
|
||||
if (_ocrEngineManager.IsReady)
|
||||
if (_tesseractLanguageManager.InstalledLanguages.Any())
|
||||
{
|
||||
FormFactory.Create<FOcrSetup>().ShowDialog();
|
||||
}
|
||||
|
@ -5,26 +5,16 @@ namespace NAPS2.WinForms
|
||||
{
|
||||
public partial class FOcrLanguageDownload : FormBase
|
||||
{
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly IOcrEngine _engineToInstall;
|
||||
private readonly TesseractLanguageManager _tesseractLanguageManager;
|
||||
|
||||
public FOcrLanguageDownload(OcrEngineManager ocrEngineManager)
|
||||
public FOcrLanguageDownload(TesseractLanguageManager tesseractLanguageManager)
|
||||
{
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_engineToInstall = ocrEngineManager.EngineToInstall;
|
||||
_tesseractLanguageManager = tesseractLanguageManager;
|
||||
InitializeComponent();
|
||||
|
||||
var initialSelection = new HashSet<string>();
|
||||
if (ocrEngineManager.InstalledEngine != null && ocrEngineManager.InstalledEngine != _engineToInstall)
|
||||
{
|
||||
// Upgrading from an old version, so pre-select previously used languages
|
||||
foreach (var lang in ocrEngineManager.InstalledEngine.LanguageComponents.Where(x => x.IsInstalled))
|
||||
{
|
||||
initialSelection.Add(lang.Id);
|
||||
}
|
||||
}
|
||||
|
||||
if (!_engineToInstall.InstalledLanguages.Any())
|
||||
// TODO: We used to select old installed languages here, maybe we could do it again if we get new lang data
|
||||
if (!_tesseractLanguageManager.InstalledLanguages.Any())
|
||||
{
|
||||
// Fresh install, so pre-select English as a sensible default
|
||||
initialSelection.Add("ocr-eng");
|
||||
@ -32,7 +22,7 @@ namespace NAPS2.WinForms
|
||||
|
||||
// Populate the list of language options
|
||||
// Special case for English: sorted to the top of the list
|
||||
var languageOptions = _engineToInstall.NotInstalledLanguages
|
||||
var languageOptions = _tesseractLanguageManager.NotInstalledLanguages
|
||||
.OrderBy(x => x.Code == "eng" ? "AAA" : x.Name);
|
||||
foreach (var languageOption in languageOptions)
|
||||
{
|
||||
@ -63,16 +53,11 @@ namespace NAPS2.WinForms
|
||||
private void UpdateView()
|
||||
{
|
||||
var selectedLanguages = SelectedLanguages;
|
||||
double downloadSize = _engineToInstall.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)).Select(x => x.DownloadInfo.Size).Sum();
|
||||
|
||||
if (!_engineToInstall.IsInstalled)
|
||||
{
|
||||
downloadSize += _engineToInstall.Component.DownloadInfo.Size;
|
||||
}
|
||||
double downloadSize = _tesseractLanguageManager.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)).Select(x => x.DownloadInfo.Size).Sum();
|
||||
|
||||
labelSizeEstimate.Text = string.Format(MiscResources.EstimatedDownloadSize, downloadSize.ToString("f1"));
|
||||
|
||||
btnDownload.Enabled = lvLanguages.Items.Cast<ListViewItem>().Any(x => x.Checked) || _engineToInstall.InstalledLanguages.Any() && !_engineToInstall.IsInstalled;
|
||||
btnDownload.Enabled = lvLanguages.Items.Cast<ListViewItem>().Any(x => x.Checked);
|
||||
}
|
||||
|
||||
private HashSet<string> SelectedLanguages
|
||||
@ -94,13 +79,8 @@ namespace NAPS2.WinForms
|
||||
{
|
||||
var progressForm = FormFactory.Create<FDownloadProgress>();
|
||||
|
||||
if (!_engineToInstall.IsInstalled)
|
||||
{
|
||||
progressForm.QueueFile(_engineToInstall.Component);
|
||||
}
|
||||
|
||||
var selectedLanguages = SelectedLanguages;
|
||||
foreach (var langComponent in _engineToInstall.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)))
|
||||
foreach (var langComponent in _tesseractLanguageManager.LanguageComponents.Where(x => selectedLanguages.Contains(x.Id)))
|
||||
{
|
||||
progressForm.QueueFile(langComponent);
|
||||
}
|
||||
|
@ -6,22 +6,18 @@ namespace NAPS2.WinForms;
|
||||
|
||||
public partial class FOcrSetup : FormBase
|
||||
{
|
||||
private readonly OcrEngineManager _ocrEngineManager;
|
||||
private readonly List<OcrMode> _availableModes;
|
||||
private readonly TesseractLanguageManager _tesseractLanguageManager;
|
||||
private readonly List<OcrMode> _availableModes = new() { OcrMode.Fast, OcrMode.Best };
|
||||
|
||||
public FOcrSetup(OcrEngineManager ocrEngineManager)
|
||||
public FOcrSetup(TesseractLanguageManager tesseractLanguageManager)
|
||||
{
|
||||
_ocrEngineManager = ocrEngineManager;
|
||||
_tesseractLanguageManager = tesseractLanguageManager;
|
||||
InitializeComponent();
|
||||
|
||||
comboOcrMode.Format += (sender, e) => e.Value = ((Enum)e.ListItem).Description();
|
||||
_availableModes = ocrEngineManager.ActiveEngine?.SupportedModes?.ToList();
|
||||
if (_availableModes != null)
|
||||
foreach (var mode in _availableModes)
|
||||
{
|
||||
foreach (var mode in _availableModes)
|
||||
{
|
||||
comboOcrMode.Items.Add(mode);
|
||||
}
|
||||
comboOcrMode.Items.Add(mode);
|
||||
}
|
||||
}
|
||||
|
||||
@ -38,11 +34,6 @@ public partial class FOcrSetup : FormBase
|
||||
comboLanguages.DisplayMember = "Name";
|
||||
comboLanguages.ValueMember = "Code";
|
||||
|
||||
ConditionalControls.UnlockHeight(this);
|
||||
ConditionalControls.SetVisible(comboOcrMode, _availableModes != null, 8);
|
||||
labelOcrMode.Visible = _availableModes != null;
|
||||
ConditionalControls.LockHeight(this);
|
||||
|
||||
checkBoxEnableOcr.Checked = Config.Get(c => c.EnableOcr);
|
||||
SetSelectedValue(comboLanguages, Config.Get(c => c.OcrLanguageCode));
|
||||
SetSelectedItem(comboOcrMode, Config.Get(c => c.OcrMode));
|
||||
@ -73,12 +64,10 @@ public partial class FOcrSetup : FormBase
|
||||
|
||||
private void LoadLanguages()
|
||||
{
|
||||
var languages = _ocrEngineManager.ActiveEngine?.InstalledLanguages
|
||||
var languages = _tesseractLanguageManager.InstalledLanguages
|
||||
.OrderBy(x => x.Name)
|
||||
.ToList();
|
||||
comboLanguages.DataSource = languages;
|
||||
|
||||
linkGetLanguages.Visible = _ocrEngineManager.EngineToInstall != null;
|
||||
}
|
||||
|
||||
private void UpdateView()
|
||||
|
@ -17,4 +17,6 @@
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=mapi/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=Pdfium/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=Pdfs/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=rmse/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=rmse/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=TESSDATA/@EntryIndexedValue">True</s:Boolean>
|
||||
<s:Boolean x:Key="/Default/UserDictionary/Words/=Tesseract/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
|
Loading…
Reference in New Issue
Block a user