Change tesseract components folder path

This commit is contained in:
Ben Olden-Cooligan 2022-10-21 21:31:04 -07:00
parent 109643df18
commit 5b2782dcd9
3 changed files with 70 additions and 2 deletions

View File

@ -16,7 +16,7 @@ public class ImportAndSaveTests : AppiumTests
File.WriteAllBytes(Path.Combine(FolderPath, "patcht.pdf"), PdfResources.word_patcht_pdf);
File.WriteAllBytes(Path.Combine(FolderPath, "image.pdf"), PdfResources.image_pdf);
File.WriteAllBytes(Path.Combine(FolderPath, "text.jpg"), BinaryResources.ocr_test);
var tessdata = Path.Combine(FolderPath, "components", "tesseract-4.0.0b4", "fast");
var tessdata = Path.Combine(FolderPath, "components", "tesseract4", "fast");
Directory.CreateDirectory(tessdata);
File.WriteAllBytes(Path.Combine(tessdata, "eng.traineddata"), BinaryResources.eng_traineddata);

View File

@ -0,0 +1,49 @@
using NAPS2.Ocr;
using Xunit;
namespace NAPS2.Sdk.Tests.Ocr;
/// <summary>
/// Checks which folder <see cref="TesseractLanguageManager"/> reports as its tessdata base path for
/// each combination of the legacy ("tesseract-4.0.0b4") and current ("tesseract4") folders.
/// </summary>
public class TesseractLanguageManagerTests : ContextualTests
{
    private const string LegacyFolderName = "tesseract-4.0.0b4";
    private const string CurrentFolderName = "tesseract4";

    private readonly string _oldDir;
    private readonly string _currentDir;

    public TesseractLanguageManagerTests()
    {
        _oldDir = Path.Combine(FolderPath, LegacyFolderName);
        _currentDir = Path.Combine(FolderPath, CurrentFolderName);
    }

    /// <summary>
    /// This test creates neither folder; the manager is expected to report the current folder.
    /// </summary>
    [Fact]
    public void UsesNewBasePathOnCleanInstall()
    {
        var sut = new TesseractLanguageManager(FolderPath);

        Assert.Equal(_currentDir, sut.TessdataBasePath);
    }

    /// <summary>
    /// Only the legacy folder is created; after construction it should be gone and the current
    /// folder should exist in its place.
    /// </summary>
    [Fact]
    public void MovesToNewBasePath()
    {
        Directory.CreateDirectory(_oldDir);

        var sut = new TesseractLanguageManager(FolderPath);

        var legacyStillThere = Directory.Exists(_oldDir);
        Assert.False(legacyStillThere);
        var currentCreated = Directory.Exists(_currentDir);
        Assert.True(currentCreated);
        Assert.Equal(_currentDir, sut.TessdataBasePath);
    }

    /// <summary>
    /// Both folders are created; both should still exist after construction and the current
    /// folder should be the one reported.
    /// </summary>
    [Fact]
    public void UsesNewBasePathWhenBothPresent()
    {
        foreach (var dir in new[] { _oldDir, _currentDir })
        {
            Directory.CreateDirectory(dir);
        }

        var sut = new TesseractLanguageManager(FolderPath);

        var legacyStillThere = Directory.Exists(_oldDir);
        Assert.True(legacyStillThere);
        var currentStillThere = Directory.Exists(_currentDir);
        Assert.True(currentStillThere);
        Assert.Equal(_currentDir, sut.TessdataBasePath);
    }
}

View File

@ -14,12 +14,31 @@ public class TesseractLanguageManager
/// <summary>
/// Creates a language manager: sets <see cref="TessdataBasePath"/> from <paramref name="basePath"/>
/// and builds one component per entry in the language data.
/// </summary>
/// <param name="basePath">Folder under which the tesseract data folder is looked up.</param>
public TesseractLanguageManager(string basePath)
{
// NOTE(review): this block comes from a diff view. The hard-coded assignment on the next line looks like
// the removed pre-change line, superseded by the GetTessdataBasePath call after it; confirm against the committed file.
TessdataBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");
TessdataBasePath = GetTessdataBasePath(basePath);
// Each component is named "ocr-<code>", is rooted at TessdataBasePath, and lists both the "best" and the
// "fast" traineddata file for its language.
LanguageComponents = _languageData.Data.Select(x =>
new MultiFileExternalComponent($"ocr-{x.Code}", TessdataBasePath, new[] { $"best/{x.Code}.traineddata", $"fast/{x.Code}.traineddata" },
new DownloadInfo(x.Filename, Mirrors, x.Size, x.Sha1, DownloadFormat.Zip)));
}
/// <summary>
/// Resolves the tesseract data folder under <paramref name="basePath"/>. When only the legacy
/// "tesseract-4.0.0b4" folder exists, it is renamed to "tesseract4" on a best-effort basis.
/// </summary>
/// <param name="basePath">Folder that contains (or will contain) the tesseract data folder.</param>
/// <returns>
/// The "tesseract4" path, unless the rename failed and the legacy folder is still the only one
/// present, in which case the legacy path is returned.
/// </returns>
private static string GetTessdataBasePath(string basePath)
{
    var legacyBasePath = Path.Combine(basePath, "tesseract-4.0.0b4");
    var newBasePath = Path.Combine(basePath, "tesseract4");

    if (!IsLegacyOnly())
    {
        // Clean install, already migrated, or both folders present: the new folder always wins.
        return newBasePath;
    }

    try
    {
        Directory.Move(legacyBasePath, newBasePath);
    }
    catch (Exception)
    {
        // Ignore errors and keep the legacy path, e.g. if the components folder is read-only.
        // Re-check first: the move can also fail because another process renamed the legacy folder
        // or created the new one in the meantime, and then the legacy path must not be returned.
        if (IsLegacyOnly())
        {
            return legacyBasePath;
        }
    }

    return newBasePath;

    bool IsLegacyOnly() => Directory.Exists(legacyBasePath) && !Directory.Exists(newBasePath);
}
/// <summary>
/// The tesseract data folder chosen in the constructor; the same path is passed to every language component.
/// </summary>
public string TessdataBasePath { get; }
public virtual IEnumerable<Language> InstalledLanguages =>