naps2/NAPS2.App.Tests/Appium/ImportAndSaveTests.cs
Ben Olden-Cooligan 79bba70370 Improve OCR text alignment
This is nearly a full rewrite of the alignment code. Position is now based on the line baseline (provided by Tesseract) and the font size is smarter (defaulting to Tesseract's provided value with various adjustments).

The goals were:
- Have Ctrl+F highlight the word as accurately as possible.
- Have Ctrl+A/Ctrl+C end up with text that matches the original as closely as possible.
- Have PdfSharp and Pdfium produce consistent output.
On my test cases all goals are fully met.

#236
2024-03-26 19:05:17 -07:00

66 lines
2.3 KiB
C#

using System.Threading;
using NAPS2.App.Tests.Targets;
using NAPS2.App.Tests.Verification;
using NAPS2.Sdk.Tests;
using NAPS2.Sdk.Tests.Asserts;
using Xunit;
namespace NAPS2.App.Tests.Appium;
/// <summary>
/// Appium UI test that imports several PDFs and an image, turns on OCR for
/// saved PDFs, saves everything to a single PDF and checks the result.
/// </summary>
[Collection("appium")]
public class ImportAndSaveTests : AppiumTests
{
    // Short fixed pause (milliseconds) used after UI actions before continuing.
    private const int UiSettleDelayMs = 100;

    // Upper bound (milliseconds) passed to WaitFor while the "Cancel" element is still present.
    private const int SaveTimeoutMs = 10_000;

    /// <summary>
    /// Imports "word.pdf", "patcht.pdf", "image.pdf" and "text.jpg", enables
    /// "Make PDFs searchable using OCR", saves to "test.pdf", then asserts on the
    /// images and text in the saved file and that no error log was produced.
    /// </summary>
    /// <param name="target">The application target the test is initialized with.</param>
    [VerifyTheory(AllowDebug = true, WindowsAppium = true)]
    [ClassData(typeof(AppiumTestData))]
    public void ImportVariousAndSavePdfWithOcr(IAppTestTarget target)
    {
        Init(target);
        StageInputFiles();
        InstallOcrLanguageData();

        foreach (var inputName in new[] { "word.pdf", "patcht.pdf", "image.pdf", "text.jpg" })
        {
            ImportFile(inputName);
        }

        EnableSearchablePdfOcr();
        var savedPdfPath = SavePdfAs("test.pdf");

        AssertSavedPdfContents(savedPdfPath);
        AppTestHelper.AssertNoErrorLog(FolderPath);
    }

    /// <summary>
    /// Copies the embedded PDF and image resources to the files that are imported later.
    /// </summary>
    private void StageInputFiles()
    {
        CopyResourceToFile(PdfResources.word_generated_pdf, "word.pdf");
        CopyResourceToFile(PdfResources.word_patcht_pdf, "patcht.pdf");
        CopyResourceToFile(PdfResources.image_pdf, "image.pdf");
        CopyResourceToFile(BinaryResources.ocr_test, "text.jpg");
    }

    /// <summary>
    /// Creates the "components/tesseract4/fast" folder under <c>FolderPath</c> and
    /// copies the English trained data file into it.
    /// </summary>
    private void InstallOcrLanguageData()
    {
        var tessdataFolder = Path.Combine(FolderPath, "components", "tesseract4", "fast");
        Directory.CreateDirectory(tessdataFolder);
        CopyResourceToFile(BinaryResources.eng_traineddata, tessdataFolder, "eng.traineddata");
    }

    /// <summary>
    /// Clicks through the OCR UI: "OCR", then the searchable-PDF option, then "OK".
    /// </summary>
    private void EnableSearchablePdfOcr()
    {
        ClickAtName("OCR");
        ClickAtName("Make PDFs searchable using OCR");
        ClickAtName("OK");
    }

    /// <summary>
    /// Clicks "Save PDF", types <paramref name="fileName"/> into the last
    /// "File name:" element, clicks "Save" and waits until no "Cancel" element remains.
    /// </summary>
    /// <param name="fileName">Name typed into the save dialog.</param>
    /// <returns><paramref name="fileName"/> combined with <c>FolderPath</c>.</returns>
    private string SavePdfAs(string fileName)
    {
        ClickAtName("Save PDF");
        ResetMainWindow();

        var fileNameBox = WaitFor(() => _session.FindElementsByName("File name:").Last());
        ClickAt(fileNameBox);
        fileNameBox.SendKeys(fileName);
        ClickAtName("Save");

        // Wait for the save to finish: pause briefly, then poll until "Cancel" is gone.
        Thread.Sleep(UiSettleDelayMs);
        WaitFor(() => !HasElementWithName("Cancel"), SaveTimeoutMs);

        return Path.Combine(FolderPath, fileName);
    }

    /// <summary>
    /// Asserts the images in the saved PDF and that each expected phrase is found
    /// exactly once (per <c>AssertContainsTextOnce</c>).
    /// </summary>
    /// <param name="pdfPath">Path of the PDF to inspect.</param>
    private static void AssertSavedPdfContents(string pdfPath)
    {
        PdfAsserts.AssertImages(
            pdfPath,
            PdfResources.word_p1,
            PdfResources.word_p2,
            PdfResources.word_patcht_p1,
            ImageResources.dog,
            ImageResources.ocr_test);

        var expectedPhrases = new[]
        {
            "Page one.",
            "Page two.",
            "ADVERTISEMENT.",
            "Sized for printing unscaled"
        };
        foreach (var phrase in expectedPhrases)
        {
            PdfAsserts.AssertContainsTextOnce(phrase, pdfPath);
        }
    }

    /// <summary>
    /// Clicks "Import", double-clicks the element named <paramref name="fileName"/>,
    /// resets the main window and pauses briefly.
    /// </summary>
    /// <param name="fileName">Name of the file entry to double-click.</param>
    private void ImportFile(string fileName)
    {
        ClickAtName("Import");
        DoubleClickAtName(fileName);
        ResetMainWindow();
        Thread.Sleep(UiSettleDelayMs);
    }
}