mirror of
https://github.com/cyanfish/naps2.git
synced 2024-09-11 15:26:55 +03:00
Use font subsets for Pdfium OCR exporting
This commit is contained in:
parent
1883e5abad
commit
7863336f01
@ -77,9 +77,7 @@ public class ContextualTests : IDisposable
|
||||
ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath);
|
||||
}
|
||||
|
||||
public void SetUpFakeOcr() => SetUpFakeOcr(new());
|
||||
|
||||
public void SetUpFakeOcr(Dictionary<IMemoryImage, string> ocrTextByImage)
|
||||
public void SetUpFakeOcr(Dictionary<IMemoryImage, string> ocrTextByImage = null, string ifNoMatch = null, int delay = 200)
|
||||
{
|
||||
var ocrMock = Substitute.For<IOcrEngine>();
|
||||
ocrMock.ProcessImage(ScanningContext, Arg.Any<string>(), Arg.Any<OcrParams>(), Arg.Any<CancellationToken>())
|
||||
@ -89,20 +87,31 @@ public class ContextualTests : IDisposable
|
||||
var path = (string) x[1];
|
||||
var ocrParams = (OcrParams) x[2];
|
||||
var ocrImage = ImageContext.Load(path);
|
||||
await Task.Delay(200);
|
||||
// Lock so we don't try to access images simultaneously
|
||||
lock (ocrTextByImage)
|
||||
await Task.Delay(delay);
|
||||
|
||||
OcrResult CreateOcrResult(string text) => new((0, 0, 100, 100),
|
||||
ImmutableList.Create(
|
||||
new OcrResultElement(text, ocrParams.LanguageCode!, false,
|
||||
(0, 0, 10, 10))));
|
||||
|
||||
if (ocrTextByImage != null)
|
||||
{
|
||||
foreach (var image in ocrTextByImage.Keys)
|
||||
// Lock so we don't try to access images simultaneously
|
||||
lock (ocrTextByImage)
|
||||
{
|
||||
if (ImageAsserts.IsSimilar(image, ocrImage))
|
||||
foreach (var image in ocrTextByImage.Keys)
|
||||
{
|
||||
return new OcrResult((0, 0, 100, 100),
|
||||
ImmutableList.Create(
|
||||
new OcrResultElement(ocrTextByImage[image], ocrParams.LanguageCode!, false, (0, 0, 10, 10))));
|
||||
if (ImageAsserts.IsSimilar(image, ocrImage))
|
||||
{
|
||||
return CreateOcrResult(ocrTextByImage[image]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ifNoMatch != null)
|
||||
{
|
||||
return CreateOcrResult(ifNoMatch);
|
||||
}
|
||||
return null;
|
||||
});
|
||||
ScanningContext.OcrEngine = ocrMock;
|
||||
|
@ -22,7 +22,7 @@ public class PdfFontTests : ContextualTests
|
||||
_exporter = new PdfExporter(ScanningContext);
|
||||
_exportPath = Path.Combine(FolderPath, "test.pdf");
|
||||
_pdfiumImportPath = Path.Combine(FolderPath, "import_ocr.pdf");
|
||||
File.WriteAllBytes(_pdfiumImportPath, PdfResources.word_patcht_pdf);
|
||||
File.WriteAllBytes(_pdfiumImportPath, PdfResources.word_ocr_test);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@ -45,10 +45,7 @@ public class PdfFontTests : ContextualTests
|
||||
[MemberData(nameof(AlphabetTestCases))]
|
||||
internal async Task ExportAlphabetsWithPdfSharp(Alphabet alphabet, string langCode, string text, bool rtl)
|
||||
{
|
||||
SetUpFakeOcr(new()
|
||||
{
|
||||
{ LoadImage(ImageResources.dog), text }
|
||||
});
|
||||
SetUpFakeOcr(ifNoMatch: text, delay: 0);
|
||||
|
||||
using var image = CreateScannedImage();
|
||||
await _exporter.Export(_exportPath, [image], ocrParams: new OcrParams(langCode));
|
||||
@ -58,16 +55,15 @@ public class PdfFontTests : ContextualTests
|
||||
text = new string(text.Reverse().ToArray());
|
||||
}
|
||||
PdfAsserts.AssertContainsTextOnce(text, _exportPath);
|
||||
// Rough verification that a font subset is used instead of embedding the whole font
|
||||
Assert.InRange(new FileInfo(_exportPath).Length, 1, 500_000);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[MemberData(nameof(AlphabetTestCases))]
|
||||
internal async Task ExportAlphabetsWithPdfium(Alphabet alphabet, string langCode, string text, bool rtl)
|
||||
{
|
||||
SetUpFakeOcr(new()
|
||||
{
|
||||
{ LoadImage(PdfResources.word_patcht_p1), text }
|
||||
});
|
||||
SetUpFakeOcr(ifNoMatch: text, delay: 0);
|
||||
|
||||
var images = await _importer.Import(_pdfiumImportPath).ToListAsync();
|
||||
await _exporter.Export(_exportPath, images, ocrParams: new OcrParams(langCode));
|
||||
@ -77,6 +73,8 @@ public class PdfFontTests : ContextualTests
|
||||
text = new string(text.Reverse().ToArray());
|
||||
}
|
||||
PdfAsserts.AssertContainsTextOnce(text, _exportPath);
|
||||
// Rough verification that a font subset is used instead of embedding the whole font
|
||||
Assert.InRange(new FileInfo(_exportPath).Length, 1, 500_000);
|
||||
}
|
||||
|
||||
public static IEnumerable<object[]> AlphabetTestCases =
|
||||
|
10
NAPS2.Sdk.Tests/PdfResources.Designer.cs
generated
10
NAPS2.Sdk.Tests/PdfResources.Designer.cs
generated
@ -149,6 +149,16 @@ namespace NAPS2.Sdk.Tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Looks up a localized resource of type System.Byte[].
|
||||
/// </summary>
|
||||
internal static byte[] word_ocr_test {
|
||||
get {
|
||||
object obj = ResourceManager.GetObject("word_ocr_test", resourceCulture);
|
||||
return ((byte[])(obj));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Looks up a localized resource of type System.Byte[].
|
||||
/// </summary>
|
||||
|
@ -133,6 +133,9 @@
|
||||
<data name="word_p2_bw" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>Resources\word_p2_bw.png;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
<data name="word_ocr_test" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>Resources\word_ocr_test.pdf;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
<data name="image_pdf" type="System.Resources.ResXFileRef, System.Windows.Forms">
|
||||
<value>Resources\image_pdf.pdf;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</data>
|
||||
|
BIN
NAPS2.Sdk.Tests/Resources/word_ocr_test.pdf
Normal file
BIN
NAPS2.Sdk.Tests/Resources/word_ocr_test.pdf
Normal file
Binary file not shown.
@ -8,7 +8,6 @@ using NAPS2.Pdf.Pdfium;
|
||||
using NAPS2.Scan;
|
||||
using PdfSharpCore.Drawing;
|
||||
using PdfSharpCore.Drawing.Layout;
|
||||
using PdfSharpCore.Fonts;
|
||||
using PdfSharpCore.Pdf;
|
||||
using PdfSharpCore.Pdf.IO;
|
||||
using PdfSharpCore.Pdf.Security;
|
||||
@ -63,7 +62,7 @@ public class PdfExporter
|
||||
var document = InitializeDocument(exportParams);
|
||||
|
||||
// TODO: Consider storing text from imported image-based pages in PostProcessingData so it can be saved even
|
||||
// when not exporting with OCR (assuming no transforms).
|
||||
// when not exporting with OCR (assuming no transforms).
|
||||
var ocrEngine = GetOcrEngine(ocrParams);
|
||||
|
||||
var imagePages = new List<PageExportState>();
|
||||
@ -148,8 +147,8 @@ public class PdfExporter
|
||||
});
|
||||
}
|
||||
|
||||
private bool MergePassthroughPages(MemoryStream stream, OutputPathOrStream output, List<PageExportState> passthroughPages,
|
||||
PdfExportParams exportParams, ProgressHandler progress)
|
||||
private bool MergePassthroughPages(MemoryStream stream, OutputPathOrStream output,
|
||||
List<PageExportState> passthroughPages, PdfExportParams exportParams, ProgressHandler progress)
|
||||
{
|
||||
if (!passthroughPages.Any())
|
||||
{
|
||||
@ -167,6 +166,8 @@ public class PdfExporter
|
||||
var password = exportParams.Encryption.EncryptPdf ? exportParams.Encryption.OwnerPassword : null;
|
||||
using var destDoc =
|
||||
Pdfium.PdfDocument.Load(destHandle.AddrOfPinnedObject(), (int) stream.Length, password);
|
||||
using var fontSubsets =
|
||||
new PdfiumFontSubsets(destDoc, passthroughPages.Select(state => state.OcrTask?.Result));
|
||||
foreach (var state in passthroughPages)
|
||||
{
|
||||
destDoc.DeletePage(state.PageIndex);
|
||||
@ -193,7 +194,7 @@ public class PdfExporter
|
||||
if (state.OcrTask?.Result != null)
|
||||
{
|
||||
using var page = destDoc.GetPage(state.PageIndex);
|
||||
DrawOcrTextOnPdfiumPage(state.Page, destDoc, page, state.OcrTask.Result);
|
||||
DrawOcrTextOnPdfiumPage(state.Page, destDoc, page, fontSubsets, state.OcrTask.Result);
|
||||
}
|
||||
if (progress.IsCancellationRequested) return false;
|
||||
}
|
||||
@ -341,7 +342,7 @@ public class PdfExporter
|
||||
if (!_scanningContext.OcrRequestQueue.HasCachedResult(state.OcrEngine!, state.Image, state.OcrParams!))
|
||||
{
|
||||
// Save the image to a file for use in OCR.
|
||||
// We don't need to delete this file as long as we pass it to OcrRequestQueue.Enqueue, which takes
|
||||
// We don't need to delete this file as long as we pass it to OcrRequestQueue.Enqueue, which takes
|
||||
// ownership and guarantees its eventual deletion.
|
||||
using var fileStream = new FileStream(ocrTempFilePath, FileMode.Create, FileAccess.Write);
|
||||
state.Embedder.CopyToStream(fileStream);
|
||||
@ -413,38 +414,22 @@ public class PdfExporter
|
||||
}
|
||||
|
||||
private static void DrawOcrTextOnPdfiumPage(PdfPage page, Pdfium.PdfDocument pdfiumDocument,
|
||||
Pdfium.PdfPage pdfiumPage, OcrResult ocrResult)
|
||||
Pdfium.PdfPage pdfiumPage, PdfiumFontSubsets fontSubsets, OcrResult ocrResult)
|
||||
{
|
||||
using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
|
||||
var fontCache = new Dictionary<string, PdfFont>();
|
||||
try
|
||||
foreach (var element in ocrResult.Elements)
|
||||
{
|
||||
foreach (var element in ocrResult.Elements)
|
||||
{
|
||||
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
|
||||
if (info == null) continue;
|
||||
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
|
||||
if (info == null) continue;
|
||||
|
||||
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
|
||||
var font = fontCache.GetOrSet(fontName, () =>
|
||||
{
|
||||
var fontInfo = GlobalFontSettings.FontResolver.ResolveTypeface(fontName, false, false);
|
||||
return pdfiumDocument.LoadFont(GlobalFontSettings.FontResolver.GetFont(fontInfo.FaceName));
|
||||
});
|
||||
var textObj = pdfiumDocument.NewText(font, info.FontSize);
|
||||
textObj.TextRenderMode = TextRenderMode.Invisible;
|
||||
textObj.SetText(info.Text);
|
||||
// This ends up being slightly different alignment than the PdfSharp-based text. Maybe at some point we can
|
||||
// try to make them identical, although it's not perfect to begin with.
|
||||
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
|
||||
pdfiumPage.InsertObject(textObj);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
foreach (var font in fontCache.Values)
|
||||
{
|
||||
font.Dispose();
|
||||
}
|
||||
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
|
||||
var textObj = pdfiumDocument.NewText(fontSubsets[fontName], info.FontSize);
|
||||
textObj.TextRenderMode = TextRenderMode.Invisible;
|
||||
textObj.SetText(info.Text);
|
||||
// This ends up being slightly different alignment than the PdfSharp-based text. Maybe at some point we can
|
||||
// try to make them identical, although it's not perfect to begin with.
|
||||
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
|
||||
pdfiumPage.InsertObject(textObj);
|
||||
}
|
||||
pdfiumPage.GenerateContent();
|
||||
}
|
||||
|
39
NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs
Normal file
39
NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs
Normal file
@ -0,0 +1,39 @@
|
||||
using NAPS2.Ocr;
|
||||
using NAPS2.Pdf.Pdfium;
|
||||
using PdfSharpCore.Utils;
|
||||
|
||||
namespace NAPS2.Pdf;
|
||||
|
||||
/// <summary>
/// Creates and manages the lifetime of font subsets for Pdfium exporting.
/// </summary>
/// <remarks>
/// For every OCR element, the font chosen by <c>PdfFontPicker.GetBestFont</c> for the element's language is
/// looked up and the element's text is added to that font's subset. One Pdfium font is then loaded per
/// distinct font name. Disposing this object disposes every loaded font.
/// </remarks>
internal class PdfiumFontSubsets : IDisposable
{
    // Loaded Pdfium fonts keyed by the font name returned from PdfFontPicker.GetBestFont.
    private readonly Dictionary<string, PdfFont> _fonts = new();

    /// <summary>
    /// Builds a font subset for each font needed by the given OCR results and loads it into the document.
    /// </summary>
    /// <param name="pdfiumDocument">The Pdfium document that the font subsets are loaded into.</param>
    /// <param name="ocrResults">The OCR results whose text will be drawn; null entries are ignored.</param>
    public PdfiumFontSubsets(PdfDocument pdfiumDocument, IEnumerable<OcrResult?> ocrResults)
    {
        var fontSubsetBuilders = new Dictionary<string, FontSubsetBuilder>();
        foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Elements))
        {
            // Map the OCR language to a font that supports its glyphs
            var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
            // TODO: What happens if the font name isn't found?
            var builder = fontSubsetBuilders.GetOrSet(fontName, () => new FontSubsetBuilder(fontName));
            // Include the glyphs from the current text in the font subset
            builder.AddGlyphs(element.Text);
        }

        // Load each font subset into Pdfium. If building or loading one subset fails, the caller never
        // receives an instance to dispose, so release the fonts that were already loaded before rethrowing.
        try
        {
            foreach (var kvp in fontSubsetBuilders)
            {
                _fonts.Add(kvp.Key, pdfiumDocument.LoadFont(kvp.Value.Build()));
            }
        }
        catch
        {
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Gets the loaded font subset for the given font name.
    /// </summary>
    /// <param name="fontName">A font name as returned by <c>PdfFontPicker.GetBestFont</c>.</param>
    /// <exception cref="KeyNotFoundException">No subset was created for <paramref name="fontName"/>.</exception>
    public PdfFont this[string fontName] => _fonts[fontName];

    /// <summary>
    /// Disposes every loaded font. Calling this more than once has no further effect.
    /// </summary>
    public void Dispose()
    {
        foreach (var font in _fonts.Values)
        {
            font.Dispose();
        }
        // Forget the disposed fonts so a repeated Dispose call does not dispose them again.
        _fonts.Clear();
    }
}
|
Loading…
Reference in New Issue
Block a user