Use font subsets for Pdfium OCR exporting

This commit is contained in:
Ben Olden-Cooligan 2024-01-21 12:54:53 -08:00
parent 1883e5abad
commit 7863336f01
7 changed files with 98 additions and 54 deletions

View File

@ -77,9 +77,7 @@ public class ContextualTests : IDisposable
ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath);
}
public void SetUpFakeOcr() => SetUpFakeOcr(new());
public void SetUpFakeOcr(Dictionary<IMemoryImage, string> ocrTextByImage)
public void SetUpFakeOcr(Dictionary<IMemoryImage, string> ocrTextByImage = null, string ifNoMatch = null, int delay = 200)
{
var ocrMock = Substitute.For<IOcrEngine>();
ocrMock.ProcessImage(ScanningContext, Arg.Any<string>(), Arg.Any<OcrParams>(), Arg.Any<CancellationToken>())
@ -89,20 +87,31 @@ public class ContextualTests : IDisposable
var path = (string) x[1];
var ocrParams = (OcrParams) x[2];
var ocrImage = ImageContext.Load(path);
await Task.Delay(200);
// Lock so we don't try to access images simultaneously
lock (ocrTextByImage)
await Task.Delay(delay);
OcrResult CreateOcrResult(string text) => new((0, 0, 100, 100),
ImmutableList.Create(
new OcrResultElement(text, ocrParams.LanguageCode!, false,
(0, 0, 10, 10))));
if (ocrTextByImage != null)
{
foreach (var image in ocrTextByImage.Keys)
// Lock so we don't try to access images simultaneously
lock (ocrTextByImage)
{
if (ImageAsserts.IsSimilar(image, ocrImage))
foreach (var image in ocrTextByImage.Keys)
{
return new OcrResult((0, 0, 100, 100),
ImmutableList.Create(
new OcrResultElement(ocrTextByImage[image], ocrParams.LanguageCode!, false, (0, 0, 10, 10))));
if (ImageAsserts.IsSimilar(image, ocrImage))
{
return CreateOcrResult(ocrTextByImage[image]);
}
}
}
}
if (ifNoMatch != null)
{
return CreateOcrResult(ifNoMatch);
}
return null;
});
ScanningContext.OcrEngine = ocrMock;

View File

@ -22,7 +22,7 @@ public class PdfFontTests : ContextualTests
_exporter = new PdfExporter(ScanningContext);
_exportPath = Path.Combine(FolderPath, "test.pdf");
_pdfiumImportPath = Path.Combine(FolderPath, "import_ocr.pdf");
File.WriteAllBytes(_pdfiumImportPath, PdfResources.word_patcht_pdf);
File.WriteAllBytes(_pdfiumImportPath, PdfResources.word_ocr_test);
}
[Fact]
@ -45,10 +45,7 @@ public class PdfFontTests : ContextualTests
[MemberData(nameof(AlphabetTestCases))]
internal async Task ExportAlphabetsWithPdfSharp(Alphabet alphabet, string langCode, string text, bool rtl)
{
SetUpFakeOcr(new()
{
{ LoadImage(ImageResources.dog), text }
});
SetUpFakeOcr(ifNoMatch: text, delay: 0);
using var image = CreateScannedImage();
await _exporter.Export(_exportPath, [image], ocrParams: new OcrParams(langCode));
@ -58,16 +55,15 @@ public class PdfFontTests : ContextualTests
text = new string(text.Reverse().ToArray());
}
PdfAsserts.AssertContainsTextOnce(text, _exportPath);
// Rough verification that a font subset is used instead of embedding the whole font
Assert.InRange(new FileInfo(_exportPath).Length, 1, 500_000);
}
[Theory]
[MemberData(nameof(AlphabetTestCases))]
internal async Task ExportAlphabetsWithPdfium(Alphabet alphabet, string langCode, string text, bool rtl)
{
SetUpFakeOcr(new()
{
{ LoadImage(PdfResources.word_patcht_p1), text }
});
SetUpFakeOcr(ifNoMatch: text, delay: 0);
var images = await _importer.Import(_pdfiumImportPath).ToListAsync();
await _exporter.Export(_exportPath, images, ocrParams: new OcrParams(langCode));
@ -77,6 +73,8 @@ public class PdfFontTests : ContextualTests
text = new string(text.Reverse().ToArray());
}
PdfAsserts.AssertContainsTextOnce(text, _exportPath);
// Rough verification that a font subset is used instead of embedding the whole font
Assert.InRange(new FileInfo(_exportPath).Length, 1, 500_000);
}
public static IEnumerable<object[]> AlphabetTestCases =

View File

@ -149,6 +149,16 @@ namespace NAPS2.Sdk.Tests {
}
}
/// <summary>
///   Gets the "word_ocr_test" resource as a byte array, looked up through
///   ResourceManager using the configured resourceCulture.
/// </summary>
internal static byte[] word_ocr_test =>
    (byte[]) ResourceManager.GetObject("word_ocr_test", resourceCulture);
/// <summary>
/// Looks up a localized resource of type System.Byte[].
/// </summary>

View File

@ -133,6 +133,9 @@
<data name="word_p2_bw" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>Resources\word_p2_bw.png;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>
<data name="word_ocr_test" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>Resources\word_ocr_test.pdf;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>
<data name="image_pdf" type="System.Resources.ResXFileRef, System.Windows.Forms">
<value>Resources\image_pdf.pdf;System.Byte[], mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</data>

Binary file not shown.

View File

@ -8,7 +8,6 @@ using NAPS2.Pdf.Pdfium;
using NAPS2.Scan;
using PdfSharpCore.Drawing;
using PdfSharpCore.Drawing.Layout;
using PdfSharpCore.Fonts;
using PdfSharpCore.Pdf;
using PdfSharpCore.Pdf.IO;
using PdfSharpCore.Pdf.Security;
@ -63,7 +62,7 @@ public class PdfExporter
var document = InitializeDocument(exportParams);
// TODO: Consider storing text from imported image-based pages in PostProcessingData so it can be saved even
// when not exporting with OCR (assuming no transforms).
// when not exporting with OCR (assuming no transforms).
var ocrEngine = GetOcrEngine(ocrParams);
var imagePages = new List<PageExportState>();
@ -148,8 +147,8 @@ public class PdfExporter
});
}
private bool MergePassthroughPages(MemoryStream stream, OutputPathOrStream output, List<PageExportState> passthroughPages,
PdfExportParams exportParams, ProgressHandler progress)
private bool MergePassthroughPages(MemoryStream stream, OutputPathOrStream output,
List<PageExportState> passthroughPages, PdfExportParams exportParams, ProgressHandler progress)
{
if (!passthroughPages.Any())
{
@ -167,6 +166,8 @@ public class PdfExporter
var password = exportParams.Encryption.EncryptPdf ? exportParams.Encryption.OwnerPassword : null;
using var destDoc =
Pdfium.PdfDocument.Load(destHandle.AddrOfPinnedObject(), (int) stream.Length, password);
using var fontSubsets =
new PdfiumFontSubsets(destDoc, passthroughPages.Select(state => state.OcrTask?.Result));
foreach (var state in passthroughPages)
{
destDoc.DeletePage(state.PageIndex);
@ -193,7 +194,7 @@ public class PdfExporter
if (state.OcrTask?.Result != null)
{
using var page = destDoc.GetPage(state.PageIndex);
DrawOcrTextOnPdfiumPage(state.Page, destDoc, page, state.OcrTask.Result);
DrawOcrTextOnPdfiumPage(state.Page, destDoc, page, fontSubsets, state.OcrTask.Result);
}
if (progress.IsCancellationRequested) return false;
}
@ -341,7 +342,7 @@ public class PdfExporter
if (!_scanningContext.OcrRequestQueue.HasCachedResult(state.OcrEngine!, state.Image, state.OcrParams!))
{
// Save the image to a file for use in OCR.
// We don't need to delete this file as long as we pass it to OcrRequestQueue.Enqueue, which takes
// We don't need to delete this file as long as we pass it to OcrRequestQueue.Enqueue, which takes
// ownership and guarantees its eventual deletion.
using var fileStream = new FileStream(ocrTempFilePath, FileMode.Create, FileAccess.Write);
state.Embedder.CopyToStream(fileStream);
@ -413,38 +414,22 @@ public class PdfExporter
}
private static void DrawOcrTextOnPdfiumPage(PdfPage page, Pdfium.PdfDocument pdfiumDocument,
Pdfium.PdfPage pdfiumPage, OcrResult ocrResult)
Pdfium.PdfPage pdfiumPage, PdfiumFontSubsets fontSubsets, OcrResult ocrResult)
{
using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
var fontCache = new Dictionary<string, PdfFont>();
try
foreach (var element in ocrResult.Elements)
{
foreach (var element in ocrResult.Elements)
{
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
if (info == null) continue;
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
if (info == null) continue;
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
var font = fontCache.GetOrSet(fontName, () =>
{
var fontInfo = GlobalFontSettings.FontResolver.ResolveTypeface(fontName, false, false);
return pdfiumDocument.LoadFont(GlobalFontSettings.FontResolver.GetFont(fontInfo.FaceName));
});
var textObj = pdfiumDocument.NewText(font, info.FontSize);
textObj.TextRenderMode = TextRenderMode.Invisible;
textObj.SetText(info.Text);
// This ends up being slightly different alignment than the PdfSharp-based text. Maybe at some point we can
// try to make them identical, although it's not perfect to begin with.
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
pdfiumPage.InsertObject(textObj);
}
}
finally
{
foreach (var font in fontCache.Values)
{
font.Dispose();
}
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
var textObj = pdfiumDocument.NewText(fontSubsets[fontName], info.FontSize);
textObj.TextRenderMode = TextRenderMode.Invisible;
textObj.SetText(info.Text);
// This ends up being slightly different alignment than the PdfSharp-based text. Maybe at some point we can
// try to make them identical, although it's not perfect to begin with.
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
pdfiumPage.InsertObject(textObj);
}
pdfiumPage.GenerateContent();
}

View File

@ -0,0 +1,39 @@
using NAPS2.Ocr;
using NAPS2.Pdf.Pdfium;
using PdfSharpCore.Utils;
namespace NAPS2.Pdf;
/// <summary>
/// Creates and manages the lifetime of font subsets for Pdfium exporting.
/// </summary>
/// <remarks>
/// One subset is built per font name returned by <c>PdfFontPicker.GetBestFont</c>. Each subset contains the
/// glyphs added from the text of every OCR element that maps to that font. The loaded fonts are owned by this
/// object and are disposed together with it.
/// </remarks>
internal class PdfiumFontSubsets : IDisposable
{
    // Loaded Pdfium fonts keyed by font name. Populated only by the constructor.
    private readonly Dictionary<string, PdfFont> _fonts = new Dictionary<string, PdfFont>();

    /// <summary>
    /// Builds a font subset for every font needed by the given OCR results and loads each one into the document.
    /// </summary>
    /// <param name="pdfiumDocument">The Pdfium document the font subsets are loaded into.</param>
    /// <param name="ocrResults">The OCR results whose text will be drawn; null entries are skipped.</param>
    public PdfiumFontSubsets(PdfDocument pdfiumDocument, IEnumerable<OcrResult?> ocrResults)
    {
        var fontSubsetBuilders = new Dictionary<string, FontSubsetBuilder>();
        foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Elements))
        {
            // Map the OCR language to a font that supports its glyphs
            var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
            // TODO: What happens if the font name isn't found?
            var builder = fontSubsetBuilders.GetOrSet(fontName, () => new FontSubsetBuilder(fontName));
            // Include the glyphs from the current text in the font subset
            builder.AddGlyphs(element.Text);
        }

        // Load each font subset into Pdfium. The fonts are added one at a time so that, if building or loading a
        // later subset throws, the fonts that were already loaded can be released. The caller never receives this
        // instance when the constructor throws, so nothing else would dispose them.
        try
        {
            foreach (var kvp in fontSubsetBuilders)
            {
                _fonts.Add(kvp.Key, pdfiumDocument.LoadFont(kvp.Value.Build()));
            }
        }
        catch
        {
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Gets the loaded font subset for the given font name.
    /// </summary>
    /// <param name="fontName">A font name as returned by <c>PdfFontPicker.GetBestFont</c>.</param>
    /// <exception cref="KeyNotFoundException">No subset was created for <paramref name="fontName"/>.</exception>
    public PdfFont this[string fontName] => _fonts[fontName];

    /// <summary>
    /// Disposes every font subset that was loaded by the constructor.
    /// </summary>
    public void Dispose()
    {
        foreach (var font in _fonts.Values)
        {
            font.Dispose();
        }
    }
}