Partially fix Pdfium PDF/A compliance and improve tests

This commit is contained in:
Ben Olden-Cooligan 2024-04-06 13:57:32 -07:00
parent 8e3ea2a4d8
commit 3490a5234a
7 changed files with 116 additions and 33 deletions

View File

@ -30,6 +30,10 @@ public static class PdfAsserts
public static async Task AssertCompliant(string profile, string filePath)
{
if (string.IsNullOrEmpty(profile))
{
return;
}
Assert.True(File.Exists(filePath));
var report = await LazyPdfAValidator.Value.ValidateWithDetailedReportAsync(filePath);
Assert.True(report.Jobs.Job.ValidationReport.IsCompliant);
@ -128,4 +132,13 @@ public static class PdfAsserts
$"Expected filters: {string.Join(",", filters)}, actual: {string.Join(",", obj.GetImageFilters())}");
}
}
/// <summary>
/// Asserts that the PDF at <paramref name="filePath"/> reports the expected file version when loaded
/// through Pdfium.
/// </summary>
/// <param name="version">The expected value of the loaded document's <c>Version</c> property.</param>
/// <param name="filePath">Path of the PDF file to load.</param>
public static void AssertVersion(int version, string filePath)
{
    // The Pdfium document is loaded and read while holding the lock on the native library instance.
    lock (PdfiumNativeLibrary.Instance)
    {
        using (var document = PdfDocument.Load(filePath))
        {
            var actualVersion = document.Version;
            Assert.Equal(version, actualVersion);
        }
    }
}
}

View File

@ -1,35 +1,76 @@
using NAPS2.Ocr;
using NAPS2.Pdf;
using NAPS2.Sdk.Tests.Asserts;
using Xunit;
namespace NAPS2.Sdk.Tests.Pdf;
// TODO: Validate with OCR output
// TODO: Maaaybe validate with external import? We certainly can't guarantee it, but maybe some cases can be verified for best effort
public class PdfATests : ContextualTests
{
// Sadly the pdfa verifier library only supports windows/mac
[PlatformFact(exclude: PlatformFlags.Mac)]
public async Task Validate()
{
var pdfExporter = new PdfExporter(ScanningContext);
var testCases = new (PdfCompat pdfCompat, string profile, string fileName)[]
{
(PdfCompat.PdfA1B, "PDF/A-1B", "pdfa1b_test.pdf"),
(PdfCompat.PdfA2B, "PDF/A-2B", "pdfa2b_test.pdf"),
(PdfCompat.PdfA3B, "PDF/A-3B", "pdfa3b_test.pdf"),
(PdfCompat.PdfA3U, "PDF/A-3U", "pdfa3u_test.pdf")
};
private readonly PdfExporter _pdfExporter;
private readonly string _path;
private readonly string _importPath;
var tasks = testCases.Select(testCase =>
{
using var image = CreateScannedImage();
var path = Path.Combine(FolderPath, testCase.fileName);
pdfExporter.Export(path, new[] { image }, new PdfExportParams
{
Compat = testCase.pdfCompat
}).Wait();
return PdfAsserts.AssertCompliant(testCase.profile, path);
}).ToArray();
await Task.WhenAll(tasks);
/// <summary>
/// Sets up the exporter under test, the output path for the exported PDF, and a copy of the sample
/// PDF resource that the import-based test reads from.
/// </summary>
public PdfATests()
{
    const string exportFileName = "test.pdf";
    const string importFileName = "word.pdf";

    _pdfExporter = new PdfExporter(ScanningContext);
    _path = Path.Combine(FolderPath, exportFileName);
    _importPath = CopyResourceToFile(PdfResources.word_patcht_pdf, importFileName);
}
// Sadly the pdfa verifier library only supports windows/linux
/// <summary>
/// Exports a single scanned image with the given compatibility setting, then checks the resulting
/// file's PDF version and (when a profile is given) its PDF/A compliance.
/// </summary>
/// <param name="pdfCompat">Compatibility mode passed to the exporter.</param>
/// <param name="profile">Expected PDF/A profile name, or an empty string to skip the compliance check.</param>
/// <param name="version">Expected PDF file version of the exported document.</param>
[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task Validate(PdfCompat pdfCompat, string profile, int version)
{
    // Own the generated image so it is disposed when the test finishes instead of being leaked.
    using var image = CreateScannedImage();

    await _pdfExporter.Export(_path, new[] { image }, new PdfExportParams
    {
        Compat = pdfCompat
    });

    PdfAsserts.AssertVersion(version, _path);
    await PdfAsserts.AssertCompliant(profile, _path);
}
/// <summary>
/// Same as <c>Validate</c>, but exports with OCR enabled (using the fake OCR engine) so that the
/// generated text layer is covered by the version and compliance checks as well.
/// </summary>
/// <param name="pdfCompat">Compatibility mode passed to the exporter.</param>
/// <param name="profile">Expected PDF/A profile name, or an empty string to skip the compliance check.</param>
/// <param name="version">Expected PDF file version of the exported document.</param>
[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task ValidateWithOcr(PdfCompat pdfCompat, string profile, int version)
{
    SetUpFakeOcr(ifNoMatch: "hello world");

    // Own the generated image so it is disposed when the test finishes instead of being leaked.
    using var image = CreateScannedImage();

    await _pdfExporter.Export(_path, new[] { image }, new PdfExportParams
    {
        Compat = pdfCompat
    }, new OcrParams("eng"));

    PdfAsserts.AssertVersion(version, _path);
    await PdfAsserts.AssertCompliant(profile, _path);
}
/// <summary>
/// Imports pages from an existing PDF and re-exports them with the given compatibility setting, then
/// checks the resulting file's PDF version and (when a profile is given) its PDF/A compliance.
/// </summary>
/// <param name="pdfCompat">Compatibility mode passed to the exporter.</param>
/// <param name="profile">Expected PDF/A profile name, or an empty string to skip the compliance check.</param>
/// <param name="version">Expected PDF file version of the exported document.</param>
[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task ValidateWithPdfium(PdfCompat pdfCompat, string profile, int version)
{
    var images = await new PdfImporter(ScanningContext).Import(_importPath).ToListAsync();
    try
    {
        await _pdfExporter.Export(_path, images, new PdfExportParams
        {
            Compat = pdfCompat
        });

        PdfAsserts.AssertVersion(version, _path);
        await PdfAsserts.AssertCompliant(profile, _path);
    }
    finally
    {
        // Release the imported pages even when the export or an assertion fails.
        foreach (var image in images)
        {
            image.Dispose();
        }
    }
}
// Note that we don't have a Pdfium OCR test as we fail compliance due to the way Pdfium embeds fonts, which isn't
// practical to fix.
/// <summary>
/// Theory data shared by the tests in this class. Each row is
/// (compatibility mode, expected PDF/A profile name, expected PDF file version); an empty profile
/// means the compliance check is skipped for that row.
/// </summary>
// Declared readonly so the shared test data can't be reassigned by other code.
public static readonly IEnumerable<object[]> TestCases =
[
    [PdfCompat.Default, "", 14],
    [PdfCompat.PdfA1B, "PDF/A-1B", 14],
    [PdfCompat.PdfA2B, "PDF/A-2B", 17],
    [PdfCompat.PdfA3B, "PDF/A-3B", 17],
    [PdfCompat.PdfA3U, "PDF/A-3U", 17]
];
}

View File

@ -0,0 +1,19 @@
using System.Runtime.InteropServices;
using Xunit;
namespace NAPS2.Sdk.Tests;
/// <summary>
/// A theory attribute that sets <c>Skip</c> based on the flags returned by
/// <c>CurrentPlatformFlags.Get()</c>.
/// </summary>
public sealed class PlatformTheoryAttribute : TheoryAttribute
{
    /// <summary>
    /// Creates the attribute and sets <c>Skip</c> when the current platform doesn't qualify.
    /// </summary>
    /// <param name="include">If not <c>None</c>, all of these flags must be set for the current platform.</param>
    /// <param name="exclude">If not <c>None</c>, none of these flags may be set for the current platform.</param>
    public PlatformTheoryAttribute(PlatformFlags include = PlatformFlags.None, PlatformFlags exclude = PlatformFlags.None)
    {
        // When both conditions apply, the exclusion message is the one reported.
        if (exclude != PlatformFlags.None && HasAnyFlag(exclude))
        {
            Skip = $"Doesn't run on platform(s): {exclude}";
            return;
        }

        if (include != PlatformFlags.None && !HasAllFlags(include))
        {
            Skip = $"Only runs on platform(s): {include}";
        }
    }

    // True when every flag in the given set is present in the current platform flags.
    private static bool HasAllFlags(PlatformFlags flags) =>
        (CurrentPlatformFlags.Get() & flags) == flags;

    // True when at least one flag in the given set is present in the current platform flags.
    private static bool HasAnyFlag(PlatformFlags flags) =>
        (CurrentPlatformFlags.Get() & flags) != PlatformFlags.None;
}

View File

@ -6,12 +6,12 @@ namespace NAPS2.Pdf;
internal static class PdfAHelper
{
public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat)
public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat, string producer)
{
var metadataDict = new PdfDictionary(document);
metadataDict.Elements["/Type"] = new PdfName("/Metadata");
metadataDict.Elements["/Subtype"] = new PdfName("/XML");
metadataDict.CreateStream(CreateRawXmpMetadata(document.Info, GetConformance(compat)));
metadataDict.CreateStream(CreateRawXmpMetadata(document.Info, GetConformance(compat), producer));
document.Internals.AddObject(metadataDict);
document.Internals.Catalog.Elements["/Metadata"] = metadataDict.Reference;
}
@ -33,7 +33,8 @@ internal static class PdfAHelper
}
}
private static byte[] CreateRawXmpMetadata(PdfDocumentInformation info, (string, string) conformance)
private static byte[] CreateRawXmpMetadata(PdfDocumentInformation info, (string, string) conformance,
string producer)
{
string xml = $@"<?xpacket begin=""{'\ufeff'}"" id=""W5M0MpCehiHzreSzNTczkc9d""?>
<x:xmpmeta xmlns:x=""adobe:ns:meta/"" x:xmptk=""Adobe XMP Core 5.1.0-jc003"">
@ -45,7 +46,7 @@ internal static class PdfAHelper
xmlns:pdfaid=""http://www.aiim.org/pdfa/ns/id/""
dc:format=""application/pdf""
pdf:Keywords=""{info.Keywords}""
pdf:Producer=""{PdfSharpCore.ProductVersionInfo.Producer}""
pdf:Producer=""{producer}""
xmp:CreateDate=""{info.CreationDate:yyyy'-'MM'-'dd'T'HH':'mm':'ssK}""
xmp:ModifyDate=""{info.ModificationDate:yyyy'-'MM'-'dd'T'HH':'mm':'ssK}""
xmp:CreatorTool=""{info.Creator}""

View File

@ -23,6 +23,7 @@ public class PdfExporter
{
private const int PDF_VERSION_14 = 14;
private const int PDF_VERSION_17 = 17;
private const string PDFIUM_PRODUCER = "PDFium";
private readonly ScanningContext _scanningContext;
private readonly ILogger _logger;
@ -133,8 +134,9 @@ public class PdfExporter
await pdfPagesOcrPipeline;
if (progress.IsCancellationRequested) return false;
var producer = pdfPages.Any() ? PDFIUM_PRODUCER : PdfSharpCore.ProductVersionInfo.Producer;
// TODO: Doing in memory as that's presumably faster than IO, but of course that's quite a bit of memory use potentially...
var stream = FinalizeAndSaveDocument(document, exportParams);
var stream = FinalizeAndSaveDocument(document, exportParams, producer);
if (progress.IsCancellationRequested) return false;
return MergePassthroughPages(stream, output, pdfPages, exportParams, progress);
@ -296,7 +298,8 @@ public class PdfExporter
return state;
}
private static MemoryStream FinalizeAndSaveDocument(PdfDocument document, PdfExportParams exportParams)
private static MemoryStream FinalizeAndSaveDocument(PdfDocument document, PdfExportParams exportParams,
string producer)
{
var compat = exportParams.Compat;
var now = DateTime.Now;
@ -312,7 +315,7 @@ public class PdfExporter
{
PdfAHelper.SetColorProfile(document);
PdfAHelper.SetCidMap(document);
PdfAHelper.CreateXmpMetadata(document, compat);
PdfAHelper.CreateXmpMetadata(document, compat, producer);
}
document.Version = compat switch
@ -465,7 +468,8 @@ public class PdfExporter
while (true)
{
var font = new XFont(lineFontFamily, lineFontSize + 1, XFontStyle.Regular);
if (eligibleWords.All(word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
if (eligibleWords.All(
word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
{
lineFontSize++;
}

View File

@ -43,6 +43,8 @@ internal class PdfDocument : NativePdfiumObject
public int PageCount => Native.FPDF_GetPageCount(Handle);
/// <summary>
/// The file version reported by the native <c>FPDF_GetFileVersion</c> call, or <c>null</c> when that
/// call returns false.
/// </summary>
public int? Version
{
    get
    {
        var succeeded = Native.FPDF_GetFileVersion(Handle, out var fileVersion);
        if (!succeeded)
        {
            return null;
        }

        return fileVersion;
    }
}
public PdfPage GetPage(int pageIndex)
{
return new PdfPage(Native.FPDF_LoadPage(Handle, pageIndex), this, pageIndex);

View File

@ -73,6 +73,8 @@ internal class PdfiumNativeLibrary : Unmanaged.NativeLibrary
public delegate bool FPDF_SaveAsCopy_delegate(IntPtr document, ref FPDF_FileWrite fileWrite, int flags);
// Signature for the native FPDF_GetFileVersion call: takes a document handle and writes the file
// version to fileVersion; callers treat a false return as "no version available".
// NOTE(review): PDFium documents the value as e.g. 14 for PDF 1.4 - confirm against fpdfview.h.
public delegate bool FPDF_GetFileVersion_delegate(IntPtr document, out int fileVersion);
public delegate IntPtr FPDF_GetMetaText_delegate(IntPtr document, [MarshalAs(UnmanagedType.LPStr)] string tag,
byte[]? buffer, IntPtr buflen);
@ -214,6 +216,7 @@ internal class PdfiumNativeLibrary : Unmanaged.NativeLibrary
public FPDF_LoadMemDocument_delegate FPDF_LoadMemDocument => Load<FPDF_LoadMemDocument_delegate>();
public FPDF_CloseDocument_delegate FPDF_CloseDocument => Load<FPDF_CloseDocument_delegate>();
public FPDF_SaveAsCopy_delegate FPDF_SaveAsCopy => Load<FPDF_SaveAsCopy_delegate>();
// Delegate for FPDF_GetFileVersion, obtained through Load<T>() like the neighbouring bindings.
public FPDF_GetFileVersion_delegate FPDF_GetFileVersion => Load<FPDF_GetFileVersion_delegate>();
public FPDF_GetMetaText_delegate FPDF_GetMetaText => Load<FPDF_GetMetaText_delegate>();
public FPDF_GetPageCount_delegate FPDF_GetPageCount => Load<FPDF_GetPageCount_delegate>();
public FPDF_LoadPage_delegate FPDF_LoadPage => Load<FPDF_LoadPage_delegate>();