From 79bba703703b2c7ca0dda670e7e462880cd216ef Mon Sep 17 00:00:00 2001
From: Ben Olden-Cooligan <ben.cyanfish@gmail.com>
Date: Tue, 26 Mar 2024 19:05:17 -0700
Subject: [PATCH] Improve OCR text alignment

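This is nearly a full rewrite of the alignment code. Text position is now based on the line baseline (provided by Tesseract). The font size now defaults to the value Tesseract provides and is then adjusted: it is grown while the longer words on the line still fit within their bounds, and shrunk per word so that a space fits before the next word.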

The goals were:
- Have Ctrl+F highlight the word as accurately as possible.
- Have Ctrl+A/Ctrl+C end up with text that matches the original as closely as possible.
- Have PdfSharp and Pdfium produce consistent output.
On my test cases all goals are fully met.

#236
---
 NAPS2.App.Tests/Appium/ImportAndSaveTests.cs  |   2 +-
 NAPS2.Sdk.Tests/ContextualTests.cs            |   9 +-
 NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs   |   4 +-
 .../Ocr/TesseractOcrEngineTests.cs            |  22 +--
 NAPS2.Sdk/Ocr/OcrResult.cs                    |  15 +-
 NAPS2.Sdk/Ocr/OcrResultElement.cs             |  13 +-
 NAPS2.Sdk/Ocr/TesseractOcrEngine.cs           | 162 ++++++++++++------
 NAPS2.Sdk/Pdf/PdfExporter.cs                  | 154 +++++++++++------
 NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs            |   2 +-
 9 files changed, 252 insertions(+), 131 deletions(-)
diff --git a/NAPS2.App.Tests/Appium/ImportAndSaveTests.cs b/NAPS2.App.Tests/Appium/ImportAndSaveTests.cs
index d7707c2c0..3b7d914d2 100644
--- a/NAPS2.App.Tests/Appium/ImportAndSaveTests.cs
+++ b/NAPS2.App.Tests/Appium/ImportAndSaveTests.cs
@@ -52,7 +52,7 @@ public class ImportAndSaveTests : AppiumTests
         PdfAsserts.AssertContainsTextOnce("Page one.", path);
         PdfAsserts.AssertContainsTextOnce("Page two.", path);
         PdfAsserts.AssertContainsTextOnce("ADVERTISEMENT.", path);
-        PdfAsserts.AssertContainsTextOnce("Patch Code separator sheet geometry", path);
+        PdfAsserts.AssertContainsTextOnce("Sized for printing unscaled", path);
         AppTestHelper.AssertNoErrorLog(FolderPath);
     }
 
diff --git a/NAPS2.Sdk.Tests/ContextualTests.cs b/NAPS2.Sdk.Tests/ContextualTests.cs
index 3e876cbf4..c22d63b09 100644
--- a/NAPS2.Sdk.Tests/ContextualTests.cs
+++ b/NAPS2.Sdk.Tests/ContextualTests.cs
@@ -89,10 +89,13 @@ public class ContextualTests : IDisposable
                     var ocrImage = ImageContext.Load(path);
                     await Task.Delay(delay);
 
-                    OcrResult CreateOcrResult(string text) => new((0, 0, 100, 100),
-                        ImmutableList.Create(
+                    OcrResult CreateOcrResult(string text)
+                    {
+                        var list = ImmutableList.Create(
                             new OcrResultElement(text, ocrParams.LanguageCode!, false,
-                                (10, 10, 10, 10))));
+                                (10, 10, 10, 10), 0, 10, ImmutableList<OcrResultElement>.Empty));
+                        return new((0, 0, 100, 100), list, list);
+                    }
 
                     if (ocrTextByImage != null)
                     {
diff --git a/NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs b/NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs
index a11918fa9..2ee436300 100644
--- a/NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs
+++ b/NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs
@@ -375,8 +375,8 @@ public class OcrRequestQueueTests : ContextualTests
 
     private static OcrResult CreateOcrResult()
     {
-        var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1));
-        return new OcrResult((0, 0, 1, 1), ImmutableList<OcrResultElement>.Empty.Add(uniqueElement));
+        var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1), 0, 10, ImmutableList<OcrResultElement>.Empty);
+        return new OcrResult((0, 0, 1, 1), ImmutableList.Create(uniqueElement), ImmutableList.Create(uniqueElement));
     }
 
     private static OcrParams CreateOcrParams()
diff --git a/NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs b/NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
index c7ad33b2c..4f86ae0dc 100644
--- a/NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
+++ b/NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
@@ -26,17 +26,17 @@ public class TesseractOcrEngineTests : ContextualTests
         var ocrParams = new OcrParams("eng", OcrMode.Fast, 0);
         var result = await _engine.ProcessImage(ScanningContext, _testImagePath, ocrParams, CancellationToken.None);
         Assert.NotNull(result);
-        Assert.NotEmpty(result.Elements);
-        foreach (var element in result.Elements)
+        Assert.NotEmpty(result.Words);
+        foreach (var element in result.Words)
         {
             Assert.Equal("eng", element.LanguageCode);
             Assert.False(element.RightToLeft);
         }
-        Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
-        Assert.InRange(result.Elements[0].Bounds.x, 139, 149);
-        Assert.InRange(result.Elements[0].Bounds.y, 26, 36);
-        Assert.InRange(result.Elements[0].Bounds.w, 237, 247);
-        Assert.InRange(result.Elements[0].Bounds.h, 17, 27);
+        Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);
+        Assert.InRange(result.Words[0].Bounds.x, 139, 149);
+        Assert.InRange(result.Words[0].Bounds.y, 26, 36);
+        Assert.InRange(result.Words[0].Bounds.w, 237, 247);
+        Assert.InRange(result.Words[0].Bounds.h, 17, 27);
     }
 
     [Fact]
@@ -44,13 +44,13 @@ public class TesseractOcrEngineTests : ContextualTests
     {
         var result = await _engine.ProcessImage(ScanningContext, _testImagePathHebrew, new OcrParams("heb", OcrMode.Fast, 0), CancellationToken.None);
         Assert.NotNull(result);
-        Assert.NotEmpty(result.Elements);
-        foreach (var element in result.Elements)
+        Assert.NotEmpty(result.Words);
+        foreach (var element in result.Words)
         {
             Assert.Equal("heb", element.LanguageCode);
             Assert.True(element.RightToLeft);
         }
-        Assert.Equal("הקדמת", result.Elements[0].Text);
+        Assert.Equal("הקדמת", result.Words[0].Text);
     }
 
     [Fact(Skip = "flaky")]
@@ -97,6 +97,6 @@ public class TesseractOcrEngineTests : ContextualTests
         var mode = OcrMode.Best;
         var result = await _engine.ProcessImage(ScanningContext, _testImagePath, new OcrParams("eng", mode, 0), CancellationToken.None);
         Assert.NotNull(result);
-        Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
+        Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);
     }
 }
\ No newline at end of file
diff --git a/NAPS2.Sdk/Ocr/OcrResult.cs b/NAPS2.Sdk/Ocr/OcrResult.cs
index add3eae2c..0eda6dee2 100644
--- a/NAPS2.Sdk/Ocr/OcrResult.cs
+++ b/NAPS2.Sdk/Ocr/OcrResult.cs
@@ -5,15 +5,14 @@ namespace NAPS2.Ocr;
 /// <summary>
 /// The result of an OCR request. Contains a set of elements that represent text segments. 
 /// </summary>
-public class OcrResult
+public class OcrResult(
+    (int x, int y, int w, int h) pageBounds,
+    ImmutableList<OcrResultElement> words,
+    ImmutableList<OcrResultElement> lines)
 {
-    public OcrResult((int x, int y, int w, int h) pageBounds, ImmutableList<OcrResultElement> elements)
-    {
-        PageBounds = pageBounds;
-        Elements = elements;
-    }
+    public (int x, int y, int w, int h) PageBounds { get; } = pageBounds;
 
-    public (int x, int y, int w, int h) PageBounds { get; }
+    public ImmutableList<OcrResultElement> Words { get; } = words;
 
-    public ImmutableList<OcrResultElement> Elements { get; }
+    public ImmutableList<OcrResultElement> Lines { get; } = lines;
 }
\ No newline at end of file
diff --git a/NAPS2.Sdk/Ocr/OcrResultElement.cs b/NAPS2.Sdk/Ocr/OcrResultElement.cs
index ddbedee3a..ad312367f 100644
--- a/NAPS2.Sdk/Ocr/OcrResultElement.cs
+++ b/NAPS2.Sdk/Ocr/OcrResultElement.cs
@@ -1,6 +1,15 @@
-﻿namespace NAPS2.Ocr;
+﻿using System.Collections.Immutable;
+
+namespace NAPS2.Ocr;
 
 /// <summary>
 /// A element in the result of an OCR request that represents a text segment.
 /// </summary>
-public record OcrResultElement(string Text, string LanguageCode, bool RightToLeft, (int x, int y, int w, int h) Bounds);
\ No newline at end of file
+public record OcrResultElement(
+    string Text,
+    string LanguageCode,
+    bool RightToLeft,
+    (int x, int y, int w, int h) Bounds,
+    int Baseline, // absolute y coordinate of the text baseline, in the same coordinate space as Bounds
+    int FontSize, // font size reported by the OCR engine; TesseractOcrEngine uses 0 when none is reported
+    ImmutableList<OcrResultElement> Children); // TesseractOcrEngine: a line's words; empty for word elements
\ No newline at end of file
diff --git a/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs b/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs
index 17de9e249..5fa1cb0df 100644
--- a/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs
+++ b/NAPS2.Sdk/Ocr/TesseractOcrEngine.cs
@@ -4,6 +4,7 @@ using System.Xml;
 using Microsoft.Extensions.Logging;
 using NAPS2.Scan;
 using NAPS2.Unmanaged;
+using Bounds = (int x, int y, int w, int h);
 
 namespace NAPS2.Ocr;
 
@@ -74,10 +75,11 @@ public class TesseractOcrEngine : IOcrEngine
             {
                 PreProcessImage(scanningContext, imagePath);
             }
+            var configVals = "-c tessedit_create_hocr=1 -c hocr_font_info=1";
             var startInfo = new ProcessStartInfo
             {
                 FileName = _tesseractPath,
-                Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} hocr",
+                Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} {configVals}",
                 UseShellExecute = false,
                 CreateNoWindow = true,
                 RedirectStandardOutput = true,
@@ -92,8 +94,6 @@ public class TesseractOcrEngine : IOcrEngine
                     languageDataPath = Path.Combine(languageDataPath, subfolder);
                 }
                 startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
-                var tessdata = new DirectoryInfo(languageDataPath);
-                EnsureHocrConfigExists(tessdata);
             }
             var tesseractProcess = Process.Start(startInfo);
             if (tesseractProcess == null)
@@ -150,22 +150,7 @@ public class TesseractOcrEngine : IOcrEngine
                 }
 #endif
             XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
-            var pageBounds = hocrDocument.Descendants()
-                .Where(x => x.Attributes("class").Any(y => y.Value == "ocr_page"))
-                .Select(x => GetBounds(x.Attribute("title")))
-                .First();
-            var elements = hocrDocument.Descendants()
-                .Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
-                .Where(x => !string.IsNullOrWhiteSpace(x.Value))
-                .Select(x =>
-                {
-                    var text = x.Value;
-                    var lang = GetNearestAncestorAttribute(x, "lang") ?? "";
-                    var rtl = GetNearestAncestorAttribute(x, "dir") == "rtl";
-                    var bounds = GetBounds(x.Attribute("title"));
-                    return new OcrResultElement(text, lang, rtl, bounds);
-                }).ToImmutableList();
-            return new OcrResult(pageBounds, elements);
+            return CreateOcrResult(hocrDocument);
         }
         catch (XmlException e)
         {
@@ -211,57 +196,136 @@ public class TesseractOcrEngine : IOcrEngine
         }
     }
 
+    private OcrResult CreateOcrResult(XDocument hocrDocument)
+    {
+        var pageBounds = hocrDocument.Descendants()
+            .Where(element => GetClass(element) == "ocr_page")
+            .Select(GetBounds)
+            .First(); // throws if the document contains no ocr_page element
+        var words = new List<OcrResultElement>();
+        var lines = new List<OcrResultElement>();
+        foreach (var lineElement in hocrDocument.Descendants()
+                     .Where(element => GetClass(element) is "ocr_line" or "ocr_header" or "ocr_textfloat"))
+        {
+            var lineBounds = GetBounds(lineElement);
+            var lineAngle = GetTextAngle(lineElement);
+            bool isRotated = lineAngle is >= 45 or <= -45;
+            var baselineParams = GetBaselineParams(lineElement);
+            var lineWords = lineElement.Descendants()
+                .Where(element => GetClass(element) == "ocrx_word")
+                .Where(element => !string.IsNullOrWhiteSpace(element.Value))
+                .Select(wordElement =>
+                {
+                    var wordBounds = GetBounds(wordElement);
+                    return new OcrResultElement(
+                        wordElement.Value,
+                        GetNearestAncestorAttribute(wordElement, "lang") ?? "",
+                        GetNearestAncestorAttribute(wordElement, "dir") == "rtl",
+                        wordBounds,
+                        // Rotated lines use the bottom of the word box. TODO: Maybe we can properly handle rotated text?
+                        isRotated
+                            ? wordBounds.y + wordBounds.h
+                            : CalculateBaseline(baselineParams, lineBounds, wordBounds),
+                        GetFontSize(wordElement),
+                        ImmutableList<OcrResultElement>.Empty);
+                }).ToImmutableList();
+            if (lineWords.Count == 0) continue; // skip lines without any non-blank words
+            words.AddRange(lineWords);
+            lines.Add(lineWords[0] with // the line copies language, direction and font size from its first word
+            {
+                Text = string.Join(" ", lineWords.Select(x => x.Text)),
+                Bounds = lineBounds,
+                Baseline = CalculateBaseline(baselineParams, lineBounds, lineBounds),
+                Children = lineWords
+            });
+        }
+        return new OcrResult(pageBounds, words.ToImmutableList(), lines.ToImmutableList());
+    }
+
     private static string? GetNearestAncestorAttribute(XElement x, string attributeName)
     {
         var ancestor = x.AncestorsAndSelf().FirstOrDefault(x => x.Attribute(attributeName) != null);
         return ancestor?.Attribute(attributeName)?.Value;
     }
 
-    private void EnsureHocrConfigExists(DirectoryInfo tessdata)
+    private string? GetClass(XElement? element)
     {
-        try
-        {
-            var configDir = new DirectoryInfo(Path.Combine(tessdata.FullName, "configs"));
-            if (!configDir.Exists)
-            {
-                configDir.Create();
-            }
-            var hocrConfigFile = new FileInfo(Path.Combine(configDir.FullName, "hocr"));
-            if (!hocrConfigFile.Exists)
-            {
-                using var writer = hocrConfigFile.CreateText();
-                writer.Write("tessedit_create_hocr 1");
-            }
-        }
-        catch (Exception)
-        {
-            // Possibly contention over creating the file. As long as it's created assume everything is okay.
-            if (!File.Exists(Path.Combine(tessdata.FullName, "configs", "hocr")))
-            {
-                throw;
-            }
-        }
+        return element?.Attribute("class")?.Value;
     }
 
-    private (int x, int y, int w, int h) GetBounds(XAttribute? titleAttr)
+    private bool ParseData(XElement? element, string dataKey, int dataCount, out string[] parts)
     {
-        var bounds = (0, 0, 0, 0);
+        parts = Array.Empty<string>();
+        var titleAttr = element?.Attribute("title"); // title holds ';'-separated "key value value..." segments
         if (titleAttr != null)
         {
             foreach (var param in titleAttr.Value.Split(';'))
             {
-                string[] parts = param.Trim().Split(' ');
-                if (parts.Length == 5 && parts[0] == "bbox")
+                parts = param.Trim().Split(' ');
+                if (parts[0] == dataKey && parts.Length == dataCount + 1) // the key plus exactly dataCount values
                 {
-                    int x1 = int.Parse(parts[1]), y1 = int.Parse(parts[2]);
-                    int x2 = int.Parse(parts[3]), y2 = int.Parse(parts[4]);
-                    bounds = (x1, y1, x2 - x1, y2 - y1);
+                    return true;
                 }
             }
         }
+        return false; // not found; parts may hold an unrelated segment, so callers must check the result
+    }
+
+    private Bounds GetBounds(XElement? element)
+    {
+        var bounds = (0, 0, 0, 0); // returned as-is when the element has no usable bbox entry
+        if (ParseData(element, "bbox", 4, out string[] parts))
+        {
+            int x1 = int.Parse(parts[1]), y1 = int.Parse(parts[2]);
+            int x2 = int.Parse(parts[3]), y2 = int.Parse(parts[4]);
+            bounds = (x1, y1, x2 - x1, y2 - y1); // two corners converted to x, y, width, height
+        }
         return bounds;
     }
 
+    private int GetFontSize(XElement? element)
+    {
+        // x_fsize is machine-generated hOCR data, so parse it culture-invariantly. 0 means "not reported".
+        if (ParseData(element, "x_fsize", 1, out string[] parts))
+        {
+            return int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
+        }
+        return 0;
+    }
+
+    private (float m, float b) GetBaselineParams(XElement? element)
+    {
+        // Tesseract writes the baseline slope/offset with '.' as the decimal separator, so parse with the invariant
+        // culture. The current culture could misread e.g. "0.012" in locales where '.' is a group separator.
+        var culture = System.Globalization.CultureInfo.InvariantCulture;
+        if (!ParseData(element, "baseline", 2, out string[] parts))
+        {
+            return (0, 0);
+        }
+        return (float.Parse(parts[1], culture), float.Parse(parts[2], culture));
+    }
+
+    private float GetTextAngle(XElement? element)
+    {
+        // Parse with the invariant culture: the hOCR angle uses '.' as decimal separator in every locale.
+        if (ParseData(element, "textangle", 1, out string[] parts))
+        {
+            return float.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
+        }
+        return 0;
+    }
+
+    private int CalculateBaseline((float m, float b) baselineParams, Bounds lineBounds, Bounds elementBounds)
+    {
+        // The line baseline is a linear equation (y = mx + b) relative to the bottom-left corner of the line
+        // bounds. We evaluate it at the element's horizontal midpoint and convert to an absolute y coordinate.
+        float midpoint = elementBounds.x + elementBounds.w / 2f;
+        int relativeBaseline = (int) Math.Round(baselineParams.b +
+                                                baselineParams.m * (midpoint - lineBounds.x));
+        int absoluteBaseline = relativeBaseline + lineBounds.y + lineBounds.h;
+        return absoluteBaseline;
+    }
+
     // TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
 //     private void CheckIfInstalled()
 //     {
diff --git a/NAPS2.Sdk/Pdf/PdfExporter.cs b/NAPS2.Sdk/Pdf/PdfExporter.cs
index ebb837274..f73923fa9 100644
--- a/NAPS2.Sdk/Pdf/PdfExporter.cs
+++ b/NAPS2.Sdk/Pdf/PdfExporter.cs
@@ -7,12 +7,12 @@ using NAPS2.Ocr;
 using NAPS2.Pdf.Pdfium;
 using NAPS2.Scan;
 using PdfSharpCore.Drawing;
-using PdfSharpCore.Drawing.Layout;
 using PdfSharpCore.Pdf;
 using PdfSharpCore.Pdf.IO;
 using PdfSharpCore.Pdf.Security;
 using PdfDocument = PdfSharpCore.Pdf.PdfDocument;
 using PdfPage = PdfSharpCore.Pdf.PdfPage;
+using Alphabet = NAPS2.Pdf.PdfFontPicker.Alphabet;
 
 namespace NAPS2.Pdf;
 
@@ -398,20 +398,19 @@ public class PdfExporter
     private static void DrawOcrTextOnPage(PdfPage page, OcrResult ocrResult)
     {
 #if DEBUG && DEBUGOCR
-            using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Append);
+        using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Append);
 #else
         using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
 #endif
-        var tf = new XTextFormatter(gfx);
-        foreach (var element in ocrResult.Elements)
+        foreach (var info in GetOcrTextToDraw(page, ocrResult, gfx))
         {
-            var info = GetTextDrawInfo(page, gfx, ocrResult, element);
-            if (info == null) continue;
+            var font = new XFont(info.FontFamily, info.FontSize, XFontStyle.Regular,
+                new XPdfFontOptions(PdfFontEncoding.Unicode));
 #if DEBUG && DEBUGOCR
             gfx.DrawRectangle(new XPen(XColor.FromArgb(255, 0, 0)), info.Bounds);
-            tf.DrawString(info.Text, info.Font, XBrushes.Blue, info.Bounds);
+            gfx.DrawString(info.Text, font, XBrushes.Blue, info.X, info.Y, XStringFormats.BaseLineLeft);
 #else
-            tf.DrawString(info.Text, info.Font, XBrushes.Transparent, info.Bounds);
+            gfx.DrawString(info.Text, font, XBrushes.Transparent, info.X, info.Y, XStringFormats.BaseLineLeft);
 #endif
         }
     }
@@ -420,13 +419,9 @@ public class PdfExporter
         Pdfium.PdfPage pdfiumPage, PdfiumFontSubsets fontSubsets, OcrResult ocrResult)
     {
         using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
-        foreach (var element in ocrResult.Elements)
+        foreach (var info in GetOcrTextToDraw(page, ocrResult, gfx))
         {
-            var info = GetTextDrawInfo(page, gfx, ocrResult, element);
-            if (info == null) continue;
-
-            var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
-            var textObj = pdfiumDocument.NewText(fontSubsets[fontName], info.FontSize);
+            var textObj = pdfiumDocument.NewText(fontSubsets[info.FontFamily], info.FontSize);
 #if DEBUG && DEBUGOCR
             textObj.FillColor = (0, 0, 255, 255);
 #else
@@ -435,34 +430,78 @@ public class PdfExporter
             textObj.SetText(info.Text);
             // This ends up being slightly different alignment then the PdfSharp-based text. Maybe at some point we can
             // try to make them identical, although it's not perfect to begin with.
-            textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
+            textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - info.Y);
             pdfiumPage.InsertObject(textObj);
         }
         pdfiumPage.GenerateContent();
     }
 
-    private static TextDrawInfo? GetTextDrawInfo(PdfPage page, XGraphics gfx, OcrResult ocrResult,
-        OcrResultElement element)
+    private static IEnumerable<TextDrawInfo> GetOcrTextToDraw(PdfPage page, OcrResult ocrResult, XGraphics gfx)
     {
-        if (string.IsNullOrEmpty(element.Text)) return null;
+        double hAdjust = page.Width / ocrResult.PageBounds.w;
+        double vAdjust = page.Height / ocrResult.PageBounds.h;
+        foreach (var line in ocrResult.Lines)
+        {
+            var lineFontFamily = PdfFontPicker.GetBestFont(line.LanguageCode);
+            var lineFontSize = line.FontSize;
+            // Chinese/Japanese/Korean languages don't need font size alignment as words are generally just 1 char
+            if (!IsCjk(line.LanguageCode))
+            {
+                // Only measure words with at least 3 characters to avoid noise
+                var eligibleWords = line.Children.Where(word => word.Text.Length >= 3).ToList();
+                if (eligibleWords.Count > 1)
+                {
+                    // In case Tesseract underestimated the font size, keep increasing it as long as all words are still
+                    // within their bounds.
+                    while (true)
+                    {
+                        var font = new XFont(lineFontFamily, lineFontSize + 1, XFontStyle.Regular);
+                        if (eligibleWords.All(word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
+                        {
+                            lineFontSize++;
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+                }
+            }
+            for (int i = 0; i < line.Children.Count; i++)
+            {
+                var word = line.Children[i];
+                if (string.IsNullOrEmpty(word.Text)) continue;
 
-        var adjustedBounds = AdjustBounds(element.Bounds, (float) page.Width / ocrResult.PageBounds.w,
-            (float) page.Height / ocrResult.PageBounds.h);
-        var adjustedFontSize = CalculateFontSize(element, adjustedBounds, gfx);
-        // Special case to avoid accidentally recognizing big lines as dashes/underscores
-        if (adjustedFontSize > 100 && (element.Text == "-" || element.Text == "_")) return null;
-        var font = new XFont(PdfFontPicker.GetBestFont(element.LanguageCode), adjustedFontSize, XFontStyle.Regular,
-            new XPdfFontOptions(PdfFontEncoding.Unicode));
-        var adjustedTextSize = gfx.MeasureString(element.Text, font);
-        var verticalOffset = (adjustedBounds.Height - adjustedTextSize.Height) / 2;
-        var horizontalOffset = (adjustedBounds.Width - adjustedTextSize.Width) / 2;
-        adjustedBounds.Offset((float) horizontalOffset, (float) verticalOffset);
+                var rightBound = i + 1 < line.Children.Count ? line.Children[i + 1].Bounds.x : -1;
+                var adjustedRightBound = rightBound * hAdjust;
+                var adjustedX = word.Bounds.x * hAdjust;
+                var adjustedY = word.Baseline * vAdjust;
 
-        return new TextDrawInfo(
-            element.RightToLeft ? ReverseText(element.Text) : element.Text,
-            font,
-            adjustedBounds,
-            adjustedTextSize);
+                // We make sure there's enough distance between this word and the next to fit a space (" "), so that
+                // when you Ctrl+A and Ctrl+C in a PDF file, the words don't blend together
+                var wordFontSize = ClampFontSizeByRightBound(word, lineFontSize, adjustedX, adjustedRightBound, gfx);
+
+                // Special case to avoid accidentally recognizing big lines as dashes/underscores
+                if (wordFontSize > 100 && (word.Text == "-" || word.Text == "_")) continue;
+
+                yield return new TextDrawInfo(
+                    word.RightToLeft ? ReverseText(word.Text) : word.Text,
+                    lineFontFamily,
+                    wordFontSize,
+                    (int) Math.Round(adjustedX),
+                    (int) Math.Round(adjustedY));
+            }
+        }
+    }
+
+    private static bool IsCjk(string langCode)
+    {
+        // True when the language code maps to a Chinese, Japanese or Korean alphabet.
+        return PdfFontPicker.MapLanguageCodeToAlphabet(langCode) switch
+        {
+            Alphabet.ChineseSimplified or Alphabet.ChineseTraditional or Alphabet.Japanese or Alphabet.Korean => true,
+            _ => false
+        };
+    }
 
     private static string ReverseText(string text)
@@ -527,18 +566,34 @@ public class PdfExporter
         return (realWidth, realHeight);
     }
 
-    private static XRect AdjustBounds((int x, int y, int w, int h) bounds, float hAdjust, float vAdjust) =>
-        new XRect(bounds.x * hAdjust, bounds.y * vAdjust, bounds.w * hAdjust, bounds.h * vAdjust);
-
-    private static int CalculateFontSize(OcrResultElement element, XRect adjustedBounds, XGraphics gfx)
+    private static int ClampFontSizeByRightBound(OcrResultElement element, int initialFontSize, double x,
+        double rightBound,
+        XGraphics gfx)
     {
-        int fontSizeGuess = Math.Max(1, (int) (adjustedBounds.Height));
+        var fontSize = initialFontSize;
+        if (IsCjk(element.LanguageCode))
+        {
+            // No word separators so no need to ensure space between words
+            return fontSize;
+        }
+        if (rightBound < 0)
+        {
+            // No word to the right
+            return fontSize;
+        }
         var fontFamily = PdfFontPicker.GetBestFont(element.LanguageCode);
-        var measuredBoundsForGuess =
-            gfx.MeasureString(element.Text, new XFont(fontFamily, fontSizeGuess, XFontStyle.Regular));
-        double adjustmentFactor = adjustedBounds.Width / measuredBoundsForGuess.Width;
-        int adjustedFontSize = Math.Max(1, (int) Math.Floor(fontSizeGuess * adjustmentFactor));
-        return adjustedFontSize;
+        while (fontSize > 2) // shrink until the word plus one space fits before the next word (never below 2)
+        {
+            var font = new XFont(fontFamily, fontSize, XFontStyle.Regular); // one font for both measurements
+            var spaceWidth = gfx.MeasureString(" ", font).Width;
+            var textWidth = gfx.MeasureString(element.Text, font).Width;
+            if (textWidth + x <= rightBound - spaceWidth)
+            {
+                break;
+            }
+            fontSize--;
+        }
+        return fontSize;
     }
 
     private static bool IsPdfStorage(IImageStorage storage) => storage switch
@@ -548,16 +603,7 @@ public class PdfExporter
         _ => false
     };
 
-    private record TextDrawInfo(string Text, XFont Font, XRect Bounds, XSize TextSize)
-    {
-        public int FontSize => (int) Font.Size;
-        public float X => (float) Bounds.X;
-        public float Y => (float) Bounds.Y;
-        public float Width => (float) Bounds.Width;
-        public float Height => (float) Bounds.Height;
-        public float TextWidth => (float) TextSize.Width;
-        public float TextHeight => (float) TextSize.Height;
-    }
+    private record TextDrawInfo(string Text, string FontFamily, int FontSize, int X, int Y);
 
     private class PageExportState
     {
diff --git a/NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs b/NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs
index 8c6d6cb35..178d88231 100644
--- a/NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs
+++ b/NAPS2.Sdk/Pdf/PdfiumFontSubsets.cs
@@ -14,7 +14,7 @@ internal class PdfiumFontSubsets : IDisposable
     public PdfiumFontSubsets(PdfDocument pdfiumDocument, IEnumerable<OcrResult?> ocrResults)
     {
         var fontSubsetBuilders = new Dictionary<string, FontSubsetBuilder>();
-        foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Elements))
+        foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Words))
         {
             // Map the OCR language to a font that supports its glyphs
             var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);