Improve OCR text alignment

This is nearly a full rewrite of the alignment code. Position is now based on the line baseline (provided by Tesseract) and the font size is smarter (defaulting to Tesseract's provided value with various adjustments).

The goals were:
- Have Ctrl+F highlight the word as accurately as possible.
- Have Ctrl+A/Ctrl+C end up with text that matches the original as closely as possible.
- Have PdfSharp and Pdfium produce consistent output.
On my test cases all goals are fully met.

This commit is contained in:
Ben Olden-Cooligan 2024-03-26 19:05:17 -07:00
parent 1abcdd6cc6
commit 79bba70370
9 changed files with 252 additions and 131 deletions

View File

@ -52,7 +52,7 @@ public class ImportAndSaveTests : AppiumTests
PdfAsserts.AssertContainsTextOnce("Page one.", path);
PdfAsserts.AssertContainsTextOnce("Page two.", path);
PdfAsserts.AssertContainsTextOnce("ADVERTISEMENT.", path);
PdfAsserts.AssertContainsTextOnce("Patch Code separator sheet geometry", path);
PdfAsserts.AssertContainsTextOnce("Sized for printing unscaled", path);

View File

@ -89,10 +89,13 @@ public class ContextualTests : IDisposable
var ocrImage = ImageContext.Load(path);
await Task.Delay(delay);
OcrResult CreateOcrResult(string text) => new((0, 0, 100, 100),
OcrResult CreateOcrResult(string text)
var list = ImmutableList.Create(
new OcrResultElement(text, ocrParams.LanguageCode!, false,
(10, 10, 10, 10))));
(10, 10, 10, 10), 0, 10, ImmutableList<OcrResultElement>.Empty));
return new((0, 0, 100, 100), list, list);
if (ocrTextByImage != null)

View File

@ -375,8 +375,8 @@ public class OcrRequestQueueTests : ContextualTests
private static OcrResult CreateOcrResult()
var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1));
return new OcrResult((0, 0, 1, 1), ImmutableList<OcrResultElement>.Empty.Add(uniqueElement));
var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1), 0, 10, ImmutableList<OcrResultElement>.Empty);
return new OcrResult((0, 0, 1, 1), ImmutableList.Create(uniqueElement), ImmutableList.Create(uniqueElement));
private static OcrParams CreateOcrParams()

View File

@ -26,17 +26,17 @@ public class TesseractOcrEngineTests : ContextualTests
var ocrParams = new OcrParams("eng", OcrMode.Fast, 0);
var result = await _engine.ProcessImage(ScanningContext, _testImagePath, ocrParams, CancellationToken.None);
foreach (var element in result.Elements)
foreach (var element in result.Words)
Assert.Equal("eng", element.LanguageCode);
Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
Assert.InRange(result.Elements[0].Bounds.x, 139, 149);
Assert.InRange(result.Elements[0].Bounds.y, 26, 36);
Assert.InRange(result.Elements[0].Bounds.w, 237, 247);
Assert.InRange(result.Elements[0].Bounds.h, 17, 27);
Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);
Assert.InRange(result.Words[0].Bounds.x, 139, 149);
Assert.InRange(result.Words[0].Bounds.y, 26, 36);
Assert.InRange(result.Words[0].Bounds.w, 237, 247);
Assert.InRange(result.Words[0].Bounds.h, 17, 27);
@ -44,13 +44,13 @@ public class TesseractOcrEngineTests : ContextualTests
var result = await _engine.ProcessImage(ScanningContext, _testImagePathHebrew, new OcrParams("heb", OcrMode.Fast, 0), CancellationToken.None);
foreach (var element in result.Elements)
foreach (var element in result.Words)
Assert.Equal("heb", element.LanguageCode);
Assert.Equal("הקדמת", result.Elements[0].Text);
Assert.Equal("הקדמת", result.Words[0].Text);
[Fact(Skip = "flaky")]
@ -97,6 +97,6 @@ public class TesseractOcrEngineTests : ContextualTests
var mode = OcrMode.Best;
var result = await _engine.ProcessImage(ScanningContext, _testImagePath, new OcrParams("eng", mode, 0), CancellationToken.None);
Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);

View File

@ -5,15 +5,14 @@ namespace NAPS2.Ocr;
/// <summary>
/// The result of an OCR request. Contains a set of elements that represent text segments.
/// </summary>
public class OcrResult
public class OcrResult(
(int x, int y, int w, int h) pageBounds,
ImmutableList<OcrResultElement> words,
ImmutableList<OcrResultElement> lines)
public OcrResult((int x, int y, int w, int h) pageBounds, ImmutableList<OcrResultElement> elements)
PageBounds = pageBounds;
Elements = elements;
public (int x, int y, int w, int h) PageBounds { get; } = pageBounds;
public (int x, int y, int w, int h) PageBounds { get; }
public ImmutableList<OcrResultElement> Words { get; } = words;
public ImmutableList<OcrResultElement> Elements { get; }
public ImmutableList<OcrResultElement> Lines { get; } = lines;

View File

@ -1,6 +1,15 @@
namespace NAPS2.Ocr;
using System.Collections.Immutable;
namespace NAPS2.Ocr;
/// <summary>
/// An element in the result of an OCR request that represents a text segment.
/// </summary>
public record OcrResultElement(string Text, string LanguageCode, bool RightToLeft, (int x, int y, int w, int h) Bounds);
// One OCR text segment. The OCR engine produces these both for individual words and for whole lines
// (a line element carries its words in Children).
public record OcrResultElement(
// The recognized text; for a line this is the line's words joined with single spaces.
string Text,
// Language code taken from the nearest enclosing "lang" attribute in the hOCR output ("" if none).
string LanguageCode,
// True when the nearest enclosing "dir" attribute in the hOCR output is "rtl".
bool RightToLeft,
// Bounding box as (left, top, width, height), derived from the hOCR "bbox" corner coordinates.
(int x, int y, int w, int h) Bounds,
// Absolute Y coordinate of the text baseline, in the same coordinate space as Bounds.
int Baseline,
// Font size for the segment. NOTE(review): presumably the hOCR "x_fsize" value; confirm where it is assigned.
int FontSize,
// Child segments: the words of a line. Word elements are created with an empty list.
ImmutableList<OcrResultElement> Children);

View File

@ -4,6 +4,7 @@ using System.Xml;
using Microsoft.Extensions.Logging;
using NAPS2.Scan;
using NAPS2.Unmanaged;
using Bounds = (int x, int y, int w, int h);
namespace NAPS2.Ocr;
@ -74,10 +75,11 @@ public class TesseractOcrEngine : IOcrEngine
PreProcessImage(scanningContext, imagePath);
var configVals = "-c tessedit_create_hocr=1 -c hocr_font_info=1";
var startInfo = new ProcessStartInfo
FileName = _tesseractPath,
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} hocr",
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} {configVals}",
UseShellExecute = false,
CreateNoWindow = true,
RedirectStandardOutput = true,
@ -92,8 +94,6 @@ public class TesseractOcrEngine : IOcrEngine
languageDataPath = Path.Combine(languageDataPath, subfolder);
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
var tessdata = new DirectoryInfo(languageDataPath);
var tesseractProcess = Process.Start(startInfo);
if (tesseractProcess == null)
@ -150,22 +150,7 @@ public class TesseractOcrEngine : IOcrEngine
XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
var pageBounds = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocr_page"))
.Select(x => GetBounds(x.Attribute("title")))
var elements = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
.Where(x => !string.IsNullOrWhiteSpace(x.Value))
.Select(x =>
var text = x.Value;
var lang = GetNearestAncestorAttribute(x, "lang") ?? "";
var rtl = GetNearestAncestorAttribute(x, "dir") == "rtl";
var bounds = GetBounds(x.Attribute("title"));
return new OcrResultElement(text, lang, rtl, bounds);
return new OcrResult(pageBounds, elements);
return CreateOcrResult(hocrDocument);
catch (XmlException e)
@ -211,57 +196,136 @@ public class TesseractOcrEngine : IOcrEngine
private OcrResult CreateOcrResult(XDocument hocrDocument)
var pageBounds = hocrDocument.Descendants()
.Where(element => GetClass(element) == "ocr_page")
var words = new List<OcrResultElement>();
var lines = new List<OcrResultElement>();
foreach (var lineElement in hocrDocument.Descendants()
.Where(element => GetClass(element) is "ocr_line" or "ocr_header" or "ocr_textfloat"))
var lineBounds = GetBounds(lineElement);
var lineAngle = GetTextAngle(lineElement);
bool isRotated = lineAngle is >= 45 or <= -45;
var baselineParams = GetBaselineParams(lineElement);
var lineWords = lineElement.Descendants()
.Where(element => GetClass(element) == "ocrx_word")
.Where(element => !string.IsNullOrWhiteSpace(element.Value))
.Select(wordElement =>
var wordBounds = GetBounds(wordElement);
return new OcrResultElement(
GetNearestAncestorAttribute(wordElement, "lang") ?? "",
GetNearestAncestorAttribute(wordElement, "dir") == "rtl",
// TODO: Maybe we can properly handle rotated text?
? wordBounds.y + wordBounds.h
: CalculateBaseline(baselineParams, lineBounds, wordBounds),
if (lineWords.Count == 0) continue;
lines.Add(lineWords[0] with
Text = string.Join(" ", lineWords.Select(x => x.Text)),
Bounds = lineBounds,
Baseline = CalculateBaseline(baselineParams, lineBounds, lineBounds),
Children = lineWords
return new OcrResult(pageBounds, words.ToImmutableList(), lines.ToImmutableList());
/// <summary>
/// Looks up an attribute on the given element or, failing that, on its closest ancestor that defines it.
/// </summary>
/// <param name="x">The element to start searching from (the element itself is checked first).</param>
/// <param name="attributeName">The name of the attribute to look for.</param>
/// <returns>The attribute value from the nearest element that has it, or null if none does.</returns>
private static string? GetNearestAncestorAttribute(XElement x, string attributeName)
{
    // AncestorsAndSelf enumerates from the element outwards, so the first hit is the nearest one.
    return x.AncestorsAndSelf()
        .Select(node => node.Attribute(attributeName))
        .FirstOrDefault(attribute => attribute != null)
        ?.Value;
}
private void EnsureHocrConfigExists(DirectoryInfo tessdata)
private string? GetClass(XElement? element)
var configDir = new DirectoryInfo(Path.Combine(tessdata.FullName, "configs"));
if (!configDir.Exists)
var hocrConfigFile = new FileInfo(Path.Combine(configDir.FullName, "hocr"));
if (!hocrConfigFile.Exists)
using var writer = hocrConfigFile.CreateText();
writer.Write("tessedit_create_hocr 1");
catch (Exception)
// Possibly contention over creating the file. As long as it's created assume everything is okay.
if (!File.Exists(Path.Combine(tessdata.FullName, "configs", "hocr")))
return element?.Attribute("class")?.Value;
private (int x, int y, int w, int h) GetBounds(XAttribute? titleAttr)
private bool ParseData(XElement? element, string dataKey, int dataCount, out string[] parts)
var bounds = (0, 0, 0, 0);
parts = Array.Empty<string>();
var titleAttr = element?.Attribute("title");
if (titleAttr != null)
foreach (var param in titleAttr.Value.Split(';'))
string[] parts = param.Trim().Split(' ');
if (parts.Length == 5 && parts[0] == "bbox")
parts = param.Trim().Split(' ');
if (parts[0] == dataKey && parts.Length == dataCount + 1)
return true;
return false;
/// <summary>
/// Reads the "bbox" entry from the element's title attribute and converts it to an (x, y, w, h) tuple.
/// </summary>
/// <param name="element">The hOCR element to read from; may be null.</param>
/// <returns>
/// The bounding box as (left, top, width, height), or (0, 0, 0, 0) when no 4-value "bbox" entry is found.
/// </returns>
private Bounds GetBounds(XElement? element)
{
    if (!ParseData(element, "bbox", 4, out string[] fields))
    {
        return (0, 0, 0, 0);
    }

    // The bbox entry holds two corners (left, top, right, bottom); width/height are the differences.
    int left = int.Parse(fields[1]);
    int top = int.Parse(fields[2]);
    int right = int.Parse(fields[3]);
    int bottom = int.Parse(fields[4]);
    return (left, top, right - left, bottom - top);
}
/// <summary>
/// Reads the "x_fsize" entry from the element's title attribute.
/// </summary>
/// <param name="element">The hOCR element to read from; may be null.</param>
/// <returns>The parsed font size, or 0 when the element has no single-value "x_fsize" entry.</returns>
private int GetFontSize(XElement? element) =>
    ParseData(element, "x_fsize", 1, out string[] fields) ? int.Parse(fields[1]) : 0;
/// <summary>
/// Reads the "baseline" entry from the element's title attribute. The two values are the slope (m) and
/// offset (b) of the linear baseline equation y = mx + b used by CalculateBaseline.
/// </summary>
/// <param name="element">The hOCR element to read from; may be null.</param>
/// <returns>The (m, b) pair, or (0, 0) when the element has no 2-value "baseline" entry.</returns>
private (float m, float b) GetBaselineParams(XElement? element)
{
    if (!ParseData(element, "baseline", 2, out string[] parts))
    {
        return (0, 0);
    }

    // The values are machine-generated data, not user-facing text, so they must be parsed with the invariant
    // culture. Parsing with the current culture misreads values like "0.015" (or throws) on systems whose
    // decimal separator is not '.'.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    float m = float.Parse(parts[1], invariant);
    float b = float.Parse(parts[2], invariant);
    return (m, b);
}
/// <summary>
/// Reads the "textangle" entry from the element's title attribute.
/// </summary>
/// <param name="element">The hOCR element to read from; may be null.</param>
/// <returns>The parsed angle, or 0 when the element has no single-value "textangle" entry.</returns>
private float GetTextAngle(XElement? element)
{
    if (!ParseData(element, "textangle", 1, out string[] parts))
    {
        return 0;
    }

    // Machine-generated data: parse with the invariant culture so a fractional angle is read correctly
    // regardless of the system's decimal separator.
    return float.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Computes the absolute Y coordinate of the baseline for an element that sits on a line.
/// </summary>
/// <param name="baselineParams">Slope (m) and offset (b) of the line's baseline equation y = mx + b.</param>
/// <param name="lineBounds">Bounds of the line the baseline equation belongs to.</param>
/// <param name="elementBounds">Bounds of the element (a word, or the line itself) to get the baseline for.</param>
/// <returns>The baseline Y coordinate, rounded to the nearest integer.</returns>
private int CalculateBaseline((float m, float b) baselineParams, Bounds lineBounds, Bounds elementBounds)
{
    var (slope, intercept) = baselineParams;

    // Evaluate the baseline equation at the horizontal centre of the element, with X measured from the
    // left edge of the line.
    float centerX = elementBounds.x + elementBounds.w / 2f;
    int offsetFromLineBottom = (int) Math.Round(intercept + slope * (centerX - lineBounds.x));

    // The equation's result is relative to the bottom edge of the line box, so shift it to absolute coordinates.
    int lineBottom = lineBounds.y + lineBounds.h;
    return offsetFromLineBottom + lineBottom;
}
// TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
// private void CheckIfInstalled()
// {

View File

@ -7,12 +7,12 @@ using NAPS2.Ocr;
using NAPS2.Pdf.Pdfium;
using NAPS2.Scan;
using PdfSharpCore.Drawing;
using PdfSharpCore.Drawing.Layout;
using PdfSharpCore.Pdf;
using PdfSharpCore.Pdf.IO;
using PdfSharpCore.Pdf.Security;
using PdfDocument = PdfSharpCore.Pdf.PdfDocument;
using PdfPage = PdfSharpCore.Pdf.PdfPage;
using Alphabet = NAPS2.Pdf.PdfFontPicker.Alphabet;
namespace NAPS2.Pdf;
@ -402,16 +402,15 @@ public class PdfExporter
using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
var tf = new XTextFormatter(gfx);
foreach (var element in ocrResult.Elements)
foreach (var info in GetOcrTextToDraw(page, ocrResult, gfx))
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
if (info == null) continue;
var font = new XFont(info.FontFamily, info.FontSize, XFontStyle.Regular,
new XPdfFontOptions(PdfFontEncoding.Unicode));
gfx.DrawRectangle(new XPen(XColor.FromArgb(255, 0, 0)), info.Bounds);
tf.DrawString(info.Text, info.Font, XBrushes.Blue, info.Bounds);
gfx.DrawString(info.Text, font, XBrushes.Blue, info.X, info.Y, XStringFormats.BaseLineLeft);
tf.DrawString(info.Text, info.Font, XBrushes.Transparent, info.Bounds);
gfx.DrawString(info.Text, font, XBrushes.Transparent, info.X, info.Y, XStringFormats.BaseLineLeft);
@ -420,13 +419,9 @@ public class PdfExporter
Pdfium.PdfPage pdfiumPage, PdfiumFontSubsets fontSubsets, OcrResult ocrResult)
using XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend);
foreach (var element in ocrResult.Elements)
foreach (var info in GetOcrTextToDraw(page, ocrResult, gfx))
var info = GetTextDrawInfo(page, gfx, ocrResult, element);
if (info == null) continue;
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);
var textObj = pdfiumDocument.NewText(fontSubsets[fontName], info.FontSize);
var textObj = pdfiumDocument.NewText(fontSubsets[info.FontFamily], info.FontSize);
textObj.FillColor = (0, 0, 255, 255);
@ -435,34 +430,78 @@ public class PdfExporter
// This ends up being slightly different alignment than the PdfSharp-based text. Maybe at some point we can
// try to make them identical, although it's not perfect to begin with.
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - (info.Y + info.TextHeight));
textObj.Matrix = new PdfMatrix(1, 0, 0, 1, info.X, (float) page.Height - info.Y);
private static TextDrawInfo? GetTextDrawInfo(PdfPage page, XGraphics gfx, OcrResult ocrResult,
OcrResultElement element)
private static IEnumerable<TextDrawInfo> GetOcrTextToDraw(PdfPage page, OcrResult ocrResult, XGraphics gfx)
if (string.IsNullOrEmpty(element.Text)) return null;
double hAdjust = page.Width / ocrResult.PageBounds.w;
double vAdjust = page.Height / ocrResult.PageBounds.h;
foreach (var line in ocrResult.Lines)
var lineFontFamily = PdfFontPicker.GetBestFont(line.LanguageCode);
var lineFontSize = line.FontSize;
// Chinese/Japanese/Korean languages don't need font size alignment as words are generally just 1 char
if (!IsCjk(line.LanguageCode))
// Only measure words with at least 3 characters to avoid noise
var eligibleWords = line.Children.Where(word => word.Text.Length >= 3).ToList();
if (eligibleWords.Count > 1)
// In case Tesseract underestimated the font size, keep increasing it as long as all words are still
// within their bounds.
while (true)
var font = new XFont(lineFontFamily, lineFontSize + 1, XFontStyle.Regular);
if (eligibleWords.All(word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
for (int i = 0; i < line.Children.Count; i++)
var word = line.Children[i];
if (string.IsNullOrEmpty(word.Text)) continue;
var rightBound = i + 1 < line.Children.Count ? line.Children[i + 1].Bounds.x : -1;
var adjustedRightBound = rightBound * hAdjust;
var adjustedX = word.Bounds.x * hAdjust;
var adjustedY = word.Baseline * vAdjust;
// We make sure there's enough distance between this word and the next to fit a space (" "), so that
// when you Ctrl+A and Ctrl+C in a PDF file, the words don't blend together
var wordFontSize = ClampFontSizeByRightBound(word, lineFontSize, adjustedX, adjustedRightBound, gfx);
var adjustedBounds = AdjustBounds(element.Bounds, (float) page.Width / ocrResult.PageBounds.w,
(float) page.Height / ocrResult.PageBounds.h);
var adjustedFontSize = CalculateFontSize(element, adjustedBounds, gfx);
// Special case to avoid accidentally recognizing big lines as dashes/underscores
if (adjustedFontSize > 100 && (element.Text == "-" || element.Text == "_")) return null;
var font = new XFont(PdfFontPicker.GetBestFont(element.LanguageCode), adjustedFontSize, XFontStyle.Regular,
new XPdfFontOptions(PdfFontEncoding.Unicode));
var adjustedTextSize = gfx.MeasureString(element.Text, font);
var verticalOffset = (adjustedBounds.Height - adjustedTextSize.Height) / 2;
var horizontalOffset = (adjustedBounds.Width - adjustedTextSize.Width) / 2;
adjustedBounds.Offset((float) horizontalOffset, (float) verticalOffset);
if (wordFontSize > 100 && (word.Text == "-" || word.Text == "_")) continue;
return new TextDrawInfo(
element.RightToLeft ? ReverseText(element.Text) : element.Text,
yield return new TextDrawInfo(
word.RightToLeft ? ReverseText(word.Text) : word.Text,
(int) Math.Round(adjustedX),
(int) Math.Round(adjustedY));
private static bool IsCjk(string langCode)
var alphabet = PdfFontPicker.MapLanguageCodeToAlphabet(langCode);
return alphabet is
Alphabet.ChineseSimplified or
Alphabet.ChineseTraditional or
Alphabet.Japanese or
private static string ReverseText(string text)
@ -527,18 +566,34 @@ public class PdfExporter
return (realWidth, realHeight);
private static XRect AdjustBounds((int x, int y, int w, int h) bounds, float hAdjust, float vAdjust) =>
new XRect(bounds.x * hAdjust, bounds.y * vAdjust, bounds.w * hAdjust, bounds.h * vAdjust);
private static int CalculateFontSize(OcrResultElement element, XRect adjustedBounds, XGraphics gfx)
private static int ClampFontSizeByRightBound(OcrResultElement element, int initialFontSize, double x,
double rightBound,
XGraphics gfx)
int fontSizeGuess = Math.Max(1, (int) (adjustedBounds.Height));
var fontSize = initialFontSize;
if (IsCjk(element.LanguageCode))
// No word separators so no need to ensure space between words
return fontSize;
if (rightBound < 0)
// No word to the right
return fontSize;
var fontFamily = PdfFontPicker.GetBestFont(element.LanguageCode);
var measuredBoundsForGuess =
gfx.MeasureString(element.Text, new XFont(fontFamily, fontSizeGuess, XFontStyle.Regular));
double adjustmentFactor = adjustedBounds.Width / measuredBoundsForGuess.Width;
int adjustedFontSize = Math.Max(1, (int) Math.Floor(fontSizeGuess * adjustmentFactor));
return adjustedFontSize;
while (fontSize > 2)
var spaceWidth = gfx.MeasureString(" ", new XFont(fontFamily, fontSize, XFontStyle.Regular)).Width;
var measuredBounds =
gfx.MeasureString(element.Text, new XFont(fontFamily, fontSize, XFontStyle.Regular));
if (measuredBounds.Width + x <= rightBound - spaceWidth)
return fontSize;
private static bool IsPdfStorage(IImageStorage storage) => storage switch
@ -548,16 +603,7 @@ public class PdfExporter
_ => false
private record TextDrawInfo(string Text, XFont Font, XRect Bounds, XSize TextSize)
public int FontSize => (int) Font.Size;
public float X => (float) Bounds.X;
public float Y => (float) Bounds.Y;
public float Width => (float) Bounds.Width;
public float Height => (float) Bounds.Height;
public float TextWidth => (float) TextSize.Width;
public float TextHeight => (float) TextSize.Height;
// Describes one piece of OCR text to draw onto a PDF page: the text to emit (already reversed for right-to-left
// words), the font family and size to draw it with, and the position to draw at. X is the scaled left edge of
// the word and Y is its scaled baseline; the text is drawn with baseline-left alignment at that point.
private record TextDrawInfo(string Text, string FontFamily, int FontSize, int X, int Y);
private class PageExportState

View File

@ -14,7 +14,7 @@ internal class PdfiumFontSubsets : IDisposable
public PdfiumFontSubsets(PdfDocument pdfiumDocument, IEnumerable<OcrResult?> ocrResults)
var fontSubsetBuilders = new Dictionary<string, FontSubsetBuilder>();
foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Elements))
foreach (var element in ocrResults.WhereNotNull().SelectMany(result => result.Words))
// Map the OCR language to a font that supports its glyphs
var fontName = PdfFontPicker.GetBestFont(element.LanguageCode);