Redo parallel OCR with a pipeline in order to avoid having to worry about memory (TODO: solve how to keep OCR text under the image in the PDF)

This commit is contained in:
Ben Olden-Cooligan 2016-06-13 18:29:51 -04:00
parent fcc0399e23
commit 129ac8cf2f
7 changed files with 340 additions and 64 deletions

View File

@ -19,6 +19,7 @@
*/ */
using System; using System;
using System.Collections.Concurrent;
using System.Collections.Generic; using System.Collections.Generic;
using System.Drawing; using System.Drawing;
using System.IO; using System.IO;
@ -76,38 +77,13 @@ namespace NAPS2.ImportExport.Pdf
document.SecuritySettings.PermitPrint = settings.Encryption.AllowPrinting; document.SecuritySettings.PermitPrint = settings.Encryption.AllowPrinting;
} }
var imageList = images.ToList();
var pageList = imageList.Select(x => document.AddPage()).ToList();
double maxImageSizeMB = imageList.Select(x => x.Size / 1048576.0).Max(); bool useOcr = false;
double memoryLimitMB = Environment.Is64BitOperatingSystem ? 3000 : 1000;
int maxThreads = (int)Math.Floor(memoryLimitMB / (maxImageSizeMB * 4));
int progress = 0;
Parallel.For(0, imageList.Count, new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, (i, loop) =>
{
if (!progressCallback(progress))
{
loop.Stop();
return;
}
using (Stream stream = imageList[i].GetImageStream())
using (var img = new Bitmap(stream))
{
if (!progressCallback(progress))
{
loop.Stop();
return;
}
OcrResult ocrResult = null;
if (ocrLanguageCode != null) if (ocrLanguageCode != null)
{ {
if (ocrEngine.CanProcess(ocrLanguageCode)) if (ocrEngine.CanProcess(ocrLanguageCode))
{ {
ocrResult = ocrEngine.ProcessImage(img, ocrLanguageCode); useOcr = true;
} }
else else
{ {
@ -115,44 +91,105 @@ namespace NAPS2.ImportExport.Pdf
} }
} }
if (!progressCallback(progress)) if (useOcr)
{ {
loop.Stop(); BuildDocumentWithOcr(document, images, ocrLanguageCode);
return; }
else
{
BuildDocumentWithoutOcr(document, images);
} }
PathHelper.EnsureParentDirExists(path);
document.Save(path);
return true;
}
private void BuildDocumentWithoutOcr(PdfDocument document, IEnumerable<ScannedImage> images)
{
foreach (var image in images)
{
using (Stream stream = image.GetImageStream())
using (var img = new Bitmap(stream))
{
float hAdjust = 72 / img.HorizontalResolution; float hAdjust = 72 / img.HorizontalResolution;
float vAdjust = 72 / img.VerticalResolution; float vAdjust = 72 / img.VerticalResolution;
double realWidth = img.Width * hAdjust; double realWidth = img.Width * hAdjust;
double realHeight = img.Height * vAdjust; double realHeight = img.Height * vAdjust;
PdfPage page = document.AddPage();
page.Width = (int)realWidth;
page.Height = (int)realHeight;
using (XGraphics gfx = XGraphics.FromPdfPage(page))
{
gfx.DrawImage(img, 0, 0, (int)realWidth, (int)realHeight);
}
}
};
}
private void BuildDocumentWithOcr(PdfDocument document, IEnumerable<ScannedImage> images, string ocrLanguageCode)
{
Pipeline.For(images).Step(image =>
{
using (Stream stream = image.GetImageStream())
using (var img = new Bitmap(stream))
{
float hAdjust = 72 / img.HorizontalResolution;
float vAdjust = 72 / img.VerticalResolution;
double realWidth = img.Width * hAdjust;
double realHeight = img.Height * vAdjust;
PdfPage page;
lock (document) lock (document)
{ {
PdfPage newPage = pageList[i]; page = document.AddPage();
newPage.Width = (int)realWidth; page.Width = (int)realWidth;
newPage.Height = (int)realHeight; page.Height = (int)realHeight;
using (XGraphics gfx = XGraphics.FromPdfPage(newPage)) using (XGraphics gfx = XGraphics.FromPdfPage(page))
{ {
if (ocrResult != null) gfx.DrawImage(img, 0, 0, (int)realWidth, (int)realHeight);
}
}
string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
img.Save(tempImageFilePath);
return Tuple.Create(page, tempImageFilePath);
}
}).StepParallel((page, tempImageFilePath) =>
{
OcrResult ocrResult;
try
{
ocrResult = ocrEngine.ProcessImage(tempImageFilePath, ocrLanguageCode);
}
finally
{
File.Delete(tempImageFilePath);
}
return Tuple.Create(page, ocrResult);
}).Run((page, ocrResult) =>
{
if (ocrResult == null)
{
return;
}
lock (document)
{
using (XGraphics gfx = XGraphics.FromPdfPage(page))
{ {
var tf = new XTextFormatter(gfx); var tf = new XTextFormatter(gfx);
foreach (var element in ocrResult.Elements) foreach (var element in ocrResult.Elements)
{ {
var adjustedBounds = AdjustBounds(element.Bounds, hAdjust, vAdjust); var adjustedBounds = AdjustBounds(element.Bounds, (float)page.Width / ocrResult.PageBounds.Width, (float)page.Height / ocrResult.PageBounds.Height);
var adjustedFontSize = CalculateFontSize(element.Text, adjustedBounds, gfx); var adjustedFontSize = CalculateFontSize(element.Text, adjustedBounds, gfx);
var font = new XFont("Times New Roman", adjustedFontSize, XFontStyle.Regular, var font = new XFont("Times New Roman", adjustedFontSize, XFontStyle.Regular,
new XPdfFontOptions(PdfFontEncoding.Unicode)); new XPdfFontOptions(PdfFontEncoding.Unicode));
tf.DrawString(element.Text, font, XBrushes.Transparent, adjustedBounds); tf.DrawString(element.Text, font, XBrushes.Transparent, adjustedBounds);
} }
} }
gfx.DrawImage(img, 0, 0, (int)realWidth, (int)realHeight);
}
}
Interlocked.Increment(ref progress);
} }
}); });
PathHelper.EnsureParentDirExists(path);
document.Save(path);
return true;
} }
private static RectangleF AdjustBounds(Rectangle b, float hAdjust, float vAdjust) private static RectangleF AdjustBounds(Rectangle b, float hAdjust, float vAdjust)

View File

@ -132,6 +132,7 @@
<Compile Include="Scan\Twain\Legacy\TwainLib.cs" /> <Compile Include="Scan\Twain\Legacy\TwainLib.cs" />
<Compile Include="Scan\Twain\TwainWrapper.cs" /> <Compile Include="Scan\Twain\TwainWrapper.cs" />
<Compile Include="Util\ChangeTracker.cs" /> <Compile Include="Util\ChangeTracker.cs" />
<Compile Include="Util\ChaosMonkey.cs" />
<Compile Include="Util\CollectionExtensions.cs" /> <Compile Include="Util\CollectionExtensions.cs" />
<Compile Include="Config\AppConfig.cs" /> <Compile Include="Config\AppConfig.cs" />
<Compile Include="Config\AppConfigManager.cs" /> <Compile Include="Config\AppConfigManager.cs" />
@ -262,6 +263,7 @@
<Compile Include="Operation\OperationErrorEventArgs.cs" /> <Compile Include="Operation\OperationErrorEventArgs.cs" />
<Compile Include="Operation\OperationStatus.cs" /> <Compile Include="Operation\OperationStatus.cs" />
<Compile Include="Util\PathHelper.cs" /> <Compile Include="Util\PathHelper.cs" />
<Compile Include="Util\Pipeline.cs" />
<Compile Include="Util\Pipes.cs" /> <Compile Include="Util\Pipes.cs" />
<Compile Include="Host\ProcessJob.cs" /> <Compile Include="Host\ProcessJob.cs" />
<Compile Include="ImportExport\DirectImageTransfer.cs" /> <Compile Include="ImportExport\DirectImageTransfer.cs" />

View File

@ -8,6 +8,6 @@ namespace NAPS2.Ocr
public interface IOcrEngine public interface IOcrEngine
{ {
bool CanProcess(string langCode); bool CanProcess(string langCode);
OcrResult ProcessImage(Image image, string langCode); OcrResult ProcessImage(string imagePath, string langCode);
} }
} }

View File

@ -1,11 +1,14 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Drawing;
using System.Linq; using System.Linq;
namespace NAPS2.Ocr namespace NAPS2.Ocr
{ {
public class OcrResult public class OcrResult
{ {
public Rectangle PageBounds { get; set; }
public IEnumerable<OcrResultElement> Elements { get; set; } public IEnumerable<OcrResultElement> Elements { get; set; }
} }
} }

View File

@ -35,20 +35,18 @@ namespace NAPS2.Ocr
return langCode.Split('+').All(code => availableLanguages.Any(x => x.Code == code)); return langCode.Split('+').All(code => availableLanguages.Any(x => x.Code == code));
} }
public OcrResult ProcessImage(Image image, string langCode) public OcrResult ProcessImage(string imagePath, string langCode)
{ {
bool newTesseract = ocrDependencyManager.IsNewExecutableDownloaded; bool newTesseract = ocrDependencyManager.IsNewExecutableDownloaded;
string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName()); string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
string tempHocrFilePathWithExt = tempHocrFilePath + (newTesseract ? ".hocr" : ".html"); string tempHocrFilePathWithExt = tempHocrFilePath + (newTesseract ? ".hocr" : ".html");
try try
{ {
image.Save(tempImageFilePath);
var exeDir = newTesseract ? ocrDependencyManager.GetExecutableDir() : ocrDependencyManager.GetOldExecutableDir(); var exeDir = newTesseract ? ocrDependencyManager.GetExecutableDir() : ocrDependencyManager.GetOldExecutableDir();
var startInfo = new ProcessStartInfo var startInfo = new ProcessStartInfo
{ {
FileName = Path.Combine(exeDir.FullName, "tesseract.exe"), FileName = Path.Combine(exeDir.FullName, "tesseract.exe"),
Arguments = string.Format("\"{0}\" \"{1}\" -l {2} hocr", tempImageFilePath, tempHocrFilePath, langCode), Arguments = string.Format("\"{0}\" \"{1}\" -l {2} hocr", imagePath, tempHocrFilePath, langCode),
UseShellExecute = false, UseShellExecute = false,
CreateNoWindow = true, CreateNoWindow = true,
RedirectStandardOutput = true, RedirectStandardOutput = true,
@ -103,6 +101,10 @@ namespace NAPS2.Ocr
XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt); XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
return new OcrResult return new OcrResult
{ {
PageBounds = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocr_page"))
.Select(x => GetBounds(x.Attribute("title")))
.First(),
Elements = hocrDocument.Descendants() Elements = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word")) .Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
.Select(x => new OcrResultElement { Text = x.Value, Bounds = GetBounds(x.Attribute("title")) }) .Select(x => new OcrResultElement { Text = x.Value, Bounds = GetBounds(x.Attribute("title")) })
@ -117,7 +119,6 @@ namespace NAPS2.Ocr
{ {
try try
{ {
File.Delete(tempImageFilePath);
File.Delete(tempHocrFilePathWithExt); File.Delete(tempHocrFilePathWithExt);
} }
catch (Exception e) catch (Exception e)

View File

@ -0,0 +1,33 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading;
namespace NAPS2.Util
{
/// <summary>
/// Fault-injection helpers for exercising error handling and timing-sensitive code.
/// Both public methods are marked <c>[Conditional("DEBUG")]</c>, so calls to them are
/// compiled out of builds that do not define DEBUG.
/// </summary>
internal static class ChaosMonkey
{
    // System.Random instances are not thread-safe, and these helpers are intended to be
    // called from code that may run on several threads at once, so every draw is serialized.
    private static readonly object RandomLock = new object();

    private static readonly Lazy<Random> random = new Lazy<Random>();

    /// <summary>
    /// Throws an exception with probability <paramref name="chance"/>.
    /// </summary>
    /// <param name="chance">Probability of throwing; 0 or less never throws, 1 or more always throws.</param>
    /// <exception cref="Exception">Thrown at random to simulate a failure.</exception>
    [Conditional("DEBUG")]
    public static void MaybeError(double chance)
    {
        if (NextDouble() < chance)
        {
            // Deliberately the base Exception type: this stands in for an arbitrary, unexpected failure.
            throw new Exception("Randomly generated exception for testing");
        }
    }

    /// <summary>
    /// Blocks the calling thread with probability <paramref name="chance"/>.
    /// </summary>
    /// <param name="chance">Probability of delaying; 0 or less never delays, 1 or more always delays.</param>
    /// <param name="durationInSeconds">Nominal length of the delay, in seconds.</param>
    /// <param name="variationInSeconds">
    /// Maximum random deviation from <paramref name="durationInSeconds"/>, in seconds, applied in either direction.
    /// </param>
    [Conditional("DEBUG")]
    public static void MaybeDelay(double chance, double durationInSeconds, double variationInSeconds = 0)
    {
        if (NextDouble() < chance)
        {
            double duration = durationInSeconds + variationInSeconds * (NextDouble() * 2 - 1);
            // The variation can push the duration below zero. Thread.Sleep rejects negative
            // timeouts (and treats exactly -1 ms as "wait forever"), so clamp to zero.
            Thread.Sleep(TimeSpan.FromSeconds(Math.Max(0.0, duration)));
        }
    }

    /// <summary>
    /// Draws the next value in [0, 1) from the shared generator under a lock.
    /// </summary>
    private static double NextDouble()
    {
        lock (RandomLock)
        {
            return random.Value.NextDouble();
        }
    }
}
}

200
NAPS2.Core/Util/Pipeline.cs Normal file
View File

@ -0,0 +1,200 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace NAPS2.Util
{
public static class Pipeline
{
// TODO: Need to add cancellation logic to avoid a possible deadlock
// see: https://msdn.microsoft.com/en-us/library/ff963548.aspx
// TODO: Test exception handling
private static readonly TaskFactory TaskFactory =
new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);
/// <summary>
/// Starts a pipeline definition that takes its items from <paramref name="input"/>.
/// The sequence is only stored here; it is not enumerated by this method.
/// </summary>
/// <typeparam name="T">The type of the items entering the pipeline.</typeparam>
/// <param name="input">The items to feed into the first step.</param>
/// <returns>A syntax object on which further steps can be chained.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
public static IPipelineSyntax<T> For<T>(IEnumerable<T> input)
{
    // Fail at the call site rather than with a NullReferenceException once the pipeline is running.
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }
    return new PipelineSource<T>(input);
}
/// <summary>
/// Base class for every pipeline stage. Implements the fluent members of
/// <see cref="IPipelineSyntax{T}"/>; concrete stages only supply <see cref="GetOutput"/>.
/// </summary>
/// <typeparam name="T">The type of the items this stage produces.</typeparam>
public abstract class PipelineBase<T> : IPipelineSyntax<T>
{
    /// <summary>
    /// Appends a sequential step that consumes this stage's output.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="pipelineStepFunc"/> is null.</exception>
    public IPipelineSyntax<T2> Step<T2>(Func<T, T2> pipelineStepFunc)
    {
        // Reject null here; otherwise the failure would only surface on a background task during Run.
        if (pipelineStepFunc == null)
        {
            throw new ArgumentNullException("pipelineStepFunc");
        }
        return new PipelineStep<T, T2>(this, pipelineStepFunc);
    }

    /// <summary>
    /// Appends a parallel step that consumes this stage's output.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="pipelineStepFunc"/> is null.</exception>
    public IPipelineSyntax<T2> StepParallel<T2>(Func<T, T2> pipelineStepFunc)
    {
        if (pipelineStepFunc == null)
        {
            throw new ArgumentNullException("pipelineStepFunc");
        }
        return new PipelineParallelStep<T, T2>(this, pipelineStepFunc);
    }

    /// <summary>
    /// Runs the pipeline and returns everything the last stage produced.
    /// </summary>
    /// <returns>The output items of the final stage.</returns>
    public List<T> Run()
    {
        var taskList = new List<Task>();
        var result = GetOutput(taskList);
        // All stage tasks are waited on before the output is read, so a failure in any
        // stage is raised by WaitAll before a result list is built.
        Task.WaitAll(taskList.ToArray());
        return result.ToList();
    }

    /// <summary>
    /// Runs the pipeline, invoking <paramref name="pipelineFinishAction"/> on the calling
    /// thread for each item as the last stage produces it.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="pipelineFinishAction"/> is null.</exception>
    public void Run(Action<T> pipelineFinishAction)
    {
        // Validate before GetOutput is called so that no work is started for an unusable action.
        if (pipelineFinishAction == null)
        {
            throw new ArgumentNullException("pipelineFinishAction");
        }
        var taskList = new List<Task>();
        foreach (var item in GetOutput(taskList))
        {
            pipelineFinishAction(item);
        }
        // NOTE(review): if pipelineFinishAction throws, control leaves before this wait, so the
        // stage tasks are neither awaited nor cancelled.
        Task.WaitAll(taskList.ToArray());
    }

    /// <summary>
    /// Produces this stage's output sequence, adding any task started to produce it to
    /// <paramref name="taskList"/> so the caller can wait on it.
    /// </summary>
    /// <param name="taskList">Collects the tasks started by this stage and the stages before it.</param>
    /// <returns>The items produced by this stage.</returns>
    public abstract IEnumerable<T> GetOutput(List<Task> taskList);
}
/// <summary>
/// The first stage of a pipeline: hands the caller-supplied sequence to the next stage unchanged.
/// </summary>
/// <typeparam name="T">The type of the items entering the pipeline.</typeparam>
public class PipelineSource<T> : PipelineBase<T>
{
    private readonly IEnumerable<T> items;

    /// <summary>
    /// Creates a source stage over <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The sequence the pipeline will read from.</param>
    public PipelineSource(IEnumerable<T> input)
    {
        this.items = input;
    }

    /// <summary>
    /// Returns the wrapped sequence as-is. No task is started, so
    /// <paramref name="taskList"/> is left untouched.
    /// </summary>
    public override IEnumerable<T> GetOutput(List<Task> taskList)
    {
        return this.items;
    }
}
/// <summary>
/// A sequential pipeline stage: a single task reads the previous stage's output one
/// item at a time, applies the step function, and queues each result for the next stage.
/// </summary>
/// <typeparam name="T1">The type of the items read from the previous stage.</typeparam>
/// <typeparam name="T2">The type of the items this stage produces.</typeparam>
public class PipelineStep<T1, T2> : PipelineBase<T2>
{
    private readonly PipelineBase<T1> upstream;
    private readonly Func<T1, T2> transform;

    /// <summary>
    /// Creates a stage that applies <paramref name="func"/> to the output of <paramref name="previous"/>.
    /// </summary>
    public PipelineStep(PipelineBase<T1> previous, Func<T1, T2> func)
    {
        this.upstream = previous;
        this.transform = func;
    }

    /// <summary>
    /// Starts the worker task for this stage (after the earlier stages have been started),
    /// registers it in <paramref name="taskList"/>, and returns a consuming view of its output queue.
    /// </summary>
    public override IEnumerable<T2> GetOutput(List<Task> taskList)
    {
        var buffer = new BlockingCollection<T2>();
        var source = this.upstream.GetOutput(taskList);
        Task worker = TaskFactory.StartNew(() => Pump(source, buffer));
        taskList.Add(worker);
        return buffer.GetConsumingEnumerable();
    }

    /// <summary>
    /// Transforms every item from <paramref name="source"/> into <paramref name="buffer"/>.
    /// </summary>
    private void Pump(IEnumerable<T1> source, BlockingCollection<T2> buffer)
    {
        try
        {
            foreach (T1 item in source)
            {
                T2 result = this.transform(item);
                buffer.Add(result);
            }
        }
        finally
        {
            // Always mark the queue complete, even on failure, so the consumer's enumeration ends.
            buffer.CompleteAdding();
        }
    }
}
/// <summary>
/// A parallel pipeline stage: one coordinating task runs <c>Parallel.ForEach</c> over the
/// previous stage's output, so several items may be processed at once. Results are queued
/// in the order they finish, which need not match the input order.
/// </summary>
/// <typeparam name="T1">The type of the items read from the previous stage.</typeparam>
/// <typeparam name="T2">The type of the items this stage produces.</typeparam>
public class PipelineParallelStep<T1, T2> : PipelineBase<T2>
{
    private readonly PipelineBase<T1> upstream;
    private readonly Func<T1, T2> transform;

    /// <summary>
    /// Creates a stage that applies <paramref name="func"/> to the output of <paramref name="previous"/>.
    /// </summary>
    public PipelineParallelStep(PipelineBase<T1> previous, Func<T1, T2> func)
    {
        this.upstream = previous;
        this.transform = func;
    }

    /// <summary>
    /// Starts the coordinating task for this stage (after the earlier stages have been started),
    /// registers it in <paramref name="taskList"/>, and returns a consuming view of its output queue.
    /// </summary>
    public override IEnumerable<T2> GetOutput(List<Task> taskList)
    {
        var buffer = new BlockingCollection<T2>();
        var source = this.upstream.GetOutput(taskList);
        Task coordinator = TaskFactory.StartNew(() => PumpInParallel(source, buffer));
        taskList.Add(coordinator);
        return buffer.GetConsumingEnumerable();
    }

    /// <summary>
    /// Transforms the items of <paramref name="source"/> concurrently into <paramref name="buffer"/>.
    /// </summary>
    private void PumpInParallel(IEnumerable<T1> source, BlockingCollection<T2> buffer)
    {
        try
        {
            // NOTE(review): Parallel.ForEach over a plain IEnumerable presumably uses the default
            // chunking partitioner, which may hold back items read from a blocking upstream queue
            // - confirm whether a non-buffering partitioner is available on the target framework.
            Parallel.ForEach(source, item =>
            {
                T2 result = this.transform(item);
                buffer.Add(result);
            });
        }
        finally
        {
            // Always mark the queue complete, even on failure, so the consumer's enumeration ends.
            buffer.CompleteAdding();
        }
    }
}
/// <summary>
/// Fluent syntax for defining and running a pipeline whose current output items are of type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">The type of the items produced by the steps defined so far.</typeparam>
public interface IPipelineSyntax<T>
{
/// <summary>
/// Adds a new step to the pipeline.
/// </summary>
/// <typeparam name="T2">The type of the items the new step produces.</typeparam>
/// <param name="pipelineStepFunc">The function applied to each item coming from the previous step.</param>
/// <returns>A syntax object for the pipeline including the new step.</returns>
IPipelineSyntax<T2> Step<T2>(Func<T, T2> pipelineStepFunc);
/// <summary>
/// Adds a new step to the pipeline, where multiple items can be processed at once. Note: order will not be maintained!
/// </summary>
/// <typeparam name="T2">The type of the items the new step produces.</typeparam>
/// <param name="pipelineStepFunc">The function applied to each item coming from the previous step; it may be invoked for several items concurrently.</param>
/// <returns>A syntax object for the pipeline including the new step.</returns>
IPipelineSyntax<T2> StepParallel<T2>(Func<T, T2> pipelineStepFunc);
/// <summary>
/// Runs the pipeline with the previously defined steps, returning the result. Blocks until the pipeline is finished.
/// </summary>
/// <returns>A list of the items produced by the last step.</returns>
List<T> Run();
/// <summary>
/// Runs the pipeline with the previously defined steps, performing the specified action on each item in the result. Blocks until the pipeline is finished.
/// </summary>
/// <param name="pipelineFinishAction">The action performed on each item produced by the last step.</param>
void Run(Action<T> pipelineFinishAction);
}
#region Extensions for Tuples
/// <summary>
/// Adds a new step to a pipeline of two-element tuples, passing the tuple's items to the
/// step function as two separate arguments.
/// </summary>
/// <param name="syntax">The pipeline whose current items are two-element tuples.</param>
/// <param name="pipelineStepFunc">The function applied to the two items of each tuple.</param>
/// <returns>A syntax object for the pipeline including the new step.</returns>
public static IPipelineSyntax<T2> Step<TIn1, TIn2, T2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Func<TIn1, TIn2, T2> pipelineStepFunc)
{
    // Adapt the two-argument function to the single-argument form the interface method expects.
    Func<Tuple<TIn1, TIn2>, T2> unpacked = pair => pipelineStepFunc(pair.Item1, pair.Item2);
    return syntax.Step(unpacked);
}
/// <summary>
/// Adds a new parallel step to a pipeline of two-element tuples, passing the tuple's items
/// to the step function as two separate arguments. Multiple items can be processed at once.
/// Note: order will not be maintained!
/// </summary>
/// <param name="syntax">The pipeline whose current items are two-element tuples.</param>
/// <param name="pipelineStepFunc">The function applied to the two items of each tuple.</param>
/// <returns>A syntax object for the pipeline including the new step.</returns>
public static IPipelineSyntax<T2> StepParallel<TIn1, TIn2, T2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Func<TIn1, TIn2, T2> pipelineStepFunc)
{
    // Adapt the two-argument function to the single-argument form the interface method expects.
    Func<Tuple<TIn1, TIn2>, T2> unpacked = pair => pipelineStepFunc(pair.Item1, pair.Item2);
    return syntax.StepParallel(unpacked);
}
/// <summary>
/// Runs a pipeline of two-element tuples with the previously defined steps, passing the
/// items of each resulting tuple to the action as two separate arguments.
/// Blocks until the pipeline is finished.
/// </summary>
/// <param name="syntax">The pipeline whose final items are two-element tuples.</param>
/// <param name="pipelineFinishAction">The action performed on the two items of each resulting tuple.</param>
public static void Run<TIn1, TIn2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Action<TIn1, TIn2> pipelineFinishAction)
{
    // Adapt the two-argument action to the single-argument form the interface method expects.
    Action<Tuple<TIn1, TIn2>> unpacked = pair => pipelineFinishAction(pair.Item1, pair.Item2);
    syntax.Run(unpacked);
}
#endregion
}
}