mirror of
https://github.com/cyanfish/naps2.git
synced 2024-10-04 11:27:08 +03:00
Redo parallel OCR with a pipeline in order to avoid having to worry about memory (TODO: solve how to keep OCR text under the image in the PDF)
This commit is contained in:
parent
fcc0399e23
commit
129ac8cf2f
@ -19,6 +19,7 @@
|
||||
*/
|
||||
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Drawing;
|
||||
using System.IO;
|
||||
@ -76,83 +77,119 @@ namespace NAPS2.ImportExport.Pdf
|
||||
document.SecuritySettings.PermitPrint = settings.Encryption.AllowPrinting;
|
||||
}
|
||||
|
||||
var imageList = images.ToList();
|
||||
var pageList = imageList.Select(x => document.AddPage()).ToList();
|
||||
|
||||
double maxImageSizeMB = imageList.Select(x => x.Size / 1048576.0).Max();
|
||||
double memoryLimitMB = Environment.Is64BitOperatingSystem ? 3000 : 1000;
|
||||
int maxThreads = (int)Math.Floor(memoryLimitMB / (maxImageSizeMB * 4));
|
||||
|
||||
int progress = 0;
|
||||
|
||||
Parallel.For(0, imageList.Count, new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, (i, loop) =>
|
||||
bool useOcr = false;
|
||||
if (ocrLanguageCode != null)
|
||||
{
|
||||
if (!progressCallback(progress))
|
||||
if (ocrEngine.CanProcess(ocrLanguageCode))
|
||||
{
|
||||
loop.Stop();
|
||||
return;
|
||||
useOcr = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
Log.Error("OCR files not available for '{0}'.", ocrLanguageCode);
|
||||
}
|
||||
}
|
||||
|
||||
using (Stream stream = imageList[i].GetImageStream())
|
||||
if (useOcr)
|
||||
{
|
||||
BuildDocumentWithOcr(document, images, ocrLanguageCode);
|
||||
}
|
||||
else
|
||||
{
|
||||
BuildDocumentWithoutOcr(document, images);
|
||||
}
|
||||
|
||||
PathHelper.EnsureParentDirExists(path);
|
||||
document.Save(path);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void BuildDocumentWithoutOcr(PdfDocument document, IEnumerable<ScannedImage> images)
|
||||
{
|
||||
foreach (var image in images)
|
||||
{
|
||||
using (Stream stream = image.GetImageStream())
|
||||
using (var img = new Bitmap(stream))
|
||||
{
|
||||
if (!progressCallback(progress))
|
||||
{
|
||||
loop.Stop();
|
||||
return;
|
||||
}
|
||||
|
||||
OcrResult ocrResult = null;
|
||||
if (ocrLanguageCode != null)
|
||||
{
|
||||
if (ocrEngine.CanProcess(ocrLanguageCode))
|
||||
{
|
||||
ocrResult = ocrEngine.ProcessImage(img, ocrLanguageCode);
|
||||
}
|
||||
else
|
||||
{
|
||||
Log.Error("OCR files not available for '{0}'.", ocrLanguageCode);
|
||||
}
|
||||
}
|
||||
|
||||
if (!progressCallback(progress))
|
||||
{
|
||||
loop.Stop();
|
||||
return;
|
||||
}
|
||||
|
||||
float hAdjust = 72 / img.HorizontalResolution;
|
||||
float vAdjust = 72 / img.VerticalResolution;
|
||||
double realWidth = img.Width * hAdjust;
|
||||
double realHeight = img.Height * vAdjust;
|
||||
PdfPage page = document.AddPage();
|
||||
page.Width = (int)realWidth;
|
||||
page.Height = (int)realHeight;
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(page))
|
||||
{
|
||||
gfx.DrawImage(img, 0, 0, (int)realWidth, (int)realHeight);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private void BuildDocumentWithOcr(PdfDocument document, IEnumerable<ScannedImage> images, string ocrLanguageCode)
|
||||
{
|
||||
Pipeline.For(images).Step(image =>
|
||||
{
|
||||
using (Stream stream = image.GetImageStream())
|
||||
using (var img = new Bitmap(stream))
|
||||
{
|
||||
float hAdjust = 72 / img.HorizontalResolution;
|
||||
float vAdjust = 72 / img.VerticalResolution;
|
||||
double realWidth = img.Width * hAdjust;
|
||||
double realHeight = img.Height * vAdjust;
|
||||
PdfPage page;
|
||||
lock (document)
|
||||
{
|
||||
PdfPage newPage = pageList[i];
|
||||
newPage.Width = (int)realWidth;
|
||||
newPage.Height = (int)realHeight;
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(newPage))
|
||||
page = document.AddPage();
|
||||
page.Width = (int)realWidth;
|
||||
page.Height = (int)realHeight;
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(page))
|
||||
{
|
||||
if (ocrResult != null)
|
||||
{
|
||||
var tf = new XTextFormatter(gfx);
|
||||
foreach (var element in ocrResult.Elements)
|
||||
{
|
||||
var adjustedBounds = AdjustBounds(element.Bounds, hAdjust, vAdjust);
|
||||
var adjustedFontSize = CalculateFontSize(element.Text, adjustedBounds, gfx);
|
||||
var font = new XFont("Times New Roman", adjustedFontSize, XFontStyle.Regular,
|
||||
new XPdfFontOptions(PdfFontEncoding.Unicode));
|
||||
tf.DrawString(element.Text, font, XBrushes.Transparent, adjustedBounds);
|
||||
}
|
||||
}
|
||||
gfx.DrawImage(img, 0, 0, (int)realWidth, (int)realHeight);
|
||||
}
|
||||
}
|
||||
Interlocked.Increment(ref progress);
|
||||
|
||||
string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
|
||||
img.Save(tempImageFilePath);
|
||||
|
||||
return Tuple.Create(page, tempImageFilePath);
|
||||
}
|
||||
}).StepParallel((page, tempImageFilePath) =>
|
||||
{
|
||||
OcrResult ocrResult;
|
||||
try
|
||||
{
|
||||
ocrResult = ocrEngine.ProcessImage(tempImageFilePath, ocrLanguageCode);
|
||||
}
|
||||
finally
|
||||
{
|
||||
File.Delete(tempImageFilePath);
|
||||
}
|
||||
|
||||
return Tuple.Create(page, ocrResult);
|
||||
}).Run((page, ocrResult) =>
|
||||
{
|
||||
if (ocrResult == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
lock (document)
|
||||
{
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(page))
|
||||
{
|
||||
var tf = new XTextFormatter(gfx);
|
||||
foreach (var element in ocrResult.Elements)
|
||||
{
|
||||
var adjustedBounds = AdjustBounds(element.Bounds, (float)page.Width / ocrResult.PageBounds.Width, (float)page.Height / ocrResult.PageBounds.Height);
|
||||
var adjustedFontSize = CalculateFontSize(element.Text, adjustedBounds, gfx);
|
||||
var font = new XFont("Times New Roman", adjustedFontSize, XFontStyle.Regular,
|
||||
new XPdfFontOptions(PdfFontEncoding.Unicode));
|
||||
tf.DrawString(element.Text, font, XBrushes.Transparent, adjustedBounds);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
PathHelper.EnsureParentDirExists(path);
|
||||
document.Save(path);
|
||||
return true;
|
||||
}
|
||||
|
||||
private static RectangleF AdjustBounds(Rectangle b, float hAdjust, float vAdjust)
|
||||
|
@ -132,6 +132,7 @@
|
||||
<Compile Include="Scan\Twain\Legacy\TwainLib.cs" />
|
||||
<Compile Include="Scan\Twain\TwainWrapper.cs" />
|
||||
<Compile Include="Util\ChangeTracker.cs" />
|
||||
<Compile Include="Util\ChaosMonkey.cs" />
|
||||
<Compile Include="Util\CollectionExtensions.cs" />
|
||||
<Compile Include="Config\AppConfig.cs" />
|
||||
<Compile Include="Config\AppConfigManager.cs" />
|
||||
@ -262,6 +263,7 @@
|
||||
<Compile Include="Operation\OperationErrorEventArgs.cs" />
|
||||
<Compile Include="Operation\OperationStatus.cs" />
|
||||
<Compile Include="Util\PathHelper.cs" />
|
||||
<Compile Include="Util\Pipeline.cs" />
|
||||
<Compile Include="Util\Pipes.cs" />
|
||||
<Compile Include="Host\ProcessJob.cs" />
|
||||
<Compile Include="ImportExport\DirectImageTransfer.cs" />
|
||||
|
@ -8,6 +8,6 @@ namespace NAPS2.Ocr
|
||||
public interface IOcrEngine
|
||||
{
|
||||
bool CanProcess(string langCode);
|
||||
OcrResult ProcessImage(Image image, string langCode);
|
||||
OcrResult ProcessImage(string imagePath, string langCode);
|
||||
}
|
||||
}
|
@ -1,11 +1,14 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Drawing;
|
||||
using System.Linq;
|
||||
|
||||
namespace NAPS2.Ocr
|
||||
{
|
||||
/// <summary>
/// The outcome of running OCR on a single page image.
/// </summary>
public class OcrResult
{
    /// <summary>
    /// Bounds of the whole page in the OCR engine's coordinate space. The engine fills this
    /// from the hOCR "ocr_page" element, and the PDF exporter divides the PDF page size by it
    /// to scale each element's bounds onto the page.
    /// </summary>
    public Rectangle PageBounds { get; set; }

    /// <summary>
    /// The recognized text elements (one per hOCR "ocrx_word" element), each carrying its
    /// text and its bounds in the same coordinate space as <see cref="PageBounds"/>.
    /// </summary>
    public IEnumerable<OcrResultElement> Elements { get; set; }
}
|
||||
}
|
||||
|
@ -35,20 +35,18 @@ namespace NAPS2.Ocr
|
||||
return langCode.Split('+').All(code => availableLanguages.Any(x => x.Code == code));
|
||||
}
|
||||
|
||||
public OcrResult ProcessImage(Image image, string langCode)
|
||||
public OcrResult ProcessImage(string imagePath, string langCode)
|
||||
{
|
||||
bool newTesseract = ocrDependencyManager.IsNewExecutableDownloaded;
|
||||
string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
|
||||
string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
|
||||
string tempHocrFilePathWithExt = tempHocrFilePath + (newTesseract ? ".hocr" : ".html");
|
||||
try
|
||||
{
|
||||
image.Save(tempImageFilePath);
|
||||
var exeDir = newTesseract ? ocrDependencyManager.GetExecutableDir() : ocrDependencyManager.GetOldExecutableDir();
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = Path.Combine(exeDir.FullName, "tesseract.exe"),
|
||||
Arguments = string.Format("\"{0}\" \"{1}\" -l {2} hocr", tempImageFilePath, tempHocrFilePath, langCode),
|
||||
Arguments = string.Format("\"{0}\" \"{1}\" -l {2} hocr", imagePath, tempHocrFilePath, langCode),
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true,
|
||||
RedirectStandardOutput = true,
|
||||
@ -103,6 +101,10 @@ namespace NAPS2.Ocr
|
||||
XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
|
||||
return new OcrResult
|
||||
{
|
||||
PageBounds = hocrDocument.Descendants()
|
||||
.Where(x => x.Attributes("class").Any(y => y.Value == "ocr_page"))
|
||||
.Select(x => GetBounds(x.Attribute("title")))
|
||||
.First(),
|
||||
Elements = hocrDocument.Descendants()
|
||||
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
|
||||
.Select(x => new OcrResultElement { Text = x.Value, Bounds = GetBounds(x.Attribute("title")) })
|
||||
@ -117,7 +119,6 @@ namespace NAPS2.Ocr
|
||||
{
|
||||
try
|
||||
{
|
||||
File.Delete(tempImageFilePath);
|
||||
File.Delete(tempHocrFilePathWithExt);
|
||||
}
|
||||
catch (Exception e)
|
||||
|
33
NAPS2.Core/Util/ChaosMonkey.cs
Normal file
33
NAPS2.Core/Util/ChaosMonkey.cs
Normal file
@ -0,0 +1,33 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
||||
namespace NAPS2.Util
|
||||
{
|
||||
/// <summary>
/// Fault-injection helpers for testing error handling and timing-sensitive code.
/// Both entry points are marked <c>[Conditional("DEBUG")]</c>, so calls to them are
/// removed by the compiler unless the calling code is built with DEBUG defined.
/// </summary>
internal static class ChaosMonkey
{
    // System.Random is not thread-safe, and these helpers are meant to be sprinkled into
    // code that runs on several threads at once. All access goes through RngLock.
    private static readonly Random Rng = new Random();
    private static readonly object RngLock = new object();

    /// <summary>
    /// Throws an exception with the given probability.
    /// </summary>
    /// <param name="chance">Probability of throwing; 0 never throws, 1 always throws.</param>
    /// <exception cref="Exception">Thrown at random, according to <paramref name="chance"/>.</exception>
    [Conditional("DEBUG")]
    public static void MaybeError(double chance)
    {
        if (NextDouble() < chance)
        {
            throw new Exception("Randomly generated exception for testing");
        }
    }

    /// <summary>
    /// Blocks the calling thread with the given probability.
    /// </summary>
    /// <param name="chance">Probability of delaying; 0 never delays, 1 always delays.</param>
    /// <param name="durationInSeconds">Base length of the delay, in seconds.</param>
    /// <param name="variationInSeconds">
    /// Maximum random deviation (plus or minus) applied to <paramref name="durationInSeconds"/>.
    /// </param>
    [Conditional("DEBUG")]
    public static void MaybeDelay(double chance, double durationInSeconds, double variationInSeconds = 0)
    {
        if (NextDouble() < chance)
        {
            double duration = durationInSeconds + variationInSeconds * (NextDouble() * 2 - 1);
            // The variation can push the duration below zero. Thread.Sleep rejects negative
            // timeouts (and treats exactly -1 ms as "wait forever"), so clamp to zero.
            Thread.Sleep(TimeSpan.FromSeconds(Math.Max(0, duration)));
        }
    }

    /// <summary>
    /// Draws the next value in [0, 1) from the shared generator under a lock.
    /// </summary>
    private static double NextDouble()
    {
        lock (RngLock)
        {
            return Rng.NextDouble();
        }
    }
}
|
||||
}
|
200
NAPS2.Core/Util/Pipeline.cs
Normal file
200
NAPS2.Core/Util/Pipeline.cs
Normal file
@ -0,0 +1,200 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace NAPS2.Util
|
||||
{
|
||||
/// <summary>
/// Fluent builder for simple multi-stage pipelines. Each stage added with <c>Step</c> or
/// <c>StepParallel</c> runs on its own background task and hands its results to the next
/// stage through a <see cref="BlockingCollection{T}"/>; <c>Run</c> drains the last stage on
/// the calling thread and then waits for every stage task to finish.
/// </summary>
public static class Pipeline
{
    // TODO: Need to add cancellation logic to avoid a possible deadlock
    // see: https://msdn.microsoft.com/en-us/library/ff963548.aspx
    // TODO: Test exception handling

    // Stage tasks are created with the LongRunning hint since each one lives for the
    // duration of the pipeline.
    private static readonly TaskFactory TaskFactory =
        new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);

    /// <summary>
    /// Starts a pipeline definition that reads its items from <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The items to feed into the first stage.</param>
    /// <returns>A syntax object on which stages can be chained.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static IPipelineSyntax<T> For<T>(IEnumerable<T> input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }
        return new PipelineSource<T>(input);
    }

    /// <summary>
    /// Shared implementation of the fluent syntax; concrete stages only supply
    /// <see cref="GetOutput"/>.
    /// </summary>
    public abstract class PipelineBase<T> : IPipelineSyntax<T>
    {
        public IPipelineSyntax<T2> Step<T2>(Func<T, T2> pipelineStepFunc)
        {
            return new PipelineStep<T, T2>(this, pipelineStepFunc);
        }

        public IPipelineSyntax<T2> StepParallel<T2>(Func<T, T2> pipelineStepFunc)
        {
            return new PipelineParallelStep<T, T2>(this, pipelineStepFunc);
        }

        public List<T> Run()
        {
            var taskList = new List<Task>();
            var result = GetOutput(taskList);
            // Drain the final stage first; WaitAll then surfaces any stage failure
            // as an AggregateException.
            var items = result.ToList();
            Task.WaitAll(taskList.ToArray());
            return items;
        }

        public void Run(Action<T> pipelineFinishAction)
        {
            if (pipelineFinishAction == null)
            {
                throw new ArgumentNullException("pipelineFinishAction");
            }

            var taskList = new List<Task>();
            try
            {
                foreach (var item in GetOutput(taskList))
                {
                    pipelineFinishAction(item);
                }
            }
            catch
            {
                // The finish action failed part-way through. Don't return while stage tasks
                // are still running against resources the caller owns, and don't leave
                // their exceptions unobserved. The finish action's exception is the one
                // that propagates.
                ObserveStageTasks(taskList);
                throw;
            }
            Task.WaitAll(taskList.ToArray());
        }

        /// <summary>
        /// Starts this stage (and, recursively, the stages before it) and returns the
        /// sequence of items it produces.
        /// </summary>
        /// <param name="taskList">Receives the background task of every stage that is started.</param>
        public abstract IEnumerable<T> GetOutput(List<Task> taskList);

        /// <summary>
        /// Waits for all stage tasks to complete and marks their exceptions as observed
        /// without rethrowing them. Only used when another exception is already propagating.
        /// </summary>
        private static void ObserveStageTasks(List<Task> taskList)
        {
            try
            {
                Task.WaitAll(taskList.ToArray());
            }
            catch (AggregateException)
            {
                // Deliberately not rethrown: the caller is already rethrowing the
                // exception that triggered this cleanup.
            }
        }
    }

    /// <summary>
    /// The head of a pipeline: exposes the input sequence unchanged and starts no task.
    /// </summary>
    public class PipelineSource<T> : PipelineBase<T>
    {
        private readonly IEnumerable<T> value;

        public PipelineSource(IEnumerable<T> input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }
            value = input;
        }

        public override IEnumerable<T> GetOutput(List<Task> taskList)
        {
            return value;
        }
    }

    /// <summary>
    /// A stage that processes items one at a time, in the order they arrive.
    /// </summary>
    public class PipelineStep<T1, T2> : PipelineBase<T2>
    {
        private readonly PipelineBase<T1> previous;
        private readonly Func<T1, T2> func;

        public PipelineStep(PipelineBase<T1> previous, Func<T1, T2> func)
        {
            if (previous == null)
            {
                throw new ArgumentNullException("previous");
            }
            if (func == null)
            {
                throw new ArgumentNullException("func");
            }
            this.previous = previous;
            this.func = func;
        }

        public override IEnumerable<T2> GetOutput(List<Task> taskList)
        {
            var collection = new BlockingCollection<T2>();
            var input = previous.GetOutput(taskList);
            taskList.Add(TaskFactory.StartNew(() =>
            {
                try
                {
                    foreach (var item in input)
                    {
                        collection.Add(func(item));
                    }
                }
                finally
                {
                    // Always complete the collection, even on failure, so the downstream
                    // consumer's enumeration ends instead of blocking forever.
                    collection.CompleteAdding();
                }
            }));
            return collection.GetConsumingEnumerable();
        }
    }

    /// <summary>
    /// A stage that processes several items concurrently via <c>Parallel.ForEach</c>.
    /// Output order is not guaranteed to match input order.
    /// </summary>
    public class PipelineParallelStep<T1, T2> : PipelineBase<T2>
    {
        private readonly PipelineBase<T1> previous;
        private readonly Func<T1, T2> func;

        public PipelineParallelStep(PipelineBase<T1> previous, Func<T1, T2> func)
        {
            if (previous == null)
            {
                throw new ArgumentNullException("previous");
            }
            if (func == null)
            {
                throw new ArgumentNullException("func");
            }
            this.previous = previous;
            this.func = func;
        }

        public override IEnumerable<T2> GetOutput(List<Task> taskList)
        {
            var collection = new BlockingCollection<T2>();
            var input = previous.GetOutput(taskList);
            taskList.Add(TaskFactory.StartNew(() =>
            {
                try
                {
                    // NOTE(review): the default partitioner may buffer items pulled from a
                    // consuming enumerable, delaying work on items that are already
                    // available. Consider a non-buffering partitioner if the target
                    // framework provides one - confirm before changing.
                    Parallel.ForEach(input, item => collection.Add(func(item)));
                }
                finally
                {
                    // Always complete the collection, even on failure, so the downstream
                    // consumer's enumeration ends instead of blocking forever.
                    collection.CompleteAdding();
                }
            }));
            return collection.GetConsumingEnumerable();
        }
    }

    public interface IPipelineSyntax<T>
    {
        /// <summary>
        /// Adds a new step to the pipeline.
        /// </summary>
        /// <param name="pipelineStepFunc">The function applied to each item reaching this step.</param>
        /// <returns>The syntax object for the extended pipeline.</returns>
        IPipelineSyntax<T2> Step<T2>(Func<T, T2> pipelineStepFunc);

        /// <summary>
        /// Adds a new step to the pipeline, where multiple items can be processed at once. Note: order will not be maintained!
        /// </summary>
        /// <param name="pipelineStepFunc">The function applied to each item reaching this step.</param>
        /// <returns>The syntax object for the extended pipeline.</returns>
        IPipelineSyntax<T2> StepParallel<T2>(Func<T, T2> pipelineStepFunc);

        /// <summary>
        /// Runs the pipeline with the previously defined steps, returning the result. Blocks until the pipeline is finished.
        /// </summary>
        /// <returns>The items produced by the final step.</returns>
        List<T> Run();

        /// <summary>
        /// Runs the pipeline with the previously defined steps, performing the specified action on each item in the result. Blocks until the pipeline is finished.
        /// </summary>
        /// <param name="pipelineFinishAction">The action invoked, on the calling thread, for each item produced by the final step.</param>
        void Run(Action<T> pipelineFinishAction);
    }

    #region Extensions for Tuples

    /// <summary>
    /// Adds a new step to the pipeline.
    /// </summary>
    /// <param name="syntax">The pipeline whose items are two-element tuples.</param>
    /// <param name="pipelineStepFunc">The function applied to the unpacked tuple items.</param>
    /// <returns>The syntax object for the extended pipeline.</returns>
    public static IPipelineSyntax<T2> Step<TIn1, TIn2, T2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Func<TIn1, TIn2, T2> pipelineStepFunc)
    {
        if (pipelineStepFunc == null)
        {
            throw new ArgumentNullException("pipelineStepFunc");
        }
        return syntax.Step(tuple => pipelineStepFunc(tuple.Item1, tuple.Item2));
    }

    /// <summary>
    /// Adds a new step to the pipeline, where multiple items can be processed at once. Note: order will not be maintained!
    /// </summary>
    /// <param name="syntax">The pipeline whose items are two-element tuples.</param>
    /// <param name="pipelineStepFunc">The function applied to the unpacked tuple items.</param>
    /// <returns>The syntax object for the extended pipeline.</returns>
    public static IPipelineSyntax<T2> StepParallel<TIn1, TIn2, T2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Func<TIn1, TIn2, T2> pipelineStepFunc)
    {
        if (pipelineStepFunc == null)
        {
            throw new ArgumentNullException("pipelineStepFunc");
        }
        return syntax.StepParallel(tuple => pipelineStepFunc(tuple.Item1, tuple.Item2));
    }

    /// <summary>
    /// Runs the pipeline with the previously defined steps, performing the specified action on each item in the result. Blocks until the pipeline is finished.
    /// </summary>
    /// <param name="syntax">The pipeline whose items are two-element tuples.</param>
    /// <param name="pipelineFinishAction">The action invoked with the unpacked tuple items.</param>
    public static void Run<TIn1, TIn2>(this IPipelineSyntax<Tuple<TIn1, TIn2>> syntax, Action<TIn1, TIn2> pipelineFinishAction)
    {
        if (pipelineFinishAction == null)
        {
            throw new ArgumentNullException("pipelineFinishAction");
        }
        syntax.Run(tuple => pipelineFinishAction(tuple.Item1, tuple.Item2));
    }

    #endregion
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user