Question

I am evaluating Winnovative's PdfToText library and have run into something that concerns me.

如果我直接运行应用程序,可以瞬间从 20k 或更小的 pdf 中提取文本内容。然而,如果我在 NUnit GUI 中调用同一段代码,则需要 15-25 秒(我已经确认耗时出在 PdfToText 上:在提取文本的那一行设置断点,按 F10,看需要多长时间才能跳到下一行)。

这一点令我感到担忧,因为我不知道原因,也无法确定问题出在哪一方。是 NUnit 的问题,还是 PdfToText 的问题?我所要做的只是从 pdf 中提取文本,但如果在某些条件下会出现这种行为,20 秒是完全不合理的。如果只是在 NUnit 下运行时才这样,那还可以接受,否则我就不得不另寻其他方案。

It's easier to demonstrate the problem using a complete VS Solution (2010), so here's the link to make it easier to set up and run (no need to download NUnit or PdfToText or even a sample pdf): http://dl.dropbox.com/u/273037/PdfToTextProblem.zip (You may have to change the reference to PdfToText to use the x86 dll if you're running on a 32-bit machine).

Just hit F5 and the NUnit Gui runner will load.

如果你有其他建议,我并不局限于这个库。我试过 iTextSharp(仅为两行代码而言过于昂贵),也看过 Aspose(我没有试用过,但 SaaS 许可证要 11k)。但它们要么缺乏必要的功能,要么过于昂贵。

Answer 1

(结果改为回答>>

你们的 PDF 有多复杂?iText 4.1.6 版本允许用于闭源解决方案。虽然 4.1.6 没有直接提供文本提取器,但用 PdfReader 和 GetPageContent() 自己写一个并不太难。

Answer 2

下面是使用 iTextSharp v4.1.6 从 PDF 中提取文本的代码。如果它看起来过于冗长,那是与我的使用方式以及所需的灵活性有关。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using iTextSharp.text.pdf;

namespace ClassLibrary1
{
    /// <summary>
    /// Immutable snapshot of a single token read from a PDF content stream:
    /// the tokeniser's numeric token type plus its string value.
    /// </summary>
    public class PdfToken
    {
        /// <summary>
        /// Captures the token the tokeniser is currently positioned on.
        /// </summary>
        /// <param name="tokenizer">Tokeniser whose current token is copied.</param>
        /// <returns>A new token holding the current type and string value.</returns>
        public static PdfToken Create(PRTokeniser tokenizer)
        {
            int tokenType = tokenizer.TokenType;
            string tokenText = tokenizer.StringValue;
            return new PdfToken(tokenType, tokenText);
        }

        // Instances are only produced through Create.
        private PdfToken(int type, string value)
        {
            this.Type = type;
            this.Value = value;
        }

        /// <summary>Token type, one of the <c>PRTokeniser.TK_*</c> constants.</summary>
        public int Type { get; private set; }

        /// <summary>String value the tokeniser reported for this token.</summary>
        public string Value { get; private set; }

        /// <summary>
        /// True when the token type is <c>PRTokeniser.TK_OTHER</c>.
        /// NOTE(review): despite the name, the content-stream parser in this
        /// code base treats such a token as the keyword that terminates an
        /// operation (its name), not as one of the operation's arguments.
        /// </summary>
        public bool IsOperand
        {
            get { return PRTokeniser.TK_OTHER == this.Type; }
        }
    }

    /// <summary>
    /// One content-stream operation: the operator name together with the
    /// tokens that preceded it.
    /// </summary>
    public class PdfOperation
    {
        /// <summary>
        /// Builds an operation from its operator token and argument tokens.
        /// </summary>
        /// <param name="operationToken">Token whose value becomes <see cref="Name"/>.</param>
        /// <param name="arguments">
        /// Argument tokens. They are copied, so the caller may clear or reuse
        /// its own buffer afterwards without affecting this instance.
        /// A null sequence is treated as "no arguments".
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="operationToken"/> is null.
        /// </exception>
        public PdfOperation(PdfToken operationToken, IEnumerable<PdfToken> arguments)
        {
            if (operationToken == null)
            {
                throw new ArgumentNullException("operationToken");
            }

            Name = operationToken.Value;

            // Snapshot the arguments: storing the caller's list by reference
            // would leave this operation empty as soon as the caller calls
            // Clear() on the buffer it uses to accumulate tokens.
            List<PdfToken> snapshot = arguments == null
                ? new List<PdfToken>()
                : new List<PdfToken>(arguments);
            Arguments = snapshot.AsReadOnly();
        }

        /// <summary>Operator name, e.g. "TJ" or "Tm".</summary>
        public string Name { get; private set; }

        /// <summary>Read-only snapshot of the argument tokens, in stream order.</summary>
        public IEnumerable<PdfToken> Arguments { get; private set; }
    }

    /// <summary>
    /// Receives the operations found in a PDF content stream, one call per
    /// operation.
    /// </summary>
    public interface IPdfParsingStrategy
    {
        /// <summary>Handles a single operation.</summary>
        /// <param name="op">The operation (name plus argument tokens) to handle.</param>
        void Execute(PdfOperation op);
    }

    /// <summary>
    /// Parsing strategy that accumulates the text shown by a content stream
    /// into a plain string. Line breaks and spaces are inferred from changes
    /// in the text matrix; font and colour changes are tracked but do not
    /// currently affect the output.
    /// </summary>
    public class PlainTextParsingStrategy : IPdfParsingStrategy
    {
        // Tm carries six operands (a b c d e f); e and f are the x/y translation.
        private const int MatrixOperandCount = 6;
        private const int MatrixXIndex = 4;
        private const int MatrixYIndex = 5;

        // Tf carries two operands: font resource name and size.
        private const int FontOperandCount = 2;

        private readonly StringBuilder text = new StringBuilder();

        public PlainTextParsingStrategy()
        {
        }

        /// <summary>Returns everything extracted so far.</summary>
        public String GetText()
        {
            return text.ToString();
        }

        #region IPdfParsingStrategy Members

        /// <summary>
        /// Dispatches one operation to its handler. Operators that are not
        /// listed are ignored.
        /// </summary>
        /// <param name="op">Operation to handle.</param>
        /// <exception cref="ArgumentNullException"><paramref name="op"/> is null.</exception>
        public void Execute(PdfOperation op)
        {
            if (op == null)
            {
                throw new ArgumentNullException("op");
            }

            // see Adobe PDF specs for additional operations
            switch (op.Name)
            {
                case "TJ":
                case "Tj":
                    // Both show text; Tj has a single string operand, TJ an
                    // array of strings and numeric adjustments.
                    PrintText(op);
                    break;
                case "'":
                case "\"":
                    // "Move to next line and show text". The double-quote
                    // form also carries two numeric spacing operands, which
                    // must not be rendered as spaces.
                    PrintNewline(op);
                    newSection = false;
                    AppendStrings(op, text);
                    break;
                case "Tm":
                    SetMatrix(op);
                    break;
                case "Tf":
                    SetFont(op);
                    break;
                case "S":
                    // Stroke-path operator; used here as a visual section rule.
                    PrintSection(op);
                    break;
                case "G":
                case "g":
                case "rg":
                    SetColor(op);
                    break;
            }
        }

        #endregion

        // Set after a separator line is written, reset by the next shown text.
        bool newSection = false;

        private void PrintSection(PdfOperation op)
        {
            text.AppendLine("------------------------------------------------------------");
            newSection = true;
        }

        private void PrintNewline(PdfOperation op)
        {
            text.AppendLine();
        }

        private void PrintText(PdfOperation op)
        {
            // The original built a throw-away "header" StringBuilder here when
            // newSection was set and then discarded it; only the flag reset
            // had any effect, so only that is kept.
            newSection = false;
            PrintText(op, text);
        }

        // Appends every string argument verbatim and one space for every
        // numeric argument (the numeric entries of a TJ array).
        private static void PrintText(PdfOperation op, StringBuilder text)
        {
            foreach (PdfToken t in op.Arguments)
            {
                switch (t.Type)
                {
                    case PRTokeniser.TK_STRING:
                        text.Append(t.Value);
                        break;
                    case PRTokeniser.TK_NUMBER:
                        text.Append(" ");
                        break;
                }
            }
        }

        // Appends only the string arguments, ignoring numeric operands.
        private static void AppendStrings(PdfOperation op, StringBuilder target)
        {
            foreach (PdfToken t in op.Arguments)
            {
                if (t.Type == PRTokeniser.TK_STRING)
                {
                    target.Append(t.Value);
                }
            }
        }

        String lastFont = String.Empty;
        String lastFontSize = String.Empty;

        // Remembers the most recent font name and size.
        private void SetFont(PdfOperation op)
        {
            var args = op.Arguments.ToList();
            if (args.Count < FontOperandCount)
            {
                // Malformed Tf: skip it rather than abort the whole extraction.
                return;
            }

            lastFont = args[0].Value;
            lastFontSize = args[1].Value;
        }

        String lastX = String.Empty;
        String lastY = String.Empty;

        // Starts a new line when the y translation changes, or inserts a
        // space when only the x translation changes.
        private void SetMatrix(PdfOperation op)
        {
            var args = op.Arguments.ToList();
            if (args.Count < MatrixOperandCount)
            {
                // Malformed Tm: skip it rather than abort the whole extraction.
                return;
            }

            string x = args[MatrixXIndex].Value;
            string y = args[MatrixYIndex].Value;

            if (lastY != y)
            {
                text.AppendLine();
            }
            else if (lastX != x)
            {
                text.Append(" ");
            }

            lastX = x;
            lastY = y;
        }

        String lastColor = String.Empty;

        // Remembers the most recent colour command as a single
        // underscore-joined string.
        private void SetColor(PdfOperation op)
        {
            lastColor = PrintCommand(op).Replace(" ", "_");
        }

        // Renders an operation as "arg1 arg2 ... NAME".
        private static string PrintCommand(PdfOperation op)
        {
            StringBuilder command = new StringBuilder();
            foreach (PdfToken t in op.Arguments)
            {
                command.AppendFormat("{0} ", t.Value);
            }
            command.Append(op.Name);
            return command.ToString();
        }

    }
}

下面是我调用它的方式:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using iTextSharp.text.pdf;

namespace ClassLibrary1
{
    /// <summary>
    /// Entry point for extracting plain text from an in-memory PDF.
    /// </summary>
    public class PdfExtractor
    {
        /// <summary>
        /// Extracts the text of every page of the given PDF.
        /// </summary>
        /// <param name="pdfBuffer">Complete PDF file contents.</param>
        /// <returns>The text accumulated by a <see cref="PlainTextParsingStrategy"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="pdfBuffer"/> is null.</exception>
        public static string GetText(byte[] pdfBuffer)
        {
            if (pdfBuffer == null)
            {
                throw new ArgumentNullException("pdfBuffer");
            }

            PlainTextParsingStrategy strategy = new PlainTextParsingStrategy();
            ParsePdf(pdfBuffer, strategy);
            return strategy.GetText();
        }

        // Feeds every operation of every page, in page order, to the strategy.
        private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
        {
            PdfReader reader = new PdfReader(pdf);
            try
            {
                int pageCount = reader.NumberOfPages;

                // Page numbers are 1-based.
                for (int i = 1; i <= pageCount; i++)
                {
                    byte[] page = reader.GetPageContent(i);
                    if (page != null)
                    {
                        ParsePage(page, strategy);
                    }
                }
            }
            finally
            {
                // Release the reader even if a page fails to parse.
                reader.Close();
            }
        }

        // Tokenises one page's content stream: tokens are collected until a
        // keyword token arrives, which closes the operation.
        private static void ParsePage(byte[] page, IPdfParsingStrategy strategy)
        {
            PRTokeniser tokenizer = new PRTokeniser(page);
            List<PdfToken> parameters = new List<PdfToken>();

            while (tokenizer.NextToken())
            {
                PdfToken token = PdfToken.Create(tokenizer);
                if (token.IsOperand)
                {
                    strategy.Execute(new PdfOperation(token, parameters));

                    // Start a fresh list instead of clearing the one just
                    // handed out, so a strategy that keeps the operation
                    // does not see its arguments vanish.
                    parameters = new List<PdfToken>();
                }
                else
                {
                    parameters.Add(token);
                }
            }
        }
    }
}

友情链接