How to Start Convert PDF Read PDF Build PDF Work with PDF Modules PDF Document PDF Pages Text Image Graph & Path Annotation, Markup & Drawing Redaction Security Digital Signature Forms Watermark Bookmark Link File Attachment File Metadata Printing Work with Other SDKs Barcode read Barcode create OCR Twain

Using C# PDF SDK
C# PDF Reader: how to read and extract text from a PDF file


C# demo code to read and extract text from an Adobe PDF document













Extract text content from a page region


//  open a document
String inputFilePath = Program.RootPath + "\\" + "2.pdf";
PDFDocument doc = new PDFDocument(inputFilePath);
//  get a text manager from the document object
PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

//  get the first page from the document
int pageIndex = 0;
PDFPage page = (PDFPage)doc.GetPage(pageIndex);


//  select char at position (245F, 155F)
PointF cursor = new PointF(245F, 155F);
PDFTextCharacter aChar = textMgr.SelectChar(page, cursor);
if (aChar == null)
{
    Console.WriteLine("No character has been found.");
}
else
{
    Console.WriteLine("Value: " + aChar.GetChar() + "; Boundary: " + aChar.GetBoundary().ToString());
}

//  select chars in the region (250F, 150F, 100F, 100F)
RectangleF region = new RectangleF(250F, 150F, 100F, 100F);
List<PDFTextCharacter> chars = textMgr.SelectChar(page, region);
foreach (PDFTextCharacter obj in chars)
{
    Console.WriteLine("Value: " + obj.GetChar() + "; Boundary: " + obj.GetBoundary().ToString());
}




Extract line text from a page region


//  select a line at 150F from the top of the page
PDFTextLine aLine = textMgr.SelectLine(page, 150F);
if (aLine == null)
{
    Console.WriteLine("No character has been found.");
}
else
{
    Console.WriteLine("Line: " + aLine.GetContent());
}




Extract highlighted text


The code below applies only to the following text markup annotation types:


  • PDFAnnotHighlight
  • PDFAnnotUnderLine
  • PDFAnnotDeleteLine
  • PDFAnnotTextReplace


String inputFilePath = Program.RootPath + "\\" + "1.pdf";

//  Open the PDF file.
PDFDocument doc = new PDFDocument(inputFilePath);
//  Retreive all annotations in the document.
List<IPDFAnnot> annots = PDFAnnotHandler.GetAllAnnotations(doc);
foreach (IPDFAnnot annot in annots)
{
    //  For PDFAnnotHighlight, PDFAnnotUnderLine, PDFAnnotDeleteLine and PDFAnnotTextReplace.
    if (annot is IPDFMarkupAnnot)
    {
        //  Get the parent page of the annotation.
        PDFPage page = (PDFPage)doc.GetPage(annot.PageIndex);

        //  Extract text from the target text markup annotation.
        String[] text = PDFAnnotHandler.ExtractText(page, (IPDFMarkupAnnot)annot);
        //  Show the markup text related to the annotation.
        Console.WriteLine("Content: ");
        foreach (String line in text)
        {
            Console.WriteLine(line);
        }
    }
}