Using C# PDF SDK
C# PDF Reader: how to read and extract text from a PDF file
C# demo code to read and extract text from an Adobe PDF document
Extract text content from a page region
// Load the sample document and build the text manager used for all selections.
String inputFilePath = Program.RootPath + "\\" + "2.pdf";
PDFDocument doc = new PDFDocument(inputFilePath);
PDFTextMgr textMgr = PDFTextHandler.ExportPDFTextManager(doc);

// Work on the first page of the document (index 0).
int firstPageIndex = 0;
PDFPage page = (PDFPage)doc.GetPage(firstPageIndex);

// Pick the single character located at point (245F, 155F); null means nothing was hit.
PDFTextCharacter hitChar = textMgr.SelectChar(page, new PointF(245F, 155F));
Console.WriteLine(hitChar == null
    ? "No character has been found."
    : "Value: " + hitChar.GetChar() + "; Boundary: " + hitChar.GetBoundary().ToString());

// Pick every character inside the rectangle (250F, 150F, 100F, 100F) and print each one.
List<PDFTextCharacter> regionChars = textMgr.SelectChar(page, new RectangleF(250F, 150F, 100F, 100F));
foreach (PDFTextCharacter regionChar in regionChars)
{
    String value = "Value: " + regionChar.GetChar();
    String boundary = "; Boundary: " + regionChar.GetBoundary().ToString();
    Console.WriteLine(value + boundary);
}
Extract line text from a page region
// Select the text line located 150F from the top of the page.
// Relies on the text manager (textMgr) and page obtained earlier.
PDFTextLine aLine = textMgr.SelectLine(page, 150F);
if (aLine == null)
{
    // Nothing was selected at that position; report a missing line (not a character).
    Console.WriteLine("No line has been found.");
}
else
{
    // Print the content of the selected line.
    Console.WriteLine("Line: " + aLine.GetContent());
}
Extract highlighted text
The code below applies only to the following text markup annotation types:
- PDFAnnotHighlight
- PDFAnnotUnderLine
- PDFAnnotDeleteLine
- PDFAnnotTextReplace
// Open the annotated PDF file.
String inputFilePath = Program.RootPath + "\\" + "1.pdf";
PDFDocument doc = new PDFDocument(inputFilePath);

// Retrieve every annotation in the document.
List<IPDFAnnot> annotations = PDFAnnotHandler.GetAllAnnotations(doc);
foreach (IPDFAnnot annot in annotations)
{
    // Only text markup annotations (PDFAnnotHighlight, PDFAnnotUnderLine,
    // PDFAnnotDeleteLine and PDFAnnotTextReplace) are processed; skip anything else.
    IPDFMarkupAnnot markup = annot as IPDFMarkupAnnot;
    if (markup == null)
    {
        continue;
    }

    // Fetch the page that owns the annotation, then pull out the marked-up text.
    PDFPage ownerPage = (PDFPage)doc.GetPage(annot.PageIndex);
    String[] markupLines = PDFAnnotHandler.ExtractText(ownerPage, markup);

    // Print the extracted text, one entry per output line.
    Console.WriteLine("Content: ");
    foreach (String markupLine in markupLines)
    {
        Console.WriteLine(markupLine);
    }
}