public class TSGTextStripper
extends org.apache.pdfbox.util.PDFTextStripper
Constructor and Description |
---|
TSGTextStripper()
Initializes our hash map, in addition to calling the parent constructor.
|
Modifier and Type | Method and Description |
---|---|
IndexRectangle |
buildIndexRect(org.apache.pdfbox.util.TextPosition firstTextPosition,
org.apache.pdfbox.util.TextPosition lastTextPosition) |
boolean |
checkForLinkStart(java.lang.String textPositionsWord)
Checks if the given string is the beginning of a link.
|
void |
clearTextLinks()
Clears out all the detected internal links found.
|
void |
clearWordMap()
Clear all the index entries out of the map.
|
java.lang.String |
getStringFromTextPosition(org.apache.pdfbox.util.TextPosition textPosition)
Method to return the String at a given TextPosition.
|
java.util.List<TSGInternalLink> |
getTextLinks()
Returns links detected within the text of the document.
|
java.util.Map<java.lang.String,java.util.List<TSGIndexEntry>> |
getWordsWithPositions()
Returns the processed word-to-index map.
|
protected void |
writeString(java.lang.String text,
java.util.List<org.apache.pdfbox.util.TextPosition> textPositions)
Called for each line of text stripped off of the PDF.
|
endArticle, endDocument, endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getCurrentPageNo, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageSeparator, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getStartPage, getSuppressDuplicateOverlappingText, getText, getText, getWordSeparator, handleLineSeparation, inspectFontEncoding, isParagraphSeparation, matchListItemPattern, matchPattern, processPage, processPages, processTextPosition, resetEngine, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageSeparator, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startArticle, startDocument, startPage, writeCharacters, writeLineSeparator, writePage, writePageEnd, writePageSeperator, writePageStart, writeParagraphEnd, writeParagraphSeparator, writeParagraphStart, writeString, writeText, writeText, writeWordSeparator
getColorSpaces, getCurrentPage, getFonts, getGraphicsStack, getGraphicsState, getGraphicsStates, getResources, getTextLineMatrix, getTextMatrix, getTotalCharCnt, getValidCharCnt, getXObjects, isForceParsing, processEncodedText, processOperator, processOperator, processStream, processSubStream, registerOperatorProcessor, setColorSpaces, setFonts, setForceParsing, setGraphicsStack, setGraphicsState, setGraphicsStates, setTextLineMatrix, setTextMatrix
public TSGTextStripper() throws java.io.IOException
java.io.IOException
- If there is an error loading the properties.
protected void writeString(java.lang.String text, java.util.List<org.apache.pdfbox.util.TextPosition> textPositions) throws java.io.IOException
writeString
in class org.apache.pdfbox.util.PDFTextStripper
java.io.IOException
public IndexRectangle buildIndexRect(org.apache.pdfbox.util.TextPosition firstTextPosition, org.apache.pdfbox.util.TextPosition lastTextPosition)
firstTextPosition
- The location of the start of the text
lastTextPosition
- The location of the end of the text
public boolean checkForLinkStart(java.lang.String textPositionsWord)
textPositionsWord
- The text that we are checking
public java.util.Map<java.lang.String,java.util.List<TSGIndexEntry>> getWordsWithPositions()
TSGIndexEntry
objects.
public java.util.List<TSGInternalLink> getTextLinks()
TSGInternalLink
objects.
public void clearTextLinks()
public void clearWordMap()
public java.lang.String getStringFromTextPosition(org.apache.pdfbox.util.TextPosition textPosition)
textPosition
- the text position to get the text from