I am using PdfBox in Java to extract text from PDF files. Some of the input files provided are not valid and PDFTextStripper halts on these files. Is there a clean way to ch
The answer by Roger Keays is wrong, since not all PDF files are version 1.3 and not all of them are terminated by an EOL. The code below works for all non-corrupted PDF files:
/**
 * Performs a cheap structural sanity check on a byte array that is supposed to
 * hold a complete PDF file.
 *
 * <p>The check passes when both of the following hold:
 * <ul>
 *   <li>the data starts with the header bytes {@code %PDF-};</li>
 *   <li>the end-of-file marker {@code %%EOF} occurs after the header and lies
 *       completely within the last 1024 bytes of the data.</li>
 * </ul>
 *
 * <p>Searching a tail window instead of comparing at fixed offsets means the
 * result does not depend on what follows the marker (nothing, a space, LF, CR,
 * CRLF, ...) nor on the PDF version number in the header. This is only a
 * heuristic: the document structure itself is never parsed, so a {@code true}
 * result does not guarantee that a PDF library can read the file.
 *
 * @param data the raw file content; may be {@code null}
 * @return {@code true} if the header and the end-of-file marker were found;
 *         {@code false} otherwise, including for {@code null} input and for
 *         input too short to contain both the header and the marker
 */
public static boolean is_pdf(byte[] data) {
    final byte[] header = {0x25, 0x50, 0x44, 0x46, 0x2D};    // "%PDF-"
    final byte[] eofMarker = {0x25, 0x25, 0x45, 0x4F, 0x46}; // "%%EOF"
    // Adobe's PDF Reference notes that Acrobat only requires the marker to
    // appear somewhere within the last 1024 bytes of the file.
    final int tailWindow = 1024;

    // Reject anything that cannot hold both markers before indexing into it;
    // this is what keeps very short inputs from causing out-of-range reads.
    if (data == null || data.length < header.length + eofMarker.length) {
        return false;
    }

    for (int i = 0; i < header.length; i++) {
        if (data[i] != header[i]) {
            return false;
        }
    }

    // The marker must not overlap the header and must start inside the tail window.
    final int lowestStart = Math.max(header.length, data.length - tailWindow);

    // Walk backwards from the last possible start position: in a well-formed
    // file the marker sits at (or very near) the end, so this usually matches
    // on one of the first few iterations.
    for (int start = data.length - eofMarker.length; start >= lowestStart; start--) {
        boolean matches = true;
        for (int j = 0; j < eofMarker.length; j++) {
            if (data[start + j] != eofMarker[j]) {
                matches = false;
                break;
            }
        }
        if (matches) {
            return true;
        }
    }
    return false;
}