/**
 * Extracts the plain text of a Word document, choosing the POI extractor
 * from the file name suffix.
 *
 * <p>Only the suffix of {@code filePath} is consulted; the content of the
 * stream is not inspected. Any exception raised while reading (or while
 * closing the stream) is logged together with {@code filePath} and an empty
 * string is returned.
 *
 * @param filePath name or path of the document; a name ending in {@code "doc"}
 *                 is read with {@code WordExtractor}, one ending in
 *                 {@code "docx"} with {@code XWPFWordExtractor}
 * @param is       stream positioned at the start of the document; it is
 *                 always closed by this method
 * @return the extracted text, or {@code ""} when the suffix is not handled
 *         or reading fails
 */
private static String readDoc(String filePath, InputStream is) {
    String text = "";
    // try-with-resources closes the stream exactly once, and a failure in
    // close() ends up in the catch below instead of escaping from a finally.
    try (InputStream in = is) {
        if (filePath.endsWith("doc")) {
            try (WordExtractor ex = new WordExtractor(in)) {
                text = ex.getText();
            }
        } else if (filePath.endsWith("docx")) {
            XWPFDocument doc = new XWPFDocument(in);
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
                text = extractor.getText();
            }
        }
    } catch (Exception e) {
        logger.error(filePath, e);
    }
    return text;
}
org.apache.poi.poifs.filesystem.OfficeXmlFileException: The supplied data appears to be in the Office 2007+ XML. You are calling the part of POI that deals with OLE2 Office Documents.
// Probe: the data is treated as a ZIP when ZipInputStream can read a first entry from it.
// Note that the probe reads from fileStream, and the ZipInputStream wrapper is not closed here.
boolean isZip = new ZipInputStream( fileStream ).getNextEntry() != null;
/**
 * Known leading byte sequences ("magic") of the file formats that can be
 * told apart by looking at the first bytes of a stream.
 *
 * <p>Each constant carries one or more alternative byte patterns. In a
 * pattern, the byte {@code 0x70} additionally accepts the data bytes
 * {@code 0x10}, {@code 0x20} and {@code 0x40}.
 */
public enum FileMagic {
    /** OLE2 / BIFF8+ stream used for Office 97 and higher documents */
    OLE2(HeaderBlockConstants._signature),
    /** OOXML / ZIP stream */
    OOXML(OOXML_FILE_HEADER),
    /** XML file */
    XML(RAW_XML_FILE_HEADER),
    /** BIFF2 raw stream - for Excel 2 */
    BIFF2(new byte[]{
            0x09, 0x00, // sid=0x0009
            0x04, 0x00, // size=0x0004
            0x00, 0x00, // unused
            0x70, 0x00  // 0x70 = multiple values
    }),
    /** BIFF3 raw stream - for Excel 3 */
    BIFF3(new byte[]{
            0x09, 0x02, // sid=0x0209
            0x06, 0x00, // size=0x0006
            0x00, 0x00, // unused
            0x70, 0x00  // 0x70 = multiple values
    }),
    /** BIFF4 raw stream - for Excel 4 */
    BIFF4(new byte[]{
            0x09, 0x04, // sid=0x0409
            0x06, 0x00, // size=0x0006
            0x00, 0x00, // unused
            0x70, 0x00  // 0x70 = multiple values
    }, new byte[]{
            0x09, 0x04, // sid=0x0409
            0x06, 0x00, // size=0x0006
            0x00, 0x00, // unused
            0x00, 0x01
    }),
    /** Old MS Write raw stream */
    MSWRITE(
            new byte[]{0x31, (byte) 0xbe, 0x00, 0x00},
            new byte[]{0x32, (byte) 0xbe, 0x00, 0x00}),
    /** RTF document */
    RTF("{\\rtf"),
    /** PDF document */
    PDF("%PDF"),
    // keep UNKNOWN always as last enum!
    /** UNKNOWN magic */
    UNKNOWN(new byte[0]);

    /** Alternative byte patterns; the data matches if any one of them matches. */
    final byte[][] magic;

    /** Stores {@code magic} as a single 8-byte pattern written with {@code LittleEndian.putLong}. */
    FileMagic(long magic) {
        this.magic = new byte[1][8];
        LittleEndian.putLong(this.magic[0], 0, magic);
    }

    /** Uses the given byte arrays as the alternative patterns. */
    FileMagic(byte[]... magic) {
        this.magic = magic;
    }

    /** Uses the CP-1252 bytes of {@code magic} as the single pattern. */
    FileMagic(String magic) {
        this(magic.getBytes(LocaleUtil.CHARSET_1252));
    }

    /**
     * Finds the first constant (in declaration order) one of whose patterns
     * matches the beginning of {@code magic}.
     *
     * <p>Every pattern is compared independently, starting at offset 0 of
     * {@code magic}, so a failed first alternative does not affect the
     * following ones. Patterns longer than {@code magic} are skipped instead
     * of reading past the end of the array.
     *
     * @param magic the leading bytes of the data to identify
     * @return the matching constant, or {@link #UNKNOWN} if none matches
     */
    public static FileMagic valueOf(byte[] magic) {
        for (FileMagic fm : values()) {
            for (byte[] pattern : fm.magic) {
                if (startsWith(magic, pattern)) {
                    return fm;
                }
            }
        }
        return UNKNOWN;
    }

    /**
     * Tells whether {@code data} begins with {@code pattern}, honouring the
     * {@code 0x70} pattern byte that also accepts 0x10, 0x20 and 0x40.
     */
    private static boolean startsWith(byte[] data, byte[] pattern) {
        if (data.length < pattern.length) {
            return false;
        }
        for (int i = 0; i < pattern.length; i++) {
            byte m = pattern[i];
            byte d = data[i];
            boolean multiValue = m == 0x70 && (d == 0x10 || d == 0x20 || d == 0x40);
            if (d != m && !multiValue) {
                return false;
            }
        }
        return true;
    }

    /**
     * Get the file magic of the supplied InputStream (which MUST
     * support mark and reset).<p>
     *
     * If unsure whether your InputStream supports mark / reset,
     * use {@link #prepareToCheckMagic(InputStream)} to wrap it and make
     * sure to always use the wrapped stream, and not the original!<p>
     *
     * Even if this method returns {@link FileMagic#UNKNOWN}, the data could
     * still be a ZIP stream that has leading junk bytes.
     *
     * @param inp an InputStream which supports mark/reset
     * @return the magic detected from the first 8 bytes of the stream
     * @throws IOException if the stream does not support mark, or if
     *         {@code IOUtils.peekFirst8Bytes} fails
     */
    public static FileMagic valueOf(InputStream inp) throws IOException {
        if (!inp.markSupported()) {
            throw new IOException("getFileMagic() only operates on streams which support mark(int)");
        }
        // Grab the first 8 bytes
        byte[] data = IOUtils.peekFirst8Bytes(inp);
        return FileMagic.valueOf(data);
    }

    /**
     * Checks whether an {@link InputStream} can be reset (i.e. used for
     * checking the header magic) and wraps it if not.
     *
     * @param stream stream to be checked for wrapping
     * @return a mark enabled stream
     */
    public static InputStream prepareToCheckMagic(InputStream stream) {
        if (stream.markSupported()) {
            return stream;
        }
        // we used to process the data via a PushbackInputStream, but user code could provide a too small one
        // so we use a BufferedInputStream instead now
        return new BufferedInputStream(stream);
    }
}
/**
 * Extracts the plain text of a Word document, choosing the POI extractor
 * from the leading bytes of the stream rather than from the file name.
 *
 * <p>The stream is wrapped with {@code FileMagic.prepareToCheckMagic} so the
 * header can be inspected, and that same wrapped stream is then handed to
 * the extractor. Any exception raised while sniffing, reading or closing is
 * logged together with {@code filePath} and an empty string is returned.
 *
 * @param filePath name or path of the document; used only in the log message
 * @param is       stream positioned at the start of the document; it is
 *                 closed by this method
 * @return the extracted text, or {@code ""} when the content is neither
 *         OLE2 nor OOXML or reading fails
 */
private static String readDoc (String filePath, InputStream is) {
    String text = "";
    // The wrapper is created inside the try so that a failure here is logged
    // too; closing the wrapper also closes the underlying stream.
    try (InputStream in = FileMagic.prepareToCheckMagic(is)) {
        // Sniff the header once and dispatch on the result.
        FileMagic fileMagic = FileMagic.valueOf(in);
        if (fileMagic == FileMagic.OLE2) {
            try (WordExtractor ex = new WordExtractor(in)) {
                text = ex.getText();
            }
        } else if (fileMagic == FileMagic.OOXML) {
            XWPFDocument doc = new XWPFDocument(in);
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
                text = extractor.getText();
            }
        }
    } catch (Exception e) {
        logger.error("for file " + filePath, e);
    }
    return text;
}
/**
 * Reads the text of all string and numeric cells of an Excel workbook
 * supplied as a stream.
 *
 * <p>A {@code filePath} ending in {@code ".xls"} is opened with
 * {@code HSSFWorkbook}; everything else is opened through
 * {@code StreamingReader}. The workbook and the stream are closed on every
 * path, including when reading a sheet fails.
 *
 * @param filePath name or path of the workbook; selects the reader and is
 *                 used as the log message
 * @param inp      stream positioned at the start of the workbook; it is
 *                 always closed by this method
 * @return the cell values separated by single spaces; whatever was collected
 *         so far if an {@code OLE2NotOfficeXmlFileException} was logged
 * @throws Exception any other failure while opening, reading or closing
 */
@SuppressWarnings("deprecation")
private static String readExcel(String filePath, InputStream inp) throws Exception {
    StringBuilder sb = new StringBuilder();
    try (InputStream in = inp) {
        boolean isXls = filePath.endsWith(".xls");
        try (Workbook wb = isXls
                ? new HSSFWorkbook(in)
                : StreamingReader.builder()
                        .rowCacheSize(1000) // number of rows to keep in memory (defaults to 10)
                        .bufferSize(4096)   // buffer size to use when reading InputStream to file (defaults to 1024)
                        .open(in)) {        // InputStream or File for XLSX file (required)
            readSheet(wb, sb, isXls);
        }
    } catch (OLE2NotOfficeXmlFileException e) {
        logger.error(filePath, e);
    }
    return sb.toString();
}
/**
 * Reads the text of all string and numeric cells of an Excel workbook
 * stored in a file.
 *
 * <p>A {@code filepath} ending in {@code ".xls"} is opened with
 * {@code WorkbookFactory.create}; everything else is opened through
 * {@code StreamingReader}. The workbook is closed on every path, including
 * when reading a sheet fails. Any exception is logged with {@code filepath}
 * as the message.
 *
 * @param filepath name or path of the workbook; selects the reader and is
 *                 used as the log message
 * @param file     the workbook file to open
 * @return the cell values separated by single spaces; whatever was collected
 *         before a failure if an exception was logged
 */
private static String readExcelByFile(String filepath, File file) {
    StringBuilder sb = new StringBuilder();
    try {
        boolean isXls = filepath.endsWith(".xls");
        try (Workbook wb = isXls
                ? WorkbookFactory.create(file)
                : StreamingReader.builder()
                        .rowCacheSize(1000) // number of rows to keep in memory (defaults to 10)
                        .bufferSize(4096)   // buffer size to use when reading InputStream to file (defaults to 1024)
                        .open(file)) {      // InputStream or File for XLSX file (required)
            readSheet(wb, sb, isXls);
        }
    } catch (Exception e) {
        logger.error(filepath, e);
    }
    return sb.toString();
}
/**
 * Appends the value of every string and numeric cell of {@code wb} to
 * {@code sb}, each followed by a single space. Cells of any other type are
 * skipped.
 *
 * @param wb    workbook whose sheets, rows and cells are iterated
 * @param sb    builder that receives the text
 * @param isXls when {@code true}, numeric cells are rendered with a
 *              {@code DataFormatter}; otherwise their value is taken from
 *              {@code getStringCellValue()}
 * @return the same {@code sb} instance that was passed in
 * @throws Exception declared for callers; propagates whatever the POI
 *         accessors throw
 */
private static StringBuilder readSheet(Workbook wb, StringBuilder sb, boolean isXls) throws Exception {
    // One formatter for the whole workbook instead of a new one per numeric cell;
    // it is only needed on the .xls path.
    DataFormatter formatter = isXls ? new DataFormatter() : null;
    for (Sheet sheet : wb) {
        for (Row row : sheet) {
            for (Cell cell : row) {
                if (cell.getCellType() == Cell.CELL_TYPE_STRING) {
                    sb.append(cell.getStringCellValue()).append(" ");
                } else if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC) {
                    String value = isXls
                            ? formatter.formatCellValue(cell)
                            : cell.getStringCellValue();
                    sb.append(value).append(" ");
                }
            }
        }
    }
    return sb;
}
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有