1、从 MS Word 文档获取图片
2010-07-18 19:47
本文演示如何用 Java 从 Word 文档中提取图片及其他嵌入的二进制对象。
Microsoft 从 Office 2003 开始支持将文档保存为 XML。图片以 Base64 编码的二进制数据形式嵌入在 XML 文档中。MS Word 使用 w:binData 标签存储嵌入的二进制数据, 并用伪协议 wordml 构造一个 URI 作为 w:name 属性的值; 这个 URI 可以是一个带有文件类型扩展名的名字, 例如: <w:binData w:name="wordml://02000001.jpg">...Base64 数据...</w:binData>
// Article note: change of plan -- the extracted images are to be sent to a database,
// so they are collected in memory instead. The design is as follows.

/**
 * Collects the images embedded in a Word 2003 XML document.
 *
 * <p>The document is parsed with SAX. Every {@code w:binData} element whose {@code w:name}
 * attribute ends with {@code .png} or {@code .jpg} is Base64-decoded and stored in
 * {@link #getDataMap()} under that attribute value.
 */
public class ImageExtractor {

    /** Reset at the start of every element; nothing in this class writes to it. */
    CharArrayWriter text = new CharArrayWriter();

    /** Decoded image bytes keyed by the {@code w:name} attribute value. */
    Map<String, byte[]> dataMap = new HashMap<String, byte[]>();

    /** Number of {@code w:binData} elements seen, whether or not they were stored. */
    int foundImages;

    /** Creates an extractor with no images; nothing is parsed. */
    public ImageExtractor() {
    }

    /**
     * Parses the given document and collects its embedded images.
     * The stream is closed before this constructor returns, even when parsing fails.
     *
     * @param is the Word XML document
     * @throws IOException if the stream cannot be read, the XML is not well-formed,
     *         the parser cannot be configured, or an image payload is not valid Base64
     */
    public ImageExtractor(InputStream is) throws IOException {
        // try-with-resources closes the stream on every exit path.
        try (InputStream in = is) {
            parseXmlFile(in, new ImageParseHandler(), false);
        }
    }

    /**
     * Runs a SAX parse of {@code is}, reporting events to {@code handler}.
     * External entities and external DTDs are never loaded.
     *
     * @param is the XML input
     * @param handler receiver of the SAX events
     * @param validating whether the parser should validate the document
     * @throws IOException on read errors; parse and parser-configuration errors are
     *         wrapped so that a failed parse is not mistaken for "no images"
     */
    private void parseXmlFile(InputStream is, DefaultHandler handler, boolean validating)
            throws IOException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(validating);
            // The document comes from outside, so do not let it pull in external resources.
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature(
                    "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            factory.newSAXParser().parse(is, handler);
        } catch (SAXException e) {
            throw new IOException("The XML input could not be parsed", e);
        } catch (ParserConfigurationException e) {
            throw new IOException("The XML parser could not be configured", e);
        }
    }

    /** SAX handler that gathers the text of image {@code w:binData} elements and decodes it. */
    private class ImageParseHandler extends DefaultHandler {

        /** True while inside a {@code w:binData} element that is being captured. */
        private boolean inImage = false;

        /** Base64 text of the element currently being captured. */
        private StringBuilder encodedData = null;

        /** {@code w:name} of the most recent {@code w:binData} element; may be null. */
        private String imageName;

        @Override
        public void characters(char[] chars, int start, int len) throws SAXException {
            if (inImage) {
                encodedData.append(chars, start, len);
            }
        }

        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            text.reset();
            if (qName.equals("w:binData")) {
                imageName = attributes.getValue("w:name");
                // An element without a w:name cannot be an image we know how to store.
                inImage = imageName != null
                        && (imageName.endsWith(".png") || imageName.endsWith(".jpg"));
                if (inImage) {
                    encodedData = new StringBuilder();
                }
                foundImages++;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            if (qName.equals("w:binData") && inImage) {
                // Base64 text is plain ASCII, so a fixed single-byte charset is lossless here.
                byte[] encoded = encodedData.toString()
                        .getBytes(java.nio.charset.StandardCharsets.ISO_8859_1);
                ByteArrayOutputStream decoded = new ByteArrayOutputStream();
                try {
                    new ImageDecoder().decodeImage(new ByteArrayInputStream(encoded), decoded);
                } catch (java.io.UncheckedIOException e) {
                    throw new SAXException(
                            "Invalid Base64 data for image " + imageName, e.getCause());
                }
                dataMap.put(imageName, decoded.toByteArray());
                encodedData = null;
                inImage = false;
            }
        }
    }

    /**
     * @return the number of {@code w:binData} elements encountered, including those
     *         that were not stored because their name is missing or has another extension
     */
    public int getFoundImages() {
        return foundImages;
    }

    /**
     * @return the live map of decoded image bytes keyed by {@code w:name}
     */
    public Map<String, byte[]> getDataMap() {
        return dataMap;
    }
}

// Article note: the code below handles the Base64 decoding. The Jakarta Commons Codec
// package could be used instead. The original listing used sun.misc.BASE64Decoder and
// reminded the reader to add GCJ to the classpath; this version uses java.util.Base64
// (Java 8+) and needs nothing extra.

/**
 * Decodes Base64 data from one stream into another.
 *
 * <p>Declared without {@code public} because a source file may contain only one public
 * top-level class; move it into its own {@code ImageDecoder.java} to make it public.
 */
class ImageDecoder {

    /**
     * Reads Base64 text from {@code is} and writes the decoded bytes to {@code os}.
     * Line breaks and other characters outside the Base64 alphabet are skipped.
     * Neither stream is closed.
     *
     * @param is source of the Base64 text
     * @param os destination for the decoded bytes
     * @throws java.io.UncheckedIOException if reading, decoding or writing fails
     */
    public void decodeImage(InputStream is, OutputStream os) {
        try {
            InputStream decoded = java.util.Base64.getMimeDecoder().wrap(is);
            byte[] buffer = new byte[8192];
            int count;
            while ((count = decoded.read(buffer)) != -1) {
                os.write(buffer, 0, count);
            }
        } catch (IOException e) {
            throw new java.io.UncheckedIOException("Failed to decode Base64 data", e);
        }
    }
}






