POI之Word转化为Markdown-yellowcong

来源:互联网 发布:如何选购家具 知乎 编辑:程序博客网 时间:2024/06/09 15:35

Markdown最近特别火,同时我也想把自己以前做的笔记(用doc写的)分享出来。所以我想解析DOC并生成Markdown文件,然后通过代码直接将图片上传到七牛云,将文本数据传到野狗云和CSDN。为什么我喜欢用七牛和野狗呢?因为它们都有免费的份额。PS:本文还没有写完。

环境搭建

<!-- excel -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>

<!-- word -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>

<!-- xlsx -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>

<!-- xlsx: poi-ooxml depends on this package -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.17</version>
</dependency>

获取标题等级

Doc中的标题有等级,Markdown中也有对应的标题等级。

/**一级标题  begin */private static final String STYLE_TITLE_FONT_FONTNAME_1 ="宋体";private static final Integer STYLE_TITLE_FONT_SIZE_1 =48;private static final Integer TITLE_LV_1 = 1;/**一级标题  end *//**二级标题  begin */private static final String STYLE_TITLE_FONT_FONTNAME_2 ="Arial";private static final Integer STYLE_TITLE_FONT_SIZE_2 =32;private static final Integer TITLE_LV_2 = 2;/**二级标题  begin *//**三级标题  begin */private static final String STYLE_TITLE_FONT_FONTNAME_3 ="Times New Roman";private static final Integer STYLE_TITLE_FONT_SIZE_3 =32;private static final Integer TITLE_LV_3 = 3;/**三级标题  begin *//**普通的文本,不是标题*/private static final Integer TITLE_LV_0 = 0;/** * 获取标题的等级 * @param paragraph * @return 1、2、3三个等级,如果没有数据返回,那么等级为0 ,是普通文本 */private static Integer getTitleLvl(Paragraph paragraph) {    //获取这一段文字的数目    int charCnts = paragraph.numCharacterRuns();    int titleLv = TITLE_LV_0;    if(charCnts == 0){        return TITLE_LV_0;    }    CharacterRun characterRun = paragraph.getCharacterRun(0);    if(characterRun != null){        //字体的名称        String fontName =characterRun.getFontName();        //字体大小        int fontSize = characterRun.getFontSize();        //一级标题的情况        if(fontName.equals(STYLE_TITLE_FONT_FONTNAME_1) && fontSize == STYLE_TITLE_FONT_SIZE_1){            //System.out.println("一级标题\t"+paragraph.text().trim());            titleLv= TITLE_LV_1;        }else if(fontName.equals(STYLE_TITLE_FONT_FONTNAME_2) && fontSize == STYLE_TITLE_FONT_SIZE_2){            titleLv= TITLE_LV_2;//              System.out.println("\t二级标题\t"+paragraph.text().trim());        }else if(fontName.equals(STYLE_TITLE_FONT_FONTNAME_3) && fontSize == STYLE_TITLE_FONT_SIZE_3){            titleLv= TITLE_LV_3;            //System.out.println("\t\t三级标题\t"+paragraph.text().trim());        }    }    return titleLv;}

获取每行文字的每段落字符的样式

Markdown中的文字是带样式的,同一行里的不同片段可能有不同的样式。通过Range可以获取CharacterRun(这个类中包含了字体信息)。

// 获取段落数,一个回车符号就是一个段落了int paraNum = range.numParagraphs();System.out.println(paraNum);for (int i = 0; i < paraNum; i++) {    //获取段落    Paragraph paragraph =range.getParagraph(i);    //获取这一段文字的数目    int charCnts = paragraph.numCharacterRuns();    if(charCnts == 0){        return;    }    //获取标题的级别,是一级 二级,三级的情况 , 0是普通文本    int titleLev = getTitleLvl(paragraph);    //源码中,有类似的写法,具体为啥判断是否》=-1不是特别清楚    int skipUntil = -1;    for (int c = 0; c < range.numCharacterRuns(); c++) {        //CharacterRun 会根据字体和样式,自动的字符分割开        CharacterRun characterRun = range.getCharacterRun(c);        if (characterRun != null && characterRun.getStartOffset() >= skipUntil) {            String text = characterRun.text();            String fontName = characterRun.getFontName();            int fontSize = characterRun.getFontSize();            int fontColor = characterRun.getColor();            boolean bold = characterRun.isBold();            boolean italic= characterRun.isItalic();            System.out.printf("当前文字的字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",text,fontName,fontSize,fontColor,bold,italic);        }    }    //图片处理    //表格处理}

完整代码

这部分代码还没有完全写完。

package com.yellowcong.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.UUID;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.Bookmarks;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import com.yellowcong.utils.DocUtils;

/**
 * Work-in-progress experiment that reads a legacy Word (.doc) file with
 * Apache POI HWPF and prints the information needed to convert it to
 * Markdown: paragraph heading levels, per-run font styles, bookmarks,
 * tables and list paragraphs.
 */
public class DocTest {

    /** Directory holding the input document; extracted images are written here too. */
    private static final String BASE_PATH = "D:\\笔记\\服务器学习\\bae\\";

    /** Level-1 heading style: begin. */
    private static final String STYLE_TITLE_FONT_FONTNAME_1 = "宋体";
    // NOTE(review): HWPF appears to report font sizes in half-points, so 48
    // would correspond to 24pt -- confirm against the source documents.
    private static final int STYLE_TITLE_FONT_SIZE_1 = 48;
    private static final int TITLE_LV_1 = 1;
    /** Level-1 heading style: end. */

    /** Level-2 heading style: begin. */
    private static final String STYLE_TITLE_FONT_FONTNAME_2 = "Arial";
    private static final int STYLE_TITLE_FONT_SIZE_2 = 32;
    private static final int TITLE_LV_2 = 2;
    /** Level-2 heading style: end. */

    /** Level-3 heading style: begin. */
    private static final String STYLE_TITLE_FONT_FONTNAME_3 = "Times New Roman";
    private static final int STYLE_TITLE_FONT_SIZE_3 = 32;
    private static final int TITLE_LV_3 = 3;
    /** Level-3 heading style: end. */

    /** Plain text, i.e. not a heading. */
    private static final int TITLE_LV_0 = 0;

    /**
     * Loads the sample document, converts it to HTML through
     * {@code DocUtils.copyDocToHtml} and prints the style of every character run.
     *
     * @param args ignored
     * @throws Exception if the document cannot be read or converted
     */
    public static void main(String[] args) throws Exception, IOException {
        File file = new File(BASE_PATH + "BAE 服务器文件目录问题.doc");

        HWPFDocument doc;
        // Close the file handle as soon as the document is built; the
        // original code never closed this stream.
        try (InputStream in = new FileInputStream(file)) {
            doc = new HWPFDocument(in);
        }

        DocUtils.copyDocToHtml(file);

        Range range = doc.getRange();
        printInfo(range);

        // Alternative loader noted by the author, not used yet:
        // HWPFDocumentCore doc = WordToHtmlUtils.loadDoc(file);
    }

    /**
     * Inserts sample content after the range. The change only exists in
     * memory; nothing is written back to the file here.
     *
     * @param range the range to append to
     */
    private static void insertInfo(Range range) {
        range.insertAfter("Hello");
    }

    /**
     * Prints the paragraph count of the range followed by the font details of
     * every character run, paragraph by paragraph. In Word every carriage
     * return ends a paragraph.
     *
     * @param range the range to dump
     */
    private static void printInfo(Range range) {
        int paraNum = range.numParagraphs();
        System.out.println(paraNum);

        for (int i = 0; i < paraNum; i++) {
            Paragraph paragraph = range.getParagraph(i);
            int charCnts = paragraph.numCharacterRuns();
            if (charCnts == 0) {
                // Skip only this paragraph; a `return` here would silently
                // drop every paragraph that follows an empty one.
                continue;
            }

            // Heading level: 1, 2 or 3; 0 means plain text. Not consumed yet --
            // kept for the Markdown output that is still to be written.
            int titleLev = getTitleLvl(paragraph);

            // Walk the runs of THIS paragraph. Iterating the runs of the whole
            // range here would re-print the document once per paragraph.
            for (int c = 0; c < charCnts; c++) {
                // Text is split into a new CharacterRun whenever font or style changes.
                CharacterRun characterRun = paragraph.getCharacterRun(c);
                if (characterRun == null) {
                    continue;
                }
                String text = characterRun.text();
                String fontName = characterRun.getFontName();
                int fontSize = characterRun.getFontSize();
                int fontColor = characterRun.getColor();
                boolean bold = characterRun.isBold();
                boolean italic = characterRun.isItalic();
                // Six arguments need six specifiers; with only five, %d received
                // the font-name String and printf threw
                // IllegalFormatConversionException.
                System.out.printf("当前文字:%s,字体名称:%s,字体大小%d,字体颜色%d,加粗%b,斜体%b\r\n",
                        text, fontName, fontSize, fontColor, bold, italic);
            }
            // Picture handling (not implemented yet)
            // Table handling (not implemented yet)
        }
    }

    /**
     * Holder for the style information of one piece of text.
     *
     * <p>Declared static because it never touches the enclosing instance, so
     * it can be created from the static methods of this class.
     *
     * @author yellowcong
     * @date 2017-07-15
     */
    static class FontStyle {
        private String fontColor;
        private String fontName;
        private Integer fontSize;
        /** Bold flag. */
        private boolean bold;
        /** Italic flag. */
        private boolean italic;

        public String getFontColor() {
            return fontColor;
        }

        public void setFontColor(String fontColor) {
            this.fontColor = fontColor;
        }

        public String getFontName() {
            return fontName;
        }

        public void setFontName(String fontName) {
            this.fontName = fontName;
        }

        public Integer getFontSize() {
            return fontSize;
        }

        public void setFontSize(Integer fontSize) {
            this.fontSize = fontSize;
        }

        public boolean isBold() {
            return bold;
        }

        public void setBold(boolean bold) {
            this.bold = bold;
        }

        public boolean isItalic() {
            return italic;
        }

        public void setItalic(boolean italic) {
            this.italic = italic;
        }
    }

    /**
     * Determines the heading level of a paragraph.
     *
     * <p>Only the first character run of the paragraph is inspected: its font
     * name and font size are compared against the three heading styles
     * declared at the top of this class.
     *
     * @param paragraph the paragraph to classify
     * @return 1, 2 or 3 for a heading; 0 when the paragraph has no character
     *         runs or its first run matches none of the heading styles
     */
    private static Integer getTitleLvl(Paragraph paragraph) {
        if (paragraph.numCharacterRuns() == 0) {
            return TITLE_LV_0;
        }

        CharacterRun firstRun = paragraph.getCharacterRun(0);
        if (firstRun == null) {
            return TITLE_LV_0;
        }

        // The font name can be null, so every comparison below is written
        // constant-first to avoid a NullPointerException.
        String fontName = firstRun.getFontName();
        int fontSize = firstRun.getFontSize();

        if (STYLE_TITLE_FONT_FONTNAME_1.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_1) {
            return TITLE_LV_1;
        }
        if (STYLE_TITLE_FONT_FONTNAME_2.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_2) {
            return TITLE_LV_2;
        }
        if (STYLE_TITLE_FONT_FONTNAME_3.equals(fontName) && fontSize == STYLE_TITLE_FONT_SIZE_3) {
            return TITLE_LV_3;
        }
        return TITLE_LV_0;
    }

    /** Placeholder; not implemented yet. */
    private static void getFontLive() {
    }

    /**
     * Writes the given image bytes to a new {@code .jpg} file under
     * {@code BASE_PATH}. The file name is the first six characters of a
     * random UUID.
     *
     * @param imgByte the raw image bytes; must not be null
     * @throws Exception if the file cannot be created or written
     */
    public static void copyByteToFile(byte[] imgByte) throws Exception {
        if (imgByte == null) {
            // Fail before an empty file is left behind on disk.
            throw new NullPointerException("imgByte");
        }
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        File target = new File(BASE_PATH + fileName + ".jpg");
        // The bytes are already in memory, so they are written directly; this
        // drops the JDK-internal ByteInputStream the original relied on, and
        // try-with-resources closes the file even when the write fails.
        try (OutputStream out = new FileOutputStream(target)) {
            out.write(imgByte);
        }
    }

    /**
     * Prints the number of bookmarks, then the name, start and end position
     * of each bookmark.
     *
     * @param bookmarks the bookmarks to dump
     */
    private void printInfo(Bookmarks bookmarks) {
        int count = bookmarks.getBookmarksCount();
        System.out.println("书签数量:" + count);
        for (int i = 0; i < count; i++) {
            Bookmark bookmark = bookmarks.getBookmark(i);
            System.out.println("书签" + (i + 1) + "的名称是:" + bookmark.getName());
            System.out.println("开始位置:" + bookmark.getStart());
            System.out.println("结束位置:" + bookmark.getEnd());
        }
    }

    /**
     * Prints the trimmed text of every cell of every table in the range.
     *
     * <p>Author's note: every carriage return is a paragraph, so each cell
     * contains at least one paragraph and each row ends with one.
     *
     * @param range the range whose tables are read
     */
    private static void readTable(Range range) {
        TableIterator tableIter = new TableIterator(range);
        while (tableIter.hasNext()) {
            Table table = tableIter.next();
            int rowNum = table.numRows();
            for (int j = 0; j < rowNum; j++) {
                TableRow row = table.getRow(j);
                int cellNum = row.numCells();
                for (int k = 0; k < cellNum; k++) {
                    TableCell cell = row.getCell(k);
                    // Print the cell text
                    System.out.println(cell.text().trim());
                }
            }
        }
    }

    /**
     * Prints the text of every paragraph in the range that belongs to a list.
     *
     * @param range the range whose paragraphs are scanned
     */
    private static void readList(Range range) {
        int num = range.numParagraphs();
        for (int i = 0; i < num; i++) {
            Paragraph para = range.getParagraph(i);
            if (para.isInList()) {
                System.out.println("list: " + para.text());
            }
        }
    }
}
原创粉丝点击