微博数据清洗(Java版)

来源:互联网 发布:西班牙语发音软件 编辑:程序博客网 时间:2024/06/11 18:25

大数据公益大学提供的一份数据,义务处理一下,原始数据是Excel,含有html标签,如下:



要求清洗掉html标签,和微博内容中的url地址。


主要分为两部分:

1.处理文本,清洗数据。

2.处理excel读写操作。


上代码:

ExcelUtil类,包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下:


package dat.datadeal;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.List;import java.util.Locale;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.poi.hssf.usermodel.HSSFCell;import org.apache.poi.hssf.usermodel.HSSFCellStyle;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hssf.usermodel.HSSFSheet;import org.apache.poi.hssf.usermodel.HSSFWorkbook;import org.apache.poi.ss.usermodel.Cell;import org.apache.poi.ss.usermodel.DateUtil;import org.apache.poi.ss.usermodel.Row;import org.apache.poi.ss.usermodel.Sheet;import org.apache.poi.ss.usermodel.Workbook;import org.apache.poi.xssf.usermodel.XSSFWorkbook;/** *  * @author daT dev.tao@gmail.com *2003,2007版excel读写工具 */public class ExcelUtil{/** * Excel文件读取 * @param filePath * @return String[]存的是行,List存的是列。 * 一个excel一次全部读入内存(Excel超大需要另行处理) */public  List<String[]> readExcel(String filePath) {          List<String[]> dataList = new ArrayList<String[]>();          boolean isExcel2003 = true;          if (isExcel2007(filePath)) {              isExcel2003 = false;          }          File file = new File(filePath);          InputStream is = null;          try {              is = new FileInputStream(file);          } catch (FileNotFoundException ex) {              Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);          }          Workbook wb = null;          try {              wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);          } catch (IOException ex) {              Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);          }          Sheet sheet = wb.getSheetAt(0);          int totalRows = sheet.getPhysicalNumberOfRows();          int totalCells = 0;          if (totalRows >= 1 && sheet.getRow(0) != null) {              totalCells = sheet.getRow(0).getPhysicalNumberOfCells();          }          for (int r = 0; r < totalRows; r++) {              Row row = sheet.getRow(r);              if (row == null) {                  continue;              }              String[] rowList = new String[totalCells];              for (int c = 0; c < totalCells; c++) {                  Cell cell = row.getCell(c);                  String cellValue = "";                  if (cell == null) {                      rowList[c] = (cellValue);                      continue;                  }                  cellValue = ConvertCellStr(cell, cellValue);                  rowList[c] = (cellValue);              }              dataList.add(rowList);          }          return dataList;      }  private String ConvertCellStr(Cell cell, String cellStr) {          switch (cell.getCellType()) {              case Cell.CELL_TYPE_STRING:                  // 读取String                  cellStr = cell.getStringCellValue().toString();                  break;              case Cell.CELL_TYPE_BOOLEAN:                  // 得到Boolean对象的方法                  cellStr = String.valueOf(cell.getBooleanCellValue());                  break;              case Cell.CELL_TYPE_NUMERIC:                  // 先看是否是日期格式                  if (DateUtil.isCellDateFormatted(cell)) {                      // 读取日期格式                      cellStr = formatTime(cell.getDateCellValue().toString());                  } else {                      // 读取数字                      cellStr = String.valueOf(cell.getNumericCellValue());                  }                  break;              case Cell.CELL_TYPE_FORMULA:                  // 读取公式                  cellStr = cell.getCellFormula().toString();                  break;          }          return cellStr;      }    private boolean isExcel2007(String fileName) {          return fileName.matches("^.+\\.(?i)(xlsx)$");   }  private String formatTime(String s) {          SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);          Date date = null;          try {              date = sf.parse(s);          } catch (ParseException ex) {              Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);          }          SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");          String result = sdf.format(date);          return result;      }   /** * Excel写操作,简单起见还是采用内存数据一次写入 * @param filePath 输出文件路径名 * @param dataList 输出文件内容,List<String>行  List列 * @throws IOException */public void writeExcel(String filePath,List<List<String>> dataList) throws IOException{        HSSFWorkbook wb = new HSSFWorkbook();          HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet          // 表格样式          HSSFCellStyle style = wb.createCellStyle();          style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定单元格居中对齐          // // 边框          // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM);          // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM);          // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM);          // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM);          // //设置字体          // HSSFFont f = wb.createFont();          // f.setFontHeightInPoints((short)10);          // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL);          // style.setFont(f);          // //设置列宽          // sheet.setColumnWidth((short)0, (short)9600);          // sheet.setColumnWidth((short)1, (short)4000);          // sheet.setColumnWidth((short)2, (short)8000);          // sheet.setColumnWidth((short)3, (short)8000);            // 在索引0的位置创建第一行            for (int i = 0; i < dataList.size(); i++) {              HSSFRow row = sheet.createRow(i);              List<String> list = dataList.get(i);              for (int j = 0; j < list.size(); j++) {                  HSSFCell cell = row.createCell(j);                  cell.setCellValue(list.get(j));                  cell.setCellStyle(style);              }          }          // 导出文件          FileOutputStream fout = new FileOutputStream(filePath);          wb.write(fout);          fout.close();  }}

DataClean类,包含对html标签,信息中url的的清洗。

package dat.datadeal;import java.io.IOException;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;/** *  * @author daT dev.tao@gmail.com * */public class DataClean {/** * 清洗html标签 * @param inputString * @return */public static String delHtml(String inputString) {        String htmlStr = inputString; // 含html标签的字符串        String textStr = "";        java.util.regex.Pattern p_script;        java.util.regex.Matcher m_script;        java.util.regex.Pattern p_html;        java.util.regex.Matcher m_html;        try {            String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式            String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[/s/S]*?<//script>            p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);            m_script = p_script.matcher(htmlStr);            htmlStr = m_script.replaceAll(""); // 过滤script标签            p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);            m_html = p_html.matcher(htmlStr);            htmlStr = m_html.replaceAll(""); // 过滤html标签            textStr = htmlStr;        } catch (Exception e) {            System.err.println("Html2Text: " + e.getMessage());        }        return textStr;// 返回文本字符串    }/** * 处理掉信息中的url地址 */public static String dealWithUrl(String str){String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*";        Pattern p = Pattern.compile(regEx);           Matcher m = p.matcher(str);        return m.replaceAll("");}public static void main(String[] args) throws IOException{ExcelUtil excelUtil = new ExcelUtil();List<List<String>> writeList = new ArrayList<List<String>>();List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx");for(String[] lineArray:readList){List<String> strList = new ArrayList<String>();for(String str:lineArray){String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str));strList.add(strTmp);//System.out.println(strTmp);}writeList.add(strList);}excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList);      System.out.println("job has finished...........");}}


清洗后数据:


0 0