功能说明:从hdfs读取excel文件,经过poi转换成txt文本文件并输出成hdfs文件
一、引入jar包
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.14</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.14</version> </dependency>
二、代码实现
package operator.excel;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

/**
 * Command-line entry point that reads an Excel file from the configured Hadoop
 * file system, converts it to text with {@link ExcelParser}, and writes the
 * resulting lines to a text file on the same file system.
 *
 * <p>Author: mashiwei, 2017/6/30.
 */
public class ExcelInputFormat {

    /** Input path used when no first argument is supplied. */
    private static final String DEFAULT_INPUT = "/kettle/excel/test.xls";

    /** Output path used when no second argument is supplied. */
    private static final String DEFAULT_OUTPUT = "/kettle/excel/test.txt";

    /**
     * Converts one Excel file to a UTF-8 text file, one line per parsed row.
     *
     * @param args optional: {@code args[0]} is the input Excel path and
     *             {@code args[1]} is the output text path; missing arguments
     *             fall back to the built-in default paths
     * @throws IOException if the input cannot be opened or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration config = new Configuration();
        FileSystem fileSystem = FileSystem.get(config);

        // Parse before creating the output so that a failed read does not
        // leave an empty output file behind.
        String[] lines;
        try (InputStream inputStream = fileSystem.open(new Path(input))) {
            // The File is passed only so the parser can inspect the file name's
            // extension; the data itself comes from the stream.
            lines = ExcelParser.parseExcelData(inputStream, new File(input));
        }

        try (FSDataOutputStream out = fileSystem.create(new Path(output))) {
            for (String line : lines) {
                System.out.println("------" + line);
                // Encode explicitly as UTF-8. writeBytes() keeps only the low
                // byte of each char (corrupting non-ASCII text) and writeUTF()
                // prepends a two-byte length, so neither produces plain text.
                out.write((line + "\n").getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}
package operator.excel;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Converts the first sheet of an Excel workbook into comma-separated text lines.
 *
 * <p>Author: mashiwei, 2017/6/30.
 */
public class ExcelParser {

    private static final Log logger = LogFactory.getLog(ExcelParser.class);

    /** Separator placed between cell values within one output line. */
    private static final String CELL_SEPARATOR = ",";

    /**
     * Verifies that a file reference is present and has an Excel extension.
     * Only the name is inspected; the file is not opened or checked for existence.
     *
     * @param file the file whose name is checked
     * @throws FileNotFoundException if {@code file} is {@code null}
     * @throws IOException if the name ends with neither {@code xls} nor {@code xlsx}
     *                     (compared case-insensitively)
     */
    public static void checkFile(File file) throws IOException {
        if (null == file) {
            logger.error("文件不存在!");
            throw new FileNotFoundException("文件不存在!");
        }
        String fileName = file.getAbsolutePath();
        if (!isXls(fileName) && !isXlsx(fileName)) {
            logger.error(fileName + "不是excel文件");
            throw new IOException(fileName + "不是excel文件");
        }
    }

    /**
     * Reads the first sheet of a workbook and renders each row as one line of
     * comma-separated cell values.
     *
     * <p>Boolean, numeric and string cells are emitted; cells of any other type
     * are skipped. A row that yields no values produces an empty line. If reading
     * the workbook fails with an {@link IOException}, the error is logged and the
     * lines collected so far (possibly none) are returned.
     *
     * @param is   stream supplying the workbook bytes
     * @param file used only for its name, which selects the workbook format
     * @return one string per row of the first sheet
     * @throws IllegalArgumentException if {@code file} is {@code null} or does not
     *                                  have an Excel extension
     */
    public static String[] parseExcelData(InputStream is, File file) {
        // Validate before touching the file so a null reference is reported
        // clearly instead of surfacing later as a NullPointerException.
        try {
            checkFile(file);
        } catch (IOException e) {
            throw new IllegalArgumentException("Cannot parse Excel data: " + e.getMessage(), e);
        }
        String fileName = file.getAbsolutePath();

        List<String> resultList = new ArrayList<String>();
        try (Workbook workbook = openWorkbook(is, fileName)) {
            Sheet sheet = workbook.getSheetAt(0);
            for (Row row : sheet) {
                resultList.add(rowToLine(row));
            }
        } catch (IOException e) {
            logger.error("Failed to read Excel workbook " + fileName, e);
        }
        return resultList.toArray(new String[0]);
    }

    /** Returns whether the name has the Excel 97-2003 extension, ignoring case. */
    private static boolean isXls(String fileName) {
        return fileName.toLowerCase(Locale.ROOT).endsWith("xls");
    }

    /** Returns whether the name has the Excel 2007+ extension, ignoring case. */
    private static boolean isXlsx(String fileName) {
        return fileName.toLowerCase(Locale.ROOT).endsWith("xlsx");
    }

    /** Opens the workbook implementation matching the (already validated) file name. */
    private static Workbook openWorkbook(InputStream is, String fileName) throws IOException {
        if (isXlsx(fileName)) {
            return new XSSFWorkbook(is);
        }
        return new HSSFWorkbook(is);
    }

    /** Joins the supported cell values of one row with the cell separator. */
    private static String rowToLine(Row row) {
        StringBuilder line = new StringBuilder();
        for (Cell cell : row) {
            switch (cell.getCellType()) {
                case Cell.CELL_TYPE_BOOLEAN:
                    line.append(cell.getBooleanCellValue()).append(CELL_SEPARATOR);
                    break;
                case Cell.CELL_TYPE_NUMERIC:
                    line.append(cell.getNumericCellValue()).append(CELL_SEPARATOR);
                    break;
                case Cell.CELL_TYPE_STRING:
                    line.append(cell.getStringCellValue()).append(CELL_SEPARATOR);
                    break;
                default:
                    // Other cell types are not rendered.
                    break;
            }
        }
        // Drop the trailing separator only when something was appended; a row
        // with no rendered cells stays an empty line.
        if (line.length() > 0) {
            line.setLength(line.length() - CELL_SEPARATOR.length());
        }
        return line.toString();
    }
}
时间: 2024-10-10 00:00:30