利用Lucene对大文件进行预处理（可运行）

package comTwo;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.util.HashMap;

/**
 * Pre-processes a large text file before indexing: full-width punctuation and
 * digits are converted to their half-width (ASCII) counterparts, and the
 * normalized text is then split into a series of small chunk files.
 *
 * <p>All file I/O uses {@link FileReader}/{@link FileWriter} without an explicit
 * charset, i.e. the platform default encoding applies.
 */
public class WanZheng {

    /** Size, in bytes of the platform default encoding, at which buffered text is flushed to a chunk file. */
    private static final int MAX_CHUNK_BYTES = 10240;

    /** Input file used by {@code main} when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT_FILE = "E:\\Lucene项目\\钢铁是怎样练成的.txt";

    /** Output directory (with trailing separator) used by {@code main} when no argument is supplied. */
    private static final String DEFAULT_OUTPUT_DIR = "E:\\Lucene项目\\目标文件\\";

    /** Full-width character to half-width character table, built once instead of on every call. */
    private static final HashMap<Character, Character> FULL_TO_HALF =
            new HashMap<Character, Character>();

    static {
        // Punctuation and brackets.
        FULL_TO_HALF.put('，', ',');
        FULL_TO_HALF.put('。', '.');
        FULL_TO_HALF.put('〈', '<');
        FULL_TO_HALF.put('〉', '>');
        FULL_TO_HALF.put('｜', '|');
        FULL_TO_HALF.put('《', '<');
        FULL_TO_HALF.put('》', '>');
        FULL_TO_HALF.put('［', '[');
        FULL_TO_HALF.put('］', ']');
        FULL_TO_HALF.put('？', '?');
        FULL_TO_HALF.put('＂', '"');
        FULL_TO_HALF.put('：', ':');
        FULL_TO_HALF.put('﹑', ',');
        FULL_TO_HALF.put('（', '(');
        FULL_TO_HALF.put('）', ')');
        FULL_TO_HALF.put('【', '[');
        FULL_TO_HALF.put('】', ']');
        FULL_TO_HALF.put('－', '-');
        FULL_TO_HALF.put('￣', '~');
        FULL_TO_HALF.put('！', '!');
        FULL_TO_HALF.put('｀', '`');
        // Digits. The full-width zero was missing from the original table, which
        // left numbers such as "２０２４" only partially converted.
        FULL_TO_HALF.put('０', '0');
        FULL_TO_HALF.put('１', '1');
        FULL_TO_HALF.put('２', '2');
        FULL_TO_HALF.put('３', '3');
        FULL_TO_HALF.put('４', '4');
        FULL_TO_HALF.put('５', '5');
        FULL_TO_HALF.put('６', '6');
        FULL_TO_HALF.put('７', '7');
        FULL_TO_HALF.put('８', '8');
        FULL_TO_HALF.put('９', '9');
    }

    /**
     * Replaces every full-width character listed in the conversion table with
     * its half-width equivalent; all other characters are copied unchanged.
     *
     * @param line the text to normalize; must not be {@code null}
     * @return a string of the same length with the mapped characters replaced
     * @throws NullPointerException if {@code line} is {@code null}
     */
    public static String replace(String line) {
        // Every mapping is one char to one char, so a single pass is enough
        // (the original re-scanned the whole string for each position).
        StringBuilder converted = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char current = line.charAt(i);
            Character halfWidth = FULL_TO_HALF.get(current);
            converted.append(halfWidth != null ? halfWidth.charValue() : current);
        }
        return converted.toString();
    }

    /**
     * Copies {@code file} to {@code destFile} line by line, passing each line
     * through {@link #replace(String)} and terminating it with the platform
     * line separator.
     *
     * <p>I/O errors are reported with {@code printStackTrace()} and do not
     * propagate; in that case the destination may be missing or incomplete.
     *
     * @param file     the source text file
     * @param destFile path of the file to write the normalized text to
     * @return a {@link File} for {@code destFile}, returned even if an error occurred
     */
    public static File charactoProcess(File file, String destFile) {
        // try-with-resources closes both streams even when reading or writing
        // fails; the original leaked them on any exception. The writer is still
        // opened first, as before.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(destFile));
             BufferedReader reader = new BufferedReader(new FileReader(file))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                writer.write(replace(line));
                writer.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return new File(destFile);
    }

    /**
     * Splits {@code file} into consecutive chunk files named
     * {@code outputpath + "切分后" + index + ".txt"}, with {@code index} starting at 0.
     *
     * <p>Lines are accumulated, each followed by {@code "\r\n"}, until the
     * accumulated text reaches {@value #MAX_CHUNK_BYTES} bytes; the text is then
     * written out as one chunk. Lines are never broken, so a chunk can exceed
     * the threshold by up to one line. Whatever remains after the last line is
     * written as a final chunk. An empty input still yields a single empty
     * chunk, but no empty chunk is emitted after a flush that consumed the
     * last line.
     *
     * <p>I/O errors are reported with {@code printStackTrace()} and do not propagate.
     *
     * @param file       the file to split
     * @param outputpath prefix for the chunk files; it is concatenated as-is, so a
     *                   directory must include its trailing separator
     */
    public static void splitToSmallFils(File file, String outputpath) {
        // The original never closed this reader.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            int chunkIndex = 0;
            StringBuilder buffer = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                buffer.append(line).append("\r\n");
                String pending = buffer.toString();
                if (pending.getBytes().length >= MAX_CHUNK_BYTES) {
                    writeChunk(outputpath, chunkIndex, pending);
                    chunkIndex++;
                    buffer.setLength(0);
                }
            }
            // Flush the remainder. Skip it when it is empty and chunks already
            // exist, otherwise a useless zero-length file would be produced.
            if (buffer.length() > 0 || chunkIndex == 0) {
                writeChunk(outputpath, chunkIndex, buffer.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes one chunk file and closes it even if the write fails. */
    private static void writeChunk(String outputpath, int chunkIndex, String content)
            throws IOException {
        String chunkName = outputpath + "切分后" + chunkIndex + ".txt";
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(chunkName))) {
            writer.write(content);
        }
    }

    /**
     * Runs the whole pipeline: normalizes {@code file} into
     * {@code outputDir + "output.all"} and then splits that file into chunks
     * placed under {@code outputDir}.
     *
     * @param file      the original file to pre-process
     * @param outputDir destination prefix; it is concatenated as-is, so it must
     *                  end with a path separator
     */
    public static void preprocess(File file, String outputDir) {
        try {
            File normalized = charactoProcess(file, outputDir + "output.all");
            splitToSmallFils(normalized, outputDir);
        } catch (Exception e) {
            // Both steps already handle IOException themselves; this keeps the
            // original best-effort behaviour for unexpected runtime failures.
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional overrides: {@code args[0]} is the input file and
     *             {@code args[1]} the output directory; the built-in defaults
     *             are used for any argument that is absent
     */
    public static void main(String[] args) {
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;

        String outputDir = DEFAULT_OUTPUT_DIR;
        if (args.length > 1) {
            outputDir = args[1];
            // File names are built by plain concatenation, so make sure a
            // user-supplied directory ends with a separator.
            if (!outputDir.endsWith("/") && !outputDir.endsWith(File.separator)) {
                outputDir = outputDir + File.separator;
            }
        }

        // Create the output directory if it does not exist yet.
        File outputFolder = new File(outputDir);
        if (!outputFolder.exists()) {
            outputFolder.mkdirs();
        }

        // preprocess is static; no instance is needed to call it.
        preprocess(new File(inputFile), outputDir);
    }

}

时间： 2024-10-05 04:09:19

利用Lucene对大文件进行预处理（可运行）

利用Lucene对大文件进行预处理（可运行）的相关文章

利用Lucene将大文档切割成多个小文档，（可运行）

linux下利用split分割大文件

大文件，5亿整数，怎么排？

对大文件排序

WCF利用Stream上传大文件

利用Lucene和 XPDF 来处理pdf文件

[开源应用]利用HTML5+resumableJs拖拽上传大文件

利用文件打开方式with open('文件名',方式) as 变量名做一个简单的复制（排除大文件bug）

算法初级面试题05——哈希函数/表、生成多个哈希函数、哈希扩容、利用哈希分流找出大文件的重复内容、设计RandomPool结构、布隆过滤器、一致性哈希、并查集、岛问题