import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.util.ArrayList;

/**
 * Sliding-window reader over a pre-segmented UTF-8 text file.
 *
 * <p>File format: text that has already been split into words, where words are
 * separated by one or more whitespace bytes (space, tab, CR, LF). Reading stops
 * at end of file.
 *
 * <p>The reader never buffers a whole line; it buffers a bounded batch of words
 * (the "sentence", at most {@link #senSize} new words per refill), so it is
 * suitable for files that contain extremely long lines.
 *
 * <p>Words are delimited at the byte level. This is safe for UTF-8 because the
 * four separator bytes are ASCII and never occur inside a multi-byte sequence.
 *
 * <p>Instances are not thread-safe. Call {@link #close()} when done.
 */
public class WordReader implements Closeable {

    /** Encoding used to decode word bytes. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Size of the raw byte buffer used to avoid one system call per byte. */
    private static final int IO_BUFFER_SIZE = 8192;

    /** Underlying file, opened read-only by the constructor. */
    RandomAccessFile raf = null;

    /** Current batch of words; windows are served from index {@link #senPos} onward. */
    ArrayList<String> sentence = null;

    /** Maximum number of new words appended to {@link #sentence} per refill. */
    int senSize = 1000;

    /** Index in {@link #sentence} of the first word of the next window. */
    int senPos = 0;

    /** Raw bytes read ahead from {@link #raf}. */
    private final byte[] ioBuffer = new byte[IO_BUFFER_SIZE];

    /** Next unread index in {@link #ioBuffer}. */
    private int ioPos = 0;

    /** Number of valid bytes in {@link #ioBuffer}. */
    private int ioLimit = 0;

    /** Scratch space that accumulates the bytes of the word being read. */
    private final ByteArrayOutputStream wordBytes = new ByteArrayOutputStream();

    /**
     * Opens {@code fileName} for reading.
     *
     * @param fileName path of the segmented text file
     * @throws IOException if the file cannot be opened
     */
    public WordReader(String fileName) throws IOException {
        raf = new RandomAccessFile(new File(fileName), "r");
        sentence = new ArrayList<String>();
    }

    /**
     * Returns the next window of {@code count} consecutive words and advances
     * the window start by one word, so successive calls yield overlapping
     * windows: words 0..count-1, then 1..count, and so on.
     *
     * <p>Windows are continuous across internal batch refills, and the last
     * words of the file are served as well.
     *
     * @param count number of words per window; must be at least 1
     * @return an array of exactly {@code count} words, or {@code null} once
     *         fewer than {@code count} words remain in the file
     * @throws IllegalArgumentException if {@code count} is less than 1
     * @throws IOException if reading the file fails
     */
    public String[] getNextWords(int count) throws IOException {
        if (count < 1) {
            throw new IllegalArgumentException("count must be at least 1: " + count);
        }
        // Refill until a full window is available. A loop (rather than a single
        // refill) also handles count > senSize.
        while (sentence.size() - senPos < count) {
            if (!readSentence()) {
                return null; // end of file: not enough words left for a window
            }
        }
        String[] words = new String[count];
        for (int i = 0; i < count; i++) {
            words[i] = sentence.get(senPos + i);
        }
        senPos++;
        return words;
    }

    /**
     * Releases the underlying file.
     *
     * @throws IOException if closing the file fails
     */
    public void close() throws IOException {
        raf.close();
    }

    /**
     * Refills the word batch: drops the words that every future window has
     * already moved past, keeps the unconsumed tail so windows can span the
     * batch boundary, then appends up to {@link #senSize} words from the file.
     *
     * @return {@code true} if at least one new word was appended,
     *         {@code false} if the file is exhausted
     * @throws IOException if reading the file fails
     */
    private boolean readSentence() throws IOException {
        int consumed = Math.min(senPos, sentence.size());
        if (consumed > 0) {
            sentence.subList(0, consumed).clear();
        }
        senPos = 0;

        // Guard against a non-positive senSize, which would otherwise make
        // every refill a no-op.
        int limit = Math.max(1, senSize);
        int added = 0;
        while (added < limit) {
            String word = readWord();
            if (word == null) {
                break;
            }
            sentence.add(word);
            added++;
        }
        return added > 0;
    }

    /**
     * Reads the next word, skipping any separators in front of it.
     *
     * @return the decoded word, or {@code null} if only separators (or nothing)
     *         remain before end of file
     * @throws IOException if reading the file fails
     */
    private String readWord() throws IOException {
        int b = nextByte();
        while (isSeparator(b)) {
            b = nextByte();
        }
        if (b == -1) {
            return null;
        }
        wordBytes.reset();
        // A word ends at the next separator or at end of file, so a final word
        // without trailing whitespace is still returned.
        while (b != -1 && !isSeparator(b)) {
            wordBytes.write(b);
            b = nextByte();
        }
        return new String(wordBytes.toByteArray(), UTF_8);
    }

    /** Returns whether {@code b} is one of the word-separating bytes. */
    private static boolean isSeparator(int b) {
        return b == ' ' || b == '\n' || b == '\r' || b == '\t';
    }

    /**
     * Returns the next byte of the file as a value in 0..255, or -1 at end of
     * file. Bytes are fetched from the file in chunks.
     */
    private int nextByte() throws IOException {
        if (ioPos >= ioLimit) {
            int n = raf.read(ioBuffer);
            ioPos = 0;
            if (n <= 0) {
                ioLimit = 0;
                return -1;
            }
            ioLimit = n;
        }
        return ioBuffer[ioPos++] & 0xFF;
    }

    /**
     * Smoke test: opens a file and reads the first batch of words.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "/home/linger/sources/resultbig.txt";
        WordReader wr = new WordReader(path);
        try {
            wr.readSentence();
        } finally {
            wr.close();
        }
    }
}
本文作者:linger
本文链接:http://blog.csdn.net/lingerlanlan/article/details/38337483
java读取中文分词工具(三),布布扣,bubuko.com
时间: 2024-10-10 01:32:34