import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.StringTokenizer;

/*
 * File format: Chinese text that has already been word-segmented; words are
 * separated by spaces and each line holds one paragraph.
 * This class suits text with relatively few words per line, e.g. text that has
 * been split into paragraphs and stored one paragraph per line.
 * It reads one line at a time and returns word windows with a stride of 1;
 * windows never cross paragraph boundaries.
 * Two modes:
 *   1. normalMode: stop when the end of the file is reached
 *   2. againMode:  restart from the beginning when the end of the file is reached
 */
public class WordReader {

    static final int normalMode = 0;
    static final int againMode = 1;
    int currentMode = normalMode;

    BufferedReader br = null;
    ArrayList<String> paraWords = null;
    StringTokenizer tokenizer;
    int currentPara = 0; // number of paragraphs read so far
    int paraPos = 0;     // start index of the current window within the paragraph

    public WordReader(String fileName) throws IOException {
        File file = new File(fileName);
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
        // Mark the start of the stream so reset() can rewind to it in againMode.
        // file.length() is a byte count, which is an upper bound on the number of
        // characters, so it is a safe read-ahead limit.
        br.mark((int) file.length() + 1);
        paraWords = new ArrayList<String>();
    }

    // Read the next paragraph (one line) into paraWords.
    // Returns false only in normalMode when the end of the file is reached.
    private boolean readPara() throws IOException {
        String line = br.readLine();
        if (line == null) { // reached end of file
            if (currentMode == normalMode) {
                return false;
            } else {
                br.reset(); // rewind to the mark and start over
                return readPara();
            }
        }
        paraWords.clear();
        tokenizer = new StringTokenizer(line, " ");
        while (tokenizer.hasMoreTokens()) {
            paraWords.add(tokenizer.nextToken());
        }
        currentPara++;
        paraPos = 0;
        return true;
    }

    // Return the next window of `count` consecutive words, advancing by one word
    // per call, or null when the input is exhausted (normalMode only).
    public String[] getNextWords(int count) throws IOException {
        if (paraPos + count > paraWords.size()) { // window would run past the paragraph: load the next one
            if (readPara())
                return getNextWords(count);
            else
                return null;
        }
        String[] words = new String[count];
        for (int i = 0; i < count; i++) {
            words[i] = paraWords.get(paraPos + i);
        }
        paraPos++;
        return words;
    }

    public static void main(String[] args) throws IOException {
        WordReader wordReader = new WordReader("/home/linger/sources/ParaModel/electronic_seg.txt");
        //wordReader.currentMode = WordReader.againMode;
        while (true) { // the test file has 614005 lines
            String[] words = wordReader.getNextWords(5);
            if (words == null)
                break;
            System.out.println(words[0]);
        }
        System.out.println(wordReader.currentPara);
    }
}
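To make the stride-1 window behavior concrete, here is a minimal usage sketch; the /tmp path and the two sample paragraphs are made up for illustration, and it assumes the WordReader class above is on the classpath. It writes two short pre-segmented lines and prints every 3-word window: each call advances the window by one word, and no window crosses the line boundary.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;

public class WordReaderDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical sample file: two pre-segmented "paragraphs", one per line.
        String path = "/tmp/segmented_demo.txt";
        String sample = "我 爱 北京 天安门\n今天 天气 很 好\n";
        Files.write(Paths.get(path), sample.getBytes(StandardCharsets.UTF_8));

        WordReader reader = new WordReader(path);
        String[] window;
        // With count = 3, each 4-word paragraph yields windows starting at positions
        // 0 and 1; no window mixes words from the two lines.
        while ((window = reader.getNextWords(3)) != null) {
            System.out.println(Arrays.toString(window));
        }
        // Expected output:
        // [我, 爱, 北京]
        // [爱, 北京, 天安门]
        // [今天, 天气, 很]
        // [天气, 很, 好]
    }
}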
It turns out that BufferedReader can also reposition the stream, using mark() and reset().
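As a minimal sketch of that idiom on its own (the file path here is hypothetical): mark() records the current position, with a read-ahead limit given in characters, and reset() rewinds to it as long as no more than that many characters have been read in between.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class MarkResetDemo {
    public static void main(String[] args) throws IOException {
        String path = "/tmp/segmented_demo.txt"; // hypothetical path
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "utf-8"))) {
            // Remember the current position; the argument is the maximum number of
            // characters that may be read before the mark becomes invalid.
            br.mark(1 << 20);
            System.out.println("first pass : " + br.readLine());
            br.reset(); // jump back to the marked position
            System.out.println("second pass: " + br.readLine()); // same line again
        }
    }
}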
Reading Chinese word-segmentation output in Java (2)