Trie树又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希表高。
这里的需求是,给如一组词汇,北京人,北京,武汉,武汉话等等。能够统计“武汉”这个词的词频,输入"武"的时候,能够得到以“武”开头的所有词。实现的代码如下:
package com.dong.util; import java.util.ArrayList; import java.util.List; public class Trie { // 根节点 private final TrieNode root = new TrieNode(‘ ‘); // 向trie树中插入一个词 public void insert(String word) { if (word == null || word.length() == 0) { return; } String left = word; TrieNode cur = root; while (left.length() > 0) { char toInert = left.charAt(0); TrieNode next = null; // 如果那个节点不存在的话,将当前节点插入 if (containCharNode(cur.getChild(), toInert) == null) { next = new TrieNode(toInert); cur.getChild().add(next); } else { next = containCharNode(cur.getChild(), toInert); } cur = next; left = left.substring(1); if (left.length() == 0) { cur.setFreq(cur.getFreq() + 1); } } } // 通过前缀查找词,返回包括前缀的所有词。 public List<String> search(String prefix) { List<String> retList = new ArrayList<String>(); List<String> tempList = new ArrayList<String>(); if (prefix == null || prefix.length() == 0) { return null; } String left = prefix; TrieNode cur = root; while (left.length() > 0) { char toFind = left.charAt(0); TrieNode next = null; // 如果那个节点不存在的话,将当前节点插入 if (containCharNode(cur.getChild(), toFind) == null) { return null; } else { next = containCharNode(cur.getChild(), toFind); if (left.length() == 1) { cur = next; break; } } cur = next; left = left.substring(1); } dfs(cur, new ArrayList<Character>(), tempList); for (String s : tempList) { retList.add(prefix + s); } if (getFreq(prefix) > 0) { retList.add(prefix); } return retList; } // 深度搜索一个trieNode节点下的所有的词 private void dfs(TrieNode root, List<Character> stack, List<String> retList) { if (root.getChild().size() == 0) { StringBuffer sb = new StringBuffer(); for (char c : stack) { sb.append(c); } retList.add(sb.toString()); } else { for (TrieNode r : root.getChild()) { stack.add(r.getVal()); dfs(r, stack, retList); stack.remove(stack.size() - 1); } } } // 查看一个节点的的子节点是否包含一个字符 public TrieNode containCharNode(List<TrieNode> child, char c) { TrieNode ret = null; for (TrieNode temp : child) { if (temp.getVal() == c) { ret = temp; } } return ret; } // 得到一个词的词频 public int getFreq(String word) { if (word == null || word.length() == 0) { return 0; } String left = word; TrieNode cur = root; while (left.length() > 0) { char toFind = left.charAt(0); TrieNode next = null; // 如果不存在此节点,返回0 if (containCharNode(cur.getChild(), toFind) == null) { return 0; } else { next = containCharNode(cur.getChild(), toFind); if (left.length() == 1) { cur = next; break; } } cur = next; left = left.substring(1); } return cur.getFreq(); } public static String fill(String prefix, ArrayList<Character> stack) { ArrayList<String> retList = new ArrayList<String>(); StringBuffer sb = new StringBuffer(); sb.append(prefix); for (char c : stack) { sb.append(c); } return sb.toString(); } public static void main(String[] args) { Trie t = new Trie(); String[] wordList = { "我晕", "我晕啊", "我不信啊", "我信了", "也是", "这不对啊", "我晕" }; for (String word : wordList) { t.insert(word); } System.out.println(t.search("我晕")); } } class TrieNode { // 节点下存放的字符 private char val; // 一个节点下面的子节点 private List<TrieNode> child; // 该词的词频 private int freq; public TrieNode(char val) { child = new ArrayList(); freq = 0; this.val = val; } public char getVal() { return val; } public void setVal(char val) { this.val = val; } public List<TrieNode> getChild() { return child; } public void setChild(List<TrieNode> child) { this.child = child; } public int getFreq() { return freq; } public void setFreq(int freq) { this.freq = freq; } }
时间: 2024-10-15 19:33:30