import java.io.*; import java.util.*; /** * read words with Trie support, read words in compressed format * * * @author Owen Astrachan * @version $Id$ */ public class FastJoggleReader { public final static int BUFSIZ = 82000; public FastJoggleReader() { // nothing to create } /** * read words from a compressed file (non-standard compression) * currently the filename is hardwired, but it's an easy * change to make the filename a parameter * * @param trie is the Trie to read words into */ public void ReadWords(Trie trie) { String s; StringBuffer sb; byte buffer[] = new byte[BUFSIZ]; FileInputStream f = null; try { f = new FileInputStream("bogdict"); int offset = 0; int bytesRead; bytesRead = f.read(buffer,0,BUFSIZ); // read entire file System.out.println("read = " + bytesRead); // report it ProcessBuffer(buffer,bytesRead,trie); // process it } catch (IOException e) { e.printStackTrace(); System.err.println("error reading dictionary"); System.err.println(e.getMessage()); } finally { if (f != null) { try { f.close(); } catch (IOException e) { // nothing here } } } } /** * process the compressed words by storing into a trie * * @param buffer the compressed characters * @param size the size of the buffer * @param trie the Trie to store words into * */ void ProcessBuffer(byte buffer[], int size, Trie trie) { char sb[] = new char[20]; int k; int count = 0; int numWords = 0; char ch; int index; for(index = 0; index < size; index++) { ch = (char) buffer[index]; if ('a' <= ch && ch <= 'z') // legal char, store it { sb[count++] = ch; } else // word ended, process { if (count > 0) { numWords++; trie.addCString(sb); // add to trie if (numWords % 1000 == 0) { System.out.println("processed " + numWords + " words "); } } // decrease effective size of sb by ch characters // this treats ch (a control char) as a number // e.g., ^D (control-D) is 4 while (count >= 0 && count != (int) ch) { sb[count--] = '\0'; } } } } // find length of common prefix of a and b private int PrefixLength(String a, String b) { int k; int size = a.length() < b.length() ? a.length() : b.length(); int commonCount = 0; for(k=0; k < size; k++) { if (a.charAt(k) == b.charAt(k)) commonCount++; else return commonCount; } return commonCount; } /** * write words to a compressed file (e.g., for subsequent reading) * * @param trie is the trie that stores the words * @param filename the name of the file storing compressed words */ public void WriteWords(Trie trie, String filename) { HoldWords hw = new HoldWords(); trie.apply(hw); FileOutputStream st = null; DataOutputStream f = null; try { st = new FileOutputStream(filename); f = new DataOutputStream(st); int k; String last = ""; int prefix; byte buffer[] = new byte[30]; // store chars to write for(k=0; k < hw.words.size(); k++) { String s = (String) hw.words.elementAt(k); prefix = PrefixLength(last,s); last = s; f.writeByte(prefix); s.getBytes(prefix,s.length(),buffer,0); f.write(buffer,0,s.length()-prefix); if (k % 1000 == 0) { System.out.println("wrote " + k + " words "); } } f.writeByte(10); // write a linefeed } catch (IOException e) { System.out.println("error opening " + filename); } finally { try { f.close(); st.close(); } catch (IOException e){ } } } } /** * a class to store all words in a trie in a vector * *@see Trie */ class HoldWords implements Recorder { public HoldWords() { words = new Vector(); } public void record(Object o) { words.addElement(o); } public void report() { } public Vector words; }