#include #include #include #include #include #include "randgen.h" #include "vector.h" /* * * author: Owen Astrachan * date: 1/9/93 * revised/converted to C++: 4/19/96 * * Dept of Computer Science * Duke University * Durham, NC 27708 * ola@cs.duke.edu * http://www.cs.duke.edu * * * this program uses a trie data structure to store * n-grams used to "train" the program so that it can * generate random text. See the program 'babble.c' * for a complete description and more comments * * usage: writeall [-o #] [-c #] * * where the file is the training file, * -o # specifies order n-gram to use * -c # specifies # chars to generate * * * comments/suggestions to ola@cs.duke.edu * * * */ const int ALPH_SIZE = 129; // # of chars in alphabet const char SPACE = ' '; const int MAX_LEVEL = 10; // max is 10-gram const int MAX_CHARS = 1024; // max # chars generated // TNode is the trie node used in the program // note that a C-style array is used, it's an array of pointers // each trie node stores pointers to children, and a count of how // many n-grams there are below the node. This count is the // sum of the counts of all children. This makes it possible to // pick a weighted random letter, each non-NULL branch contributes // its count weight to the total count struct TNode { int count; // weight # of children below TNode * list[ALPH_SIZE]; // pointers to children TNode(); }; TNode::TNode() // postcondition: all fields zero'd { int k; count = 0; for(k=0; k < ALPH_SIZE; k++) { list[k] = 0; } } class Trie { public: Trie(int level, int chars); // level n-gram, chars # random chars ~Trie(); // destroy trie nodes void Read(istream & is); // read stream, initialize trie void RandomWrite(); // generate random text int NumNodes() const; // # of nodes in trie private: void AddToTrie(const string & s); // add string to trie int RandomChar(TNode * t); // generate random char void ShiftLeft(int a[]); // helper, shift elts left void DoDelete (TNode * t); // helper, deletes trie node int myNumNodes; // # of nodes in trie int myLevel; // order of n-gram int myNumChars; // # of random chars to create TNode * myRoot; // root of trie }; Trie::Trie(int level, int chars) : myNumNodes(0), myLevel(level), myNumChars(chars), myRoot(new TNode) // postcondition: all data fields initialized { } void Trie::DoDelete(TNode * t) { if (t != 0) { int k; for(k=0; k < ALPH_SIZE; k++) { DoDelete(t->list[k]); } delete t; } } Trie::~Trie() { DoDelete(myRoot); } void Trie::Read(istream & is) // postcondition: is has been read, all chars have been stored in // trie as myLevel-gram sequences { static const int SIZE = 1024; char buffer[SIZE+1]; // use C-style string for reading chars char sub[SIZE+1]; // substrings stored here int k,buffIndex,charsInBuf; buffer[0] = SPACE; // first char on line follows a space buffIndex = 1; // where in buffer reading starts while (true) { is.read(buffer+buffIndex,SIZE-buffIndex); // gcount == # of chars actually read, // charsInBuf = # of chars stored in buffer needing processing charsInBuf = buffIndex + is.gcount(); // process all chars except last 'myLevel-1' chars // copy each 'myLevel' sequence of chars into string sub // and then add string sub to the trie for(k=0; k <= charsInBuf - myLevel; k++) { strncpy(sub,buffer+k,myLevel); sub[myLevel] = '\0'; // sub now properly terminated AddToTrie(sub); } // copy last myLevel-1 chars to front of buffer for // processing next time through loop // ******* LOOP EXIT if (charsInBuf < myLevel) break; // all reading done // ******* LOOP EXIT strncpy(buffer, buffer + (charsInBuf-myLevel+1),myLevel-1); buffIndex = myLevel-1; } } int Trie::NumNodes() const // postcondition: returns # of nodes in trie { return myNumNodes; } void Trie::RandomWrite() // postcondition: random text generated, printed { string randomText = ""; // random text stored here int rc[MAX_LEVEL]; // store n-gram chars/path here TNode * t = myRoot; int k,m; // initialize string with number of chars = myLevel for(k=0; k < myLevel; k++) { if (t == 0 || t->count == 0) { break; } rc[k] = RandomChar(t); // this is path in trie // line below is equivalent to: line[k] = char(rc[k]) randomText += static_cast(rc[k]); t = t->list[rc[k]]; // move to next node } for(k=0; k < myNumChars; k++) { ShiftLeft(rc); // "shift" path once t = myRoot; // restart at root for(m=0; m < myLevel-1; m++) // follow trie { if (t == 0) // should NOT happen, be safe { cout << randomText; return; } t = t->list[rc[m]]; } rc[myLevel-1] = RandomChar(t); randomText += static_cast(rc[myLevel-1]); } cout << randomText; } void Trie::AddToTrie(const string & s) { int len = s.length(); int j,k,index; TNode * t = myRoot; // start at root t->count++; // count string added below t // cout << s << endl; for(j=0; j < len; j++) // add each char, traversing trie { index = s[j]; if (t->list[index] == 0) { t->list[index] = new TNode; myNumNodes++; } t->list[index]->count++; // add char below t t = t->list[index]; // move to new node } } int Trie::RandomChar(TNode * t) // postcondition: returns random char based on trie rooted at t { RandGen rando; int randCount = rando.RandInt(1,t->count); int sum = 0; int k; for(k=0; k < ALPH_SIZE; k++) // sum # entries in each subtrie { if (t->list[k] != 0) { sum += t->list[k]->count; } if (sum >= randCount) return k; } return EOF; // should never reach here } void Trie::ShiftLeft(int a[]) // postcondition: first 'myLevel' entries of a shifted left one position { int k; for(k=0; k < myLevel; k++) { a[k] = a[k+1]; } } int main(int argc, char *argv[]) { int level = MAX_LEVEL; int chars = MAX_CHARS; int k; string filename = ""; ifstream input; if (argc < 2) { cerr << "usage: " << argv[0] << "[-o #][-c #] " << endl; exit(1); } k = 1; while (k < argc) { if (argv[k][0] == '-') // argument begins with '-' { switch (argv[k][1]) { case 'o': level = atoi(argv[k+1]); if (level <= 0 || MAX_LEVEL < level) { cerr << "order must be between 1 and " << MAX_LEVEL << endl; exit(1); } k += 2; break; case 'c': chars = atoi(argv[k+1]); k += 2; break; default: cerr << "unknown option " << argv[k] << endl; k++; } } else { filename = argv[k]; k++; } } input.open(filename.c_str()); if (input.fail()) { cerr << "could not open file " << argv[k] << " for reading" << endl; exit(1); } Trie trie(level,chars); trie.Read(input); cout << "number of nodes = " << trie.NumNodes() << endl; trie.RandomWrite(); cout << endl << endl; return 0; }