-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmarkov3.l
478 lines (450 loc) · 11.5 KB
/
markov3.l
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
%{
/*
* Copyright (c) 1986, 1987 by Joe Buck
*
* Permission is granted to use, redistribute, or modify this program,
* as long as you don't pretend you wrote it. Send improvements or
* bug reports to {ihnp4,hplabs,ames,sun}!oliveb!epimass!jbuck.
*
* The program generates simulated Usenet articles, given Usenet articles
* as input.
*
* This program constructs a table of frequencies for a token appearing,
* given the two preceding tokens. A "token" is a sequence of non-blank
* characters. An entirely blank line is also treated as a token, as is
* the beginning and end of an article.
*
* The program is designed to process news articles, rejecting text from
* the header, signature, and included text, together with cruft inserted
* by rn and notes. A paragraph of included text is treated like a token.
*
* After the table is built (and it can be big), articles are generated
* on the standard output.
*/
#ifndef lint
static char *sccs_id = "@(#)markov3.l 1.1 3/6/87 epimass!jbuck";
#endif
#include <sys/types.h> /* for time_t */
int in_included_text = 0;
%}
%Start HDR BODY SIG
%%
<HDR>^[^ \t]+:.*\n ; /* Header line, e.g. "From: [email protected]" */
<HDR>^[ \t]+[^ \t].*\n ; /* Continuation of header line */
<HDR>^[ \t]*$ BEGIN BODY;
<BODY>^"-- "$ BEGIN SIG;
<BODY>^[><)|#}].*\n { /* 50% rule gets people to change ">"
to other characters; this gets most of them */
if (!in_included_text) {
in_included_text = 1;
process_token ("\n> ...\n\n");
}
}
<BODY>"]".*\n { /* should have been included in the above. My
lex generates bad C code if I say [[><...]
even though ed(1) says that's a valid regular
expression. */
if (!in_included_text) {
in_included_text = 1;
process_token ("\n> ...\n\n");
}
}
<BODY>^"In article".*\n ; /* Reject rn crud */
<BODY>^"/* Written".*"*/"\n ; /* Also NOTES crud */
<BODY>^"/* End of text from".*"*/"\n ; /* NOTES */
<BODY>^"/* ----------".*"----------*/"\n ; /* NOTES */
<BODY>[ \t]+ ; /* Skip white space */
<BODY>\n[ \t\n]*\n { process_token ("\n"); /* Paragraph break */}
<BODY>[^ \t\n]+ { in_included_text = 0; process_token (yytext); }
<HDR>. ; /* Eat anything that escaped */
<HDR>\n ;
<BODY>\n ;
<SIG>. ;
<SIG>\n ;
%%
void perror(), exit();
char *strcpy(), *malloc();
extern int optind;
extern char *optarg;
/*
* hashtab is a hash table storing all the tokens we encounter.
*/
struct htentry {
char *htext;
struct htentry *hlink;
};
#define HSIZE 3557 /* Should be prime */
#define Fprintf (void)fprintf
#define Printf (void)printf
struct htentry hashtab[HSIZE];
/*
* node and succnode are portions of the big structure we're going to build.
* node represents something like ("was", "a") in a binary tree.
* a linked list of succnodes contain tokens that may follow ("was", "a")
*/
struct node {
char *text;
char *text2;
int ocount;
struct node *lc, *rc;
struct succnode *succ;
};
struct succnode {
struct node *scnod;
int count;
struct succnode *link;
};
struct node *prev_code = NULL;
char *prev_token = NULL, **Argv;
int init_state = HDR;
int verbose = 0;
struct node *root = NULL, *tknptr;
struct succnode *start = NULL;
int n_pairs = 0, n_tokens = 0, n_files = 0, n_total = 0;
struct node *insert_token();
char *savetoken();
process_token (txt)
char *txt;
{
struct node *code;
char *token = savetoken (txt);
/* We have a new token. Say the previous two tokens were "one" "way"
* and the current token is "to". Then prev_code points to a node
* for ("one", "way") and token is "to". This function adds ("way", "to") as a
* successor to ("one","way") and makes prev_code point to ("way","to").
*/
code = insert_token (prev_token, token);
insert_pair (prev_code, code);
prev_code = code;
prev_token = token;
return;
}
/*
* here it is, the main function.
*/
main (argc, argv)
int argc;
char **argv;
{
int i, c, n_articles = 10, sflag = 0;
char *dumpfile = NULL;
extern int optind;
extern char *optarg;
while ((c = getopt (argc, argv, "pxvn:d:s:")) != EOF) {
switch (c) {
case 'v':
verbose = 1;
break;
case 'p': /* Input is plain text, not Usenet stuff */
init_state = BODY;
break;
case 'n': /* # articles to generate */
n_articles = atoi (optarg);
break;
case 'd': /* where to dump the data structure */
dumpfile = optarg;
break;
case 's': /* Set the seed for rand; fall through */
srand (atoi (optarg));
case 'x': /* set flag to prevent srand */
sflag++;
break;
default:
Fprintf (stderr,
"Usage: markov3 [-pvx] [-s seed] [-n n_art] [-d dump] files\n");
exit (1);
}
}
BEGIN init_state; /* initial state of lexical analyzer */
if (!sflag) /* set random number generator */
srand ((int)time ((time_t *)0));
/* Note: if optind == argc, there are no file arguments. yyin is left
* initialized to stdin.
*/
if (optind < argc) {
/* yyin is lex input stream. Point to first file. */
if ((yyin = fopen (argv[optind], "r")) == NULL) {
perror (argv[optind]);
exit (1);
}
optind++; /* skip to next file */
}
Argv = argv; /* make it global so yywrap can access it */
n_files = 1;
/* yylex puts all the input files through the lexical analyzer and builds
* the database.
*/
(void) yylex ();
if (dumpfile)
dump_database (dumpfile);
if (verbose)
Fprintf (stderr,
"Total of %d tokens (%d different), %d different pairs, %d files\n",
n_total, n_tokens, n_pairs, n_files);
/* Generate the articles, separated by form feeds */
for (i = 0; i < n_articles; i++) {
if (i > 0) output_word ("\n\f\n");
generate_article ();
}
return 0;
}
/*
* Lex calls this when EOF is reached. It opens the next file if there
* is one. Lex interprets a return value of 1 to mean "all done" and 0
* to mean "keep going".
*/
yywrap () {
(void) fclose (yyin);
insert_pair (prev_code, (struct node *)0);
prev_code = NULL;
if (Argv[optind] == NULL) return 1;
else if ((yyin = fopen (Argv[optind], "r")) == NULL) {
perror (Argv[optind]);
exit (1);
}
optind++;
in_included_text = 0;
if (verbose && n_files % 10 == 0)
Fprintf (stderr, "%d files\n", n_files);
n_files++;
BEGIN init_state;
return 0;
}
/*
* This function saves a token in the hash table (if it isn't there
* already) and returns a pointer to the stored copy.
*/
char *
savetoken (txt)
char *txt;
{
int h;
char *p;
struct htentry *hp;
n_total++;
for (p = txt, h = 0; *p; h += *p++);
hp = hashtab + (h % HSIZE);
while (hp->hlink) {
if (strcmp (hp->htext, txt) == 0) {
return hp->htext;
}
hp = hp->hlink;
}
/* OK, it's a new token. Make hp->hlink point to a new,
* null block and make hp->htext point to the text.
*/
hp->hlink = (struct htentry *) malloc (sizeof *hp);
hp->htext = malloc ((unsigned)(strlen (txt) + 1));
(void) strcpy (hp->htext, txt);
hp->hlink->hlink = NULL;
hp->hlink->htext = NULL;
n_tokens++;
return hp->htext;
}
/*
* This recursive function inserts a token pair into the tree.
*/
struct node *
insert_in_tree (p, txt, txt2)
struct node *p;
char *txt, *txt2;
{
int cmp;
if (p == NULL) {
/* Create a new node. */
p = (struct node *) malloc (sizeof *p);
p->text = txt;
p->text2 = txt2;
p->lc = p->rc = NULL;
p->succ = NULL;
p->ocount = 1;
tknptr = p;
n_pairs++;
if (verbose && n_pairs % 1000 == 0)
Fprintf (stderr, "%d pairs\n", n_pairs);
return p;
}
cmp = my_strcmp (p->text, txt);
if (cmp == 0) cmp = my_strcmp (p->text2, txt2);
if (cmp == 0) {
/* It's a match. Increment the count. */
tknptr = p;
p->ocount += 1;
}
/* Look in the subtrees. */
else if (cmp < 0) p->lc = insert_in_tree (p->lc, txt, txt2);
else p->rc = insert_in_tree (p->rc, txt, txt2);
return p;
}
/*
* This just calls insert_in_tree starting at the root
*/
struct node *
insert_token (txt, txt2)
char *txt,*txt2;
{
root = insert_in_tree (root, txt, txt2);
return tknptr;
}
/*
* This function adds a successor.
*/
struct succnode *
insert_in_succ_chain (sp, np)
struct succnode *sp;
struct node *np;
{
if (sp == NULL) {
sp = (struct succnode *) malloc (sizeof *sp);
sp->scnod = np;
sp->count = 1;
sp->link = NULL;
}
else if (sp->scnod == np)
sp->count += 1;
else sp->link = insert_in_succ_chain (sp->link, np);
return sp;
}
/*
* This calls insert_in_succ_chain starting at the right place.
*/
insert_pair (p1, p2)
struct node *p1, *p2;
{
if (p1) p1->succ = insert_in_succ_chain (p1->succ, p2);
else start = insert_in_succ_chain (start, p2);
}
/*
* This function dumps the stored data structure onto a file.
* Now if only I had a function to read it back in.
*/
char *
pr_token (txt)
char *txt;
{
if (txt[0] != '\n')
return txt;
return txt[1] ? "<INCL>" : "<LF>";
}
treedump (tree, fp)
struct node *tree;
FILE *fp;
{
if (tree) {
treedump (tree->rc, fp);
Fprintf (fp, "( %s %s ) %d", pr_token (tree->text),
pr_token (tree->text2), tree->ocount);
chaindump (tree->succ, fp);
treedump (tree->lc, fp);
}
}
/*
* Subroutine of treedump; it does one row.
*/
chaindump (p, fp)
struct succnode *p;
FILE *fp;
{
char *text;
while (p) {
if (p->scnod == NULL)
text = "<EOF>";
else text = pr_token (p->scnod->text2);
Fprintf (fp, " %s %d", text, p->count);
p = p->link;
}
putc ('\n', fp);
}
/*
* This routine generates the dump file (-d option)
*/
dump_database (file)
char *file;
{
FILE *fp = fopen (file, "w");
if (fp == NULL) {
Fprintf (stderr, "markov: can't open ");
perror (file);
exit (1);
}
Fprintf (fp, "START:");
chaindump (start, fp);
treedump (root, fp);
}
/* roll (n) generates a uniformly distributed rv between 0 and n-1.
* This code is stolen from "hack" and should be portable. If you
* change this, remember that different systems have rand functions
* with different ranges, and the bottom bits are often no good.
*/
#define roll(n) ((rand() >> 3) % n)
/*
* This function generates an article by traversing the
* structure we've built.
*/
generate_article () {
struct succnode *p = start;
int ncounts = n_files;
int n, accum;
char *tp;
while (1) {
/* Roll the dice to find out the next token. The code below selects the
* next token, and the new state, with a probability corresponding to the
* frequency in the input.
*/
n = roll (ncounts);
accum = p->count;
while (accum <= n && p->link) {
p = p->link;
accum += p->count;
}
if (p->scnod == NULL)
break;
tp = p->scnod->text2;
/* Check for "end of story" */
if (tp == NULL)
break;
output_word (tp);
ncounts = p->scnod->ocount;
p = p->scnod->succ;
}
output_word ("\n"); /* This will flush the buffer as well. */
return;
}
/*
* This version handles null strings *
*/
my_strcmp (a, b)
register char *a, *b;
{
if (a == NULL) return b ? -1 : 0;
if (b == NULL) return 1;
return strcmp (a, b);
}
#define LEN 75
output_word (word)
char *word;
{
static char line[LEN+1];
static int room = LEN;
int l;
if (word == NULL) return;
l = strlen (word);
/* If word won't fit, or starts with \n, dump the current line */
if ((l >= room || word[0] == '\n') && line[0]) {
Printf ("%s\n", line);
line[0] = 0;
room = LEN;
}
/* If word won't fit in the buffer or starts with \n, print it now */
if (l >= LEN)
Printf ("%s\n", word);
else if (word[0] == '\n')
Printf ("%s", word);
/* Otherwise fill it in */
else {
(void)strcat (line, word);
(void)strcat (line, " ");
room -= (l + 1);
}
return;
}