MODULE WordDistDist EXPORTS Main; (* Builds a dendrogram for words based on their positional correlation. *) (* The program reads a set of important words "vc" and a list of words "tx". It then builds a matrix "dist[x,y]" that tells the difference between the distributions of "vc[x]" and "vc[y]" in the list "tx". Finally, it builds a dendrogram of the words in "vc", based on this distance matrix. *) IMPORT Wr, Rd, Fmt, Text, Thread, OSError, FileRd, FileWr, ParseParams, Process; FROM Stdio IMPORT stdin, stdout, stderr; TYPE LONG = LONGREAL; DistMatrix = ARRAY OF ARRAY OF REAL; Vocabulary = ARRAY OF TEXT; WordCounts = ARRAY OF CARDINAL; Corpus = ARRAY OF TEXT; Options = RECORD vocabulary: TEXT; corpus: TEXT; matrix: TEXT; tree: TEXT; END; <* FATAL Wr.Failure, Thread.Alerted, Rd.Failure, OSError.E *> PROCEDURE DoIt() = BEGIN WITH o = GetOptions(), vc = ReadVocabulary(o.vocabulary)^, NV = NUMBER(vc), tx = ReadCorpus(o.corpus)^, ct = CountWords(vc, tx)^, dist = NEW(REF DistMatrix, NV, NV)^ DO ComputeDistMatrix(dist, vc, ct, tx); PrintDist(o.matrix, dist, vc); GenTree(o.tree, dist, vc, ct, tx) END; END DoIt; PROCEDURE ComputeDistMatrix( VAR dist: DistMatrix; READONLY vc: Vocabulary; READONLY ct: WordCounts; READONLY tx: Corpus; ) = BEGIN WITH NV = NUMBER(vc) DO FOR x := 0 TO NV-1 DO dist[x,x] := 0.0; FOR y := 0 TO x-1 DO dist[x,y] := DistDist(vc[x], vc[y], ct[x], ct[y], tx); dist[y,x] := dist[x,y]; END END; END END ComputeDistMatrix; PROCEDURE DistDist(wx, wy: TEXT; cx, cy: CARDINAL; READONLY tx: Corpus): REAL = VAR sx, sy, d: LONGREAL := 0.0d0; PROCEDURE Eq(a,b: TEXT): LONGREAL = BEGIN IF Text.Equal(a,b) THEN RETURN 1.0d0 ELSE RETURN 0.0d0 END END Eq; BEGIN WITH NT = NUMBER(tx) DO FOR i := 0 TO NT-1 DO WITH ti = tx[i], (* Probability of an occurrence of word "wx" falling in slot "i": *) px = (Eq(wx, ti) + 1.0d0)/FLOAT(cx + NT, LONG), (* Probability of an occurrence of word "wy" falling in slot "i": *) py = (Eq(wy, ti) + 1.0d0)/FLOAT(cy + NT, LONG) DO sx := sx + px; sy := sy + py; d := d + ABS(sx-sy) END END; RETURN FLOAT(d, REAL) END END DistDist; PROCEDURE PrintDist( name: TEXT; READONLY dist: DistMatrix; READONLY vc: Vocabulary; ) = VAR wr: Wr.T; BEGIN IF Text.Equal(name, "-") THEN wr := stdout ELSE wr := FileWr.Open(name) END; WITH NV = NUMBER(vc) DO FOR x := 0 TO NV-1 DO FOR y := 0 TO x-1 DO Wr.PutText(wr, Fmt.Pad(vc[x], 24, align := Fmt.Align.Left)); Wr.PutText(wr, " "); Wr.PutText(wr, Fmt.Pad(vc[y], 24, align := Fmt.Align.Left)); Wr.PutText(wr, " "); Wr.PutText(wr, FR(dist[x,y], 12,3)); Wr.PutText(wr, "\n"); END END; Wr.Close(wr) END END PrintDist; PROCEDURE GenTree( name: TEXT; VAR dist: DistMatrix; VAR vc: Vocabulary; VAR ct: WordCounts; VAR tx: Corpus; ) = VAR wr: Wr.T; VAR dMin: REAL; n, xMin, yMin: CARDINAL; BEGIN IF Text.Equal(name, "-") THEN wr := stdout ELSE wr := FileWr.Open(name) END; (* Should use a priority queue, but "DistDist" dominates anyway. *) WITH NV = NUMBER(vc) DO n := NV; WHILE n > 1 DO (* Find the two most similar words: *) dMin := LAST(REAL); FOR x := 0 TO n-1 DO FOR y := 0 TO x-1 DO IF dist[x,y] < dMin THEN dMin := dist[x,y]; xMin := x; yMin := y END END END; (* Condense them and replace them by an invented word: *) WITH n1 = n-1, wx = vc[xMin], wy = vc[yMin], wz = "$" & Fmt.Pad(Fmt.Int(n1), 5, '0') DO (* Output them: *) Wr.PutText(wr, Fmt.Pad(wx, 24, align := Fmt.Align.Left)); Wr.PutText(wr, " "); Wr.PutText(wr, Fmt.Pad(wy, 24, align := Fmt.Align.Left)); Wr.PutText(wr, " "); Wr.PutText(wr, Fmt.Pad(wz, 24, align := Fmt.Align.Left)); Wr.PutText(wr, " "); Wr.PutText(wr, FR(dist[xMin,yMin], 12,3)); Wr.PutText(wr, "\n"); <* ASSERT yMin < xMin *> (* Replace "wx" and "wy" by "wz" in corpus: *) FOR i := 0 TO LAST(tx) DO IF Text.Equal(tx[i], wx) OR Text.Equal(tx[i], wy) THEN tx[i] := wz END END; (* Replace "vc[xMin]" by "wz" and eliminate "vc[yMin]": *) ct[xMin] := ct[xMin] + ct[yMin]; vc[xMin] := wz; ct[yMin] := ct[n1]; vc[yMin] := vc[n1]; FOR z := 0 TO n-2 DO dist[yMin,z] := dist[n1,z]; dist[z,yMin] := dist[z,n1] END; dist[yMin,yMin] := 0.0; n := n1; END; (* Recompute distances from new "vc[xMin]": *) FOR z := 0 TO n-1 DO IF z = xMin THEN dist[z,z] := 0.0 ELSE dist[xMin,z] := DistDist(vc[xMin], vc[z], ct[xMin], ct[z], tx); dist[z,xMin] := dist[xMin,z] END END END; Wr.Close(wr) END END GenTree; PROCEDURE ReadWords(name: TEXT): REF ARRAY OF TEXT = VAR rd: Rd.T; rvc: REF ARRAY OF TEXT; n: CARDINAL := 0; BEGIN IF Text.Equal(name, "-") THEN rd := stdin ELSE rd := FileRd.Open(name) END; rvc := NEW(REF ARRAY OF TEXT, 200); LOOP TRY WITH w = Rd.GetLine(rd) DO IF n = NUMBER(rvc^) THEN WITH rt = NEW(REF ARRAY OF TEXT, 2*n) DO SUBARRAY(rt^, 0, n) := rvc^; rvc := rt END END; rvc^[n] := w; INC(n); END EXCEPT Rd.EndOfFile => WITH s = NEW(REF ARRAY OF TEXT, n) DO s^ := SUBARRAY(rvc^, 0, n); RETURN s END END END END ReadWords; PROCEDURE ReadCorpus(name: TEXT): REF Corpus = BEGIN RETURN ReadWords(name) END ReadCorpus; PROCEDURE ReadVocabulary(name: TEXT): REF Vocabulary = BEGIN (* Should check strict ordering *) RETURN ReadWords(name) END ReadVocabulary; PROCEDURE CountWords(READONLY vc: Vocabulary; READONLY tx: Corpus): REF WordCounts = BEGIN WITH NV = NUMBER(vc), rct = NEW(REF WordCounts, NV), ct = rct^ DO FOR x := 0 TO LAST(vc) DO ct[x] := 0 END; FOR i := 0 TO LAST(tx) DO WITH x = Locate(tx[i], vc, 0, NV) DO IF x < NV AND x >= 0 AND Text.Equal(tx[i], vc[x]) THEN INC(ct[x]) END END END; RETURN rct END; END CountWords; PROCEDURE Locate(wx: TEXT; READONLY vc: Vocabulary; lo, hi: CARDINAL): INTEGER = (* Binary search of "wx" in "vc[lo..hi-1]". *) (* Returns "x" such that "vc[x] = wx" or "wx" *) (* should go between "vc[x-1]" AND "vc[x]" *) BEGIN <* ASSERT lo < hi *> REPEAT (* Now "wx" should go between "vc[lo-1]" and "vc[hi]", exclusive. *) WITH md = (lo + hi) DIV 2 DO WITH cmp = Text.Compare(wx, vc[md]) DO CASE cmp OF | -1 => hi := md; | 00 => RETURN md; | +1 => lo := md+1 END END END UNTIL hi <= lo; RETURN hi END Locate; PROCEDURE FR(x: REAL; w, p: CARDINAL): TEXT = BEGIN RETURN Fmt.Pad(Fmt.Real(x, prec := p, style := Fmt.Style.Fix), w); END FR; PROCEDURE GetOptions (): Options = <* FATAL Thread.Alerted, Wr.Failure *> VAR o: Options; BEGIN WITH pp = NEW(ParseParams.T).init(stderr) DO TRY pp.getKeyword("-vocabulary"); o.vocabulary := pp.getNext(); pp.getKeyword("-corpus"); o.corpus := pp.getNext(); pp.getKeyword("-matrix"); o.matrix := pp.getNext(); pp.getKeyword("-tree"); o.tree := pp.getNext(); pp.finish(); EXCEPT | ParseParams.Error => Wr.PutText(stderr, "Usage: WordDistDist \\\n"); Wr.PutText(stderr, " -vocabulary FILE \\\n"); Wr.PutText(stderr, " -corpus FILE \\\n"); Wr.PutText(stderr, " -matrix FILE \\\n"); Wr.PutText(stderr, " -tree FILE \n"); Process.Exit (1); END; END; RETURN o END GetOptions; BEGIN DoIt() END WordDistDist.