#!/usr/bin/env perl
#
# Markov chain tool.
#
# Reads plain-text lines on STDIN, learns word-to-word transition data,
# then repeatedly prints generated word sequences, seeded by $ARGV[0]
# (if supplied) or by a random learned word.
#
# Currently chooses only one token based on the immediate previous token.
# In future, looking around at 2 tokens might produce more coherent results.

use strict;
use warnings;

use utf8;
binmode(STDOUT, ":utf8");

$| = 1;    # unbuffered STDOUT so progress and output appear immediately

use Data::Dumper;

# Return one element of the argument list chosen uniformly at random.
# Returns undef when called with an empty list.
sub some {
    return $_[rand @_];
}

# Global markov data: maps a word to the list of words observed to follow
# it. Duplicates are kept deliberately, so frequency weighting falls out of
# the uniform random pick in some().
my %markov_data;

print "\"" x 80;
print "\nLearning words...\n";

while (my $line = <STDIN>) {
    chomp $line;
    utf8::upgrade($line);
    my @words = split /\s+/, $line;

    # Skip blank lines: otherwise $words[-1] below is undef and we would
    # both warn and learn a bogus "" key.
    next unless @words;

    # Learning is the same for all but the last word.
    for my $i (0 .. $#words - 1) {
        # $i + 1 is safe because of the loop bounds.
        push @{ $markov_data{ $words[$i] } }, $words[$i + 1];
    }

    # Special case: the last word is learned as being followed by EOL ("").
    push @{ $markov_data{ $words[-1] } }, "";
}

print "Word Patterns:\n";
#print Dumper(\%markov_data);
print "\"" x 80;
print "\n";

while (1) {
    my $word = $ARGV[0] || some(keys %markov_data);

    # No training data at all (empty/blank STDIN): nothing to generate.
    last unless defined $word;

    print "Taking \"$word\" as the seed\n";
    my $count = 0;
    do {
        $count++;
        print $word;
        # A word may have no learned successors (e.g. an unseen $ARGV[0]
        # seed); treat that as end-of-line rather than warning on undef.
        # The "// []" also avoids autovivifying an empty entry.
        $word = some(@{ $markov_data{$word} // [] }) // "";
    } until ($word eq "" or $count == 100);    # cap runaway sentences at 100 words
    print "\n";
    sleep 1;
}