[Israel.pm] optimizing memory usage

Shlomo Yona shlomo at cs.haifa.ac.il
Sun Jan 4 19:42:39 PST 2004


Oops.

Attached is the correct one (with plenty of printouts to
keep me entertained while it runs... and runs...)

-- 
Shlomo Yona
shlomo at cs.haifa.ac.il
http://cs.haifa.ac.il/~shlomo/
-------------- next part --------------
#!/usr/bin/perl -w
use strict;
use warnings;

# Directory that receives every generated statistics file.
my $files_dir = '/home/shlomo/a7/archive/statistics/';
# List-form system() keeps the path away from the shell. A failure is only
# reported here; opening an output file later will die with a precise message.
unless (-d $files_dir) {
	system('mkdir', '-p', $files_dir) == 0
		or warn "Cannot create $files_dir (mkdir exit status $?)\n";
}

# N-gram => number of occurrences. The comparators below close over this
# hash, so it has to be filled before any of them is handed to sort().
my %token_counts=();

# Most frequent first; ties are broken alphabetically.
my $by_frequency_then_alphabet = sub {
	$token_counts{$b} <=> $token_counts{$a}
	||
	$a cmp $b
};

# Alphabetical order. The sorted items are hash keys and therefore unique,
# so "$a cmp $b" never ties and a frequency tiebreak could never fire.
my $by_alphabet = sub {
	$a cmp $b
};

# Longest first, then most frequent first, then alphabetically -- the order
# promised by the "by_token_length-frequency-alphabet.txt" file name.
# The frequency test must precede the alphabetical one: "$a cmp $b" is never
# 0 for distinct keys, so anything placed after it is dead code.
my $by_length_frequency_alphabet = sub {
	length($b) <=> length($a)
	||
	$token_counts{$b} <=> $token_counts{$a}
	||
	$a cmp $b
};

# Builds the token vector for one text.
#   $txt_ref - reference to the text
#   $ws      - n-gram window size
#   $ts      - compiled regex the text is split on
# Returns the tokens with ($ws - 1) single-space padding tokens on each side,
# so that n-grams overlapping the start and the end of the text are produced.
my $pad_and_split = sub {
	my ($txt_ref,$ws,$ts) = @_;
	my @padding = (' ') x ($ws-1);
	return (
		@padding,
		split(/$ts/,$$txt_ref),
		@padding
	);
};

# One entry per tokenisation model. Each model provides:
#   sorting                  - output file suffix => sort comparator
#   token_separator          - regex that splits a text into tokens
#   token_vector_preparation - code ref turning a text into a token list
my $model= {
	words => {
		sorting => {
			"by_frequency-alphabet.txt" => $by_frequency_then_alphabet,
			"by_alphabet-frequency.txt" => $by_alphabet,
			"by_token_length-frequency-alphabet.txt" => $by_length_frequency_alphabet,
		},
		token_separator => qr/\s+/,
		token_vector_preparation => $pad_and_split,
	},
	characters => {
		sorting => {
			"by_frequency-alphabet.txt" => $by_frequency_then_alphabet,
			"by_alphabet-frequency.txt" => $by_alphabet,
		},
		# The empty pattern splits the text into single characters.
		token_separator => qr//,
		token_vector_preparation => $pad_and_split,
	},
};

# N-gram window sizes to produce statistics for.
my @window_sizes=(1 .. 5);
# Names of the input text files, one per line, read from STDIN or from the
# files named on the command line.
my @txt_files = <>;
# Strip the line endings once, up front, so the progress output below looks
# the same on every pass instead of carrying a stray newline on the first one.
chomp @txt_files;
foreach my $window (@window_sizes) {
print "window size = $window\n";
	foreach my $k (sort keys %$model) {
		my $v = $model->{$k};
print "model = $k\n";
		# Counts are accumulated over all input files for this window/model pair.
		%token_counts=();
		foreach my $txt_filename (@txt_files) {
print "Window=$window\tmodel=$k\tf=$txt_filename\n";
			# Three-argument open: the file name can never be taken for a mode.
			open(my $in,'<',$txt_filename) or die "Cannot open $txt_filename for reading: $!\n";
			my $text = do { local $/; <$in> };
			$text = '' unless defined $text;
			close($in) or die "Cannot close $txt_filename after reading: $!\n";
			# Trim the text and collapse every run of whitespace to one space.
			$text=~s/^\s*//;
			$text=~s/\s*$//;
			$text=~s/\s+/ /gs;
			# prepare tokens vector
			my @tokens = $v->{token_vector_preparation}->(\$text,$window,$v->{token_separator});
			# Slide a window of $window tokens over the vector and count each
			# n-gram; its tokens are joined with a single space.
			foreach my $from (0 .. $#tokens-$window+1) {
				my $to = $from+$window-1;
				++$token_counts{join(' ',@tokens[$from .. $to])};
			}
		}
		# Write one output file per sort order defined for this model.
		foreach my $filename_suffix (sort keys %{$v->{sorting}}) {
			my $sort_sub = $v->{sorting}{$filename_suffix};
			my $filename = $files_dir.$k.".".$window."-gram.".$filename_suffix;
			# Keep the previous result as a backup. A failed backup is
			# reported but does not stop the run.
			if (-f $filename) {
				rename($filename,"$filename.bak")
					or warn "Cannot rename $filename to $filename.bak: $!\n";
			}
			open(my $out,'>',$filename) or die "Cannot open $filename for writing: $!\n";
			foreach my $token (sort $sort_sub keys %token_counts) {
				print {$out} $token,"\t",$token_counts{$token},"\n";
			}
			close($out) or die "Cannot close $filename after writing: $!\n";
		}
	}
}


More information about the Perl mailing list