[Israel.pm] optimizing memory usage
Shlomo Yona
shlomo at cs.haifa.ac.il
Sun Jan 4 19:42:39 PST 2004
Oops.
Attached is the correct one (with plenty of printouts to
keep me entertained while it runs... and runs...)
--
Shlomo Yona
shlomo at cs.haifa.ac.il
http://cs.haifa.ac.il/~shlomo/
-------------- next part --------------
#!/usr/bin/perl -w
use strict;
use warnings;
# Directory that receives every generated statistics file.  The trailing
# slash matters: output file names are built by plain string concatenation.
my $files_dir = '/home/shlomo/a7/archive/statistics/';

# Create the directory (and any missing parents) on first use.  The list
# form of system() bypasses the shell, and the exit status is checked so a
# failed mkdir is reported here rather than as a later open() failure.
unless (-d $files_dir) {
    system('mkdir', '-p', $files_dir) == 0
        or die "Cannot create directory $files_dir (mkdir status $?): $!\n";
}
# Running n-gram tally: key is the n-gram (its tokens joined with a single
# space), value is the number of occurrences.  It is a file-scoped lexical
# because the sort comparators below close over it.
my %token_counts = ();

# $model maps a model name ("words", "characters") to a hash with three keys:
#   sorting                  - { output file suffix => sort comparator }
#   token_separator          - compiled regex handed to split() to tokenize
#   token_vector_preparation - sub(\$text, $window_size, $separator) returning
#                              the padded token list
my $model = do {
    # Comparators are meant for "sort $cmp keys %token_counts"; they read the
    # package variables $a and $b that sort() sets up.

    # Most frequent first; ties broken alphabetically.
    my $by_frequency_alphabet = sub {
        $token_counts{$b} <=> $token_counts{$a}
            || $a cmp $b;
    };

    # Alphabetical.  The frequency tie-break is kept for parity with the file
    # name, but it can only be reached for equal strings, which distinct hash
    # keys never are.
    my $by_alphabet_frequency = sub {
        $a cmp $b
            || $token_counts{$b} <=> $token_counts{$a};
    };

    # Longest first, then most frequent, then alphabetical -- the order the
    # file name promises.  Comparing alphabetically before frequency would
    # make the frequency key unreachable, since "cmp" never ties on distinct
    # keys.
    my $by_length_frequency_alphabet = sub {
        length($b) <=> length($a)
            || $token_counts{$b} <=> $token_counts{$a}
            || $a cmp $b;
    };

    # Split the referenced text on $ts and surround the resulting tokens with
    # ($ws - 1) single-space padding tokens on each side.
    #   $txt_ref - reference to the text to tokenize
    #   $ws      - window (n-gram) size
    #   $ts      - token separator regex
    # Returns the flat list: padding, tokens, padding.
    my $pad_and_split = sub {
        my ($txt_ref, $ws, $ts) = @_;
        my @padding = (' ') x ($ws - 1);
        return (@padding, split(/$ts/, $$txt_ref), @padding);
    };

    # Leading "+" forces an anonymous hash rather than a bare block.
    +{
        words => {
            sorting => {
                "by_frequency-alphabet.txt" => $by_frequency_alphabet,
                "by_alphabet-frequency.txt" => $by_alphabet_frequency,
                "by_token_length-frequency-alphabet.txt" =>
                    $by_length_frequency_alphabet,
            },
            token_separator          => qr/\s+/,    # whitespace-delimited words
            token_vector_preparation => $pad_and_split,
        },
        characters => {
            sorting => {
                "by_frequency-alphabet.txt" => $by_frequency_alphabet,
                "by_alphabet-frequency.txt" => $by_alphabet_frequency,
            },
            token_separator          => qr//,       # empty pattern: one token per character
            token_vector_preparation => $pad_and_split,
        },
    };
};
# N-gram sizes to compute statistics for.
my @window_sizes = (1 .. 5);

# The list of text files to process is read one name per line from the files
# named on the command line, or from STDIN.  Names are chomped once up front
# (so progress messages do not carry a stray newline) and blank lines are
# dropped so they cannot abort the run with a failed open().
chomp(my @txt_files = <>);
@txt_files = grep { length } @txt_files;

# For every window size and every model: count all n-grams over all input
# files, then write one table per sort order configured for that model.
foreach my $window (@window_sizes) {
    print "window size = $window\n";

    # Sorted keys give a deterministic processing order.
    foreach my $k (sort keys %$model) {
        my $v = $model->{$k};
        print "model = $k\n";

        # Counts are accumulated across all files for this (window, model).
        %token_counts = ();

        foreach my $txt_filename (@txt_files) {
            print "Window=$window\tmodel=$k\tf=$txt_filename\n";

            # Three-argument open with a lexical handle: the file name is
            # never interpreted as an open mode.
            open(my $in, '<', $txt_filename)
                or die "Cannot open $txt_filename for reading: $!\n";
            my $text = do { local $/; <$in> };    # slurp the whole file
            $text = '' unless defined $text;
            close($in)
                or die "Cannot close $txt_filename after reading: $!\n";

            # Trim both ends and collapse every whitespace run to one space.
            $text =~ s/^\s*//;
            $text =~ s/\s*$//;
            $text =~ s/\s+/ /gs;

            # prepare tokens vector
            my @tokens = $v->{token_vector_preparation}->(
                \$text, $window, $v->{token_separator}
            );

            # Slide a window of $window tokens over the vector; each window,
            # joined with single spaces, is one n-gram occurrence.
            foreach my $from (0 .. @tokens - $window) {
                my $to = $from + $window - 1;
                ++$token_counts{ join(' ', @tokens[$from .. $to]) };
            }
        }

        # One output file per configured sort order:
        # <dir><model>.<window>-gram.<suffix>, lines of "ngram<TAB>count".
        foreach my $filename_suffix (sort keys %{ $v->{sorting} }) {
            my $sort_sub = $v->{sorting}{$filename_suffix};
            my $filename = $files_dir . $k . "." . $window . "-gram." . $filename_suffix;

            # Keep the previous result as .bak.  rename() avoids the shell,
            # and a failed backup stops the run before the old file would be
            # truncated by the open() below.
            if (-f $filename) {
                rename($filename, "$filename.bak")
                    or die "Cannot rename $filename to $filename.bak: $!\n";
            }

            open(my $out, '>', $filename)
                or die "Cannot open $filename for writing: $!\n";
            foreach my $token (sort $sort_sub keys %token_counts) {
                print {$out} $token, "\t", $token_counts{$token}, "\n";
            }
            close($out)
                or die "Cannot close $filename after writing: $!\n";
        }
    }
}
More information about the Perl
mailing list