From 9a14f7fe3eb10585ead82f99a43802582ee4d5ba Mon Sep 17 00:00:00 2001
From: "Robin H. Johnson"
Date: Thu, 20 Mar 2008 17:40:37 -0700
Subject: Add index creation script.

---
Reviewer notes on this revision (ignored by git-am):
 - Line breaks were restored from a whitespace-collapsed copy of the patch.
   Indentation and the position of blank context lines are reconstructed;
   check them against the tree before applying.
 - The "index" line below still carries the blob ids of the original
   submission.
 - Only the added lines of the last hunk differ from the original patch:
   * the dist name is looked up without autovivifying $rawdocs{dist}, so a
     missing dist record no longer produces an empty extra document;
   * createdoc() skips the distfile/path fields when their value is
     undefined instead of passing undef to Lucene and File::Basename;
   * a failing mkdir of the index directory is now reported;
   * constructors use Class->new instead of indirect object syntax.

 read-index.pl | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 5 deletions(-)

diff --git a/read-index.pl b/read-index.pl
index c279750..3041a1c 100644
--- a/read-index.pl
+++ b/read-index.pl
@@ -2,11 +2,17 @@
 use strict;
 use warnings;
+use Lucene;
+use File::Basename;
+
+# Lucene stuff by Robin H. Johnson
+
+

 my $filename = "sample.out";
 open(my $fh, $filename) or die "could not open $filename";

-my %documents;
+my %rawdocs;

 while (my $line=<$fh>) {
 	$line =~ /File-([^-]+)-([^:]+): ([^\n]*)\n/s;
 	my $fileid = $1; # numeric or "dist"
@@ -15,11 +21,11 @@ while (my $line=<$fh>) {
 	#print "Fileid: ". $fileid . "\n";
 	#print "field: ". $field . "\n";
 	#print "Value: ". $value . "\n";
-
-	if ( ! $documents{$fileid} ) {
-		$documents{$fileid} = { $field => $value };
+
+	if ( ! $rawdocs{$fileid} ) {
+		$rawdocs{$fileid} = { $field => $value };
 	} else {
-		$documents{$fileid}{$field} = $value;
+		$rawdocs{$fileid}{$field} = $value;
 	}
 }
 close($fh);
@@ -41,3 +47,71 @@ close($fh);
 # i would split up by [/.-_] at least. technically, using
 # (\W|_|\d) as the class of split characters might be reasonable

+# Directory that holds the on-disk Lucene index.
+my $indexdir = "data";
+
+my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
+# Create the index directory only when it is missing, so that a real mkdir
+# failure is reported here instead of being silently ignored.
+if(! -d $indexdir) {
+	mkdir($indexdir) or die "could not create $indexdir: $!";
+}
+my $store = Lucene::Store::FSDirectory->getDirectory($indexdir, 0);
+my $writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);
+$writer->setMergeFactor(100);
+$writer->setUseCompoundFile(0);
+$writer->setMaxFieldLength(2048);
+$writer->setMinMergeDocs(10);
+$writer->setMaxMergeDocs(100);
+
+# Add Documents here
+
+# Build the Lucene document for one parsed record.
+#   $distfile - value stored in the "distfile" field; skipped when undef
+#   $rawdoc   - hashref of field => value pairs for this record
+# Returns the new Lucene::Document.  Fields that have no value are left
+# out of the document.
+sub createdoc {
+	my ($distfile, $rawdoc) = @_;
+	# undef, "" and "0" all count as "not the distfile record".
+	my $isdist = $rawdoc->{isdistfile} ? 1 : 0;
+	my $doc = Lucene::Document->new;
+	$doc->add(Lucene::Document::Field->Text("distfile", $distfile)) if defined($distfile);
+	$doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));
+	if($isdist) {
+		for my $f (qw(origin cat pn cpv)) {
+			$doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+		}
+		for my $f (qw(pv pr pf)) {
+			$doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+		}
+	} else {
+		my $name = $rawdoc->{name};
+		# basename() and dirname() croak on an undefined pathname.
+		if(defined($name)) {
+			$doc->add(Lucene::Document::Field->Text("path", $name));
+			$doc->add(Lucene::Document::Field->Text("filename", basename($name)));
+			$doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
+		}
+	}
+	for my $f (qw(md5 sha1 mtime size)) {
+		$doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+	}
+	return $doc;
+}
+
+# Test with exists() first: a bare $rawdocs{dist}{name} would autovivify an
+# empty "dist" record, which the loop below would index as a bogus document.
+my $distfile = exists($rawdocs{dist}) ? $rawdocs{dist}{name} : undef;
+warn "no dist record with a name in $filename\n" unless defined($distfile);
+foreach my $f (keys(%rawdocs)) {
+	printf "%s\n", $f;
+	my $doc = createdoc($distfile, $rawdocs{$f});
+	$writer->addDocument($doc);
+}
+
+# End of Document adding
+$writer->optimize();
+$writer->close;
+undef $writer;
+
-- 
cgit v1.2.3-65-gdbad