Context Navigation

Back to Ticket #282

Ticket #282: xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-27.patch

File xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-27.patch, 60.9 KB (added by Olly Betts, 15 years ago)
Updated version of patch

xapian-omega-1.0.7a/ChangeLog

diff -u  xapian-omega-1.0.7a/ChangeLog.orig

-              old
+              new
         * configure.ac: Check for strftime.
+-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com>
+        omega-0.9.6c:
+        * omindex.cc: Fix wrong timestamp comparison in cache logic
+        * scriptindex.cc: Add lastmod and size records and values.
+        * excel2text, outlook2text.in: New scripts
+-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com>
+        omega-0.9.6b:
+        * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks.
+        Add options --verbose, --silent
+-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com>
+        omega-0.9.6a:
+        * omindex.cc: Added cache_dir, cached virtual directories (zip, msg, pst, ...).
+        New option: -c/--nocleanup.
+        Consistently log stderr to /var/log/omega/omindex-error.log.
+        * configure.ac: Add HAVE_UNRAR, HAVE_MSGCONVERT,
+        HAVE_READPST.
+        * configfile.cc: New cache_dir
+        * Makefile.am: Link omindex against configfile.
 Sun Jul 09 01:40:09 BST 2006  Olly Betts <olly@survex.com>
         * docs/omegascript.txt: Note that (by design) an omegascript template

xapian-omega-1.0.7a/Makefile.am

diff -u  xapian-omega-1.0.7a/Makefile.am.orig

-              old
+              new
 pkglibbindir = $(pkglibdir)/bin
 pkglibbin_PROGRAMS = omega
 bin_PROGRAMS = omindex scriptindex
+dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl
 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
 check_PROGRAMS = htmlparsetest md5test utf8converttest
 …
         common/safewindows.h\
         common/stringutils.h
 AM_LDFLAGS = $(ICONV_LDFLAGS)
+AM_LDFLAGS = -no-undefined $(ICONV_LDFLAGS)
 omega_SOURCES = omega.cc query.cc cgiparam.cc utils.cc configfile.cc date.cc\
  cdb_init.cc cdb_find.cc cdb_hash.cc cdb_unpack.cc loadfile.cc\
 …
  md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\
+ configfile.cc\
  runfilter.cc freemem.cc common/msvc_dirent.cc
 if NEED_MKDTEMP
 …
 scriptindex_SOURCES = scriptindex.cc myhtmlparse.cc htmlparse.cc\
  common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc\
+ configfile.cc\
  common/safe.cc common/stringutils.cc utf8convert.cc utf8truncate.cc
 …
 MAINTAINERCLEANFILES = $(dist_man_MANS)
 endif
+CLEANFILES = $(dist_libexec_SCRIPTS) $(dist_bin_SCRIPTS)
+omega.conf: $(srcdir)/omega.conf.in Makefile
+        sed "s,@localstatedir@,$(localstatedir)," $(srcdir)/omega.conf.in > $@
+outlook2text: $(srcdir)/outlook2text.in mimeexplode Makefile
+        sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@
 if DOCUMENTATION_RULES

xapian-omega-1.0.7a/configfile.cc

diff -u  xapian-omega-1.0.7a/configfile.cc.orig

-              old
+              new
 string template_dir = "/var/lib/omega/templates/";
 string log_dir = "/var/log/omega/";
 string cdb_dir = "/var/lib/omega/cdb/";
+string cache_dir = "/var/lib/omega/cache/";
 /** Return true if the file fname exists.
  */
 …
+    }
     while (in) {
+        char line[1024];
+        in.getline(line, sizeof(line));
+        char *p = line;
+        while (isspace((unsigned char)*p)) ++p;
+        if (!*p || *p == '#') continue; // Ignore blank line and comments
+        char *q = p;
+        while (*q && !isspace((unsigned char)*q)) ++q;
+        string name(p, q - p);
+        p = q;
+        while (isspace((unsigned char)*p)) ++p;
+        q = p;
+        while (*q && !isspace((unsigned char)*q)) ++q;
+        string value(p, q - p);
+        while (*q && isspace((unsigned char)*q)) ++q;
+        if (value.empty() || *q) {
+            throw string("Bad line in configuration file `") + cfile + "'";
+        }
+        string name, value;
+        in >> name >> value;
+        if (value[value.length()-1] != '/') value += "/";
         if (name == "database_dir") {
             database_dir = value + "/";
+            database_dir = value;
         } else if (name == "template_dir") {
             template_dir = value + "/";
+            template_dir = value;
         } else if (name == "log_dir") {
             log_dir = value + "/";
+            log_dir = value;
         } else if (name == "cdb_dir") {
+            cdb_dir = value + "/";
+            cdb_dir = value;
+        } else if (name == "cache_dir") {
+            cache_dir = value;
+        }
+    }

xapian-omega-1.0.7a/configfile.h

diff -u  xapian-omega-1.0.7a/configfile.h.orig

-              old
+              new
 extern string template_dir;
 extern string log_dir;
 extern string cdb_dir;
+extern string cache_dir;
 void read_config_file();

xapian-omega-1.0.7a/excel2text

diff -u  xapian-omega-1.0.7a/excel2text.orig

old	new
	1	#! /bin/sh
	2	# Convert the spreadsheet "$1" to CSV, strip purely numeric fields, write to stdout
	3	xls2csv -q0 "$1" | sed -re's/[0123456789.]+,//g'

xapian-omega-1.0.7a/mimeexplode

diff -u  xapian-omega-1.0.7a/mimeexplode.orig

-              old
+              new
+#!/usr/bin/perl -w
+=head1 NAME
+mimeexplode - explode one or more MIME messages
+=head1 SYNOPSIS
+    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...
+    someprocess | mimeexplode -
+=head1 DESCRIPTION
+Takes one or more files from the command line that contain MIME
+messages, and explodes their contents out into subdirectories
+of the current working directory.  The subdirectories are
+just called C<msg0>, C<msg1>, C<msg2>, etc.  Existing directories are
+skipped over.
+The message information is output to the stdout, like this:
+    Message: msg3 (inputfile1.msg)
+        Part: msg3/filename-1.dat (text/plain)
+        Part: msg3/filename-2.dat (text/plain)
+    Message: msg5 (input-file2.msg)
+        Part: msg5/dir.gif (image/gif)
+        Part: msg5/face.jpg (image/jpeg)
+    Message: msg6 (infile3)
+        Part: msg6/filename-1.dat (text/plain)
+This was written as an example of the MIME:: modules in the
+MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
+way of splitting a MIME message if you need to decode something, and
+you don't have a MIME mail reader on hand.
+=head1 COMMAND LINE OPTIONS
+-d outdir
+=head1 AUTHOR
+Eryq C<eryq@zeegee.com>, in a big hurry...
+Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir
+=cut
+#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!
+require 5.001;
+use strict;
+use vars;
+use MIME::Parser;
+use Getopt::Std;
+my %opts;
+my $outbase = '';
+my $postfix = '';
+#------------------------------------------------------------
+# make_msg - make and return the name of a msgXXX directory
+#------------------------------------------------------------
+#ignored
+#sub make_msg {
+#    while (-d "msg$Msgno") {
+#       ++$Msgno;
+#       die "self-imposed limit reached" if $Msgno == 256;
+#    }
+#   mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!";
+#    "msg$Msgno";
+#}
+#------------------------------------------------------------
+# dump_entity - dump an entity's file info
+#------------------------------------------------------------
+# Print one "Part: <path> (<mime type>)" line for every entity below
+# $entity that has no sub-parts, descending through multipart containers.
+sub dump_entity {
+    my ($entity) = @_;
+    my @children = $entity->parts;
+    unless (@children) {
+        # single part: report the body handle's path and the MIME type
+        print "    Part: ", $entity->bodyhandle->path,
+              " (", scalar($entity->head->mime_type), ")\n";
+        return;
+    }
+    # multipart: handle each child in order
+    foreach my $child (@children) {
+        dump_entity($child);
+    }
+}
+#------------------------------------------------------------
+# main
+#------------------------------------------------------------
+# Explode every MIME message named on the command line (standard input
+# when none is given) and list the extracted parts on stdout.
+#
+# The first message is written to the directory given with -d (default
+# "msg0"); each following message goes to that name with a counter
+# appended (1, 2, ...).
+#
+# Returns a true value so that the caller's "exit (&main ? 0 : -1)"
+# reports success; unrecoverable problems die instead.
+sub main {
+    my $file;
+    my $entity;
+    # make sure the same message gets exploded into the same dir
+    getopts('d:', \%opts);
+    $outbase = $opts{d} ? $opts{d} : "msg0";
+    my $outdir = $outbase;
+    # Go through messages:
+    @ARGV or unshift @ARGV, "-";
+    while (defined($file = shift @ARGV)) {
+      # Sanity:
+      (-d $outdir) or mkdir "$outdir",0755;
+      (-w "$outdir") or die "cwd $outdir not writable!";
+      # Create a new parser object:
+      my $parser = new MIME::Parser;
+      ### $parser->parse_nested_messages('REPLACE');
+      # Optional: set up parameters that will affect how it extracts
+      #   documents from the input stream:
+      $parser->output_dir($outdir);
+      # Parse an input stream. The two-argument open is deliberate: it is
+      # what lets the file name "-" stand for standard input.
+      open FILE, $file or die "couldn't open $file: $!";
+      $entity = $parser->read(\*FILE) or
+        print STDERR "Couldn't parse MIME in $file; continuing...\n";
+      close FILE;
+      # Congratulations: you now have a (possibly multipart) MIME entity!
+      dump_entity($entity) if $entity;
+      ### $entity->dump_skeleton if $entity;
+      # Next message goes to <outbase>1, <outbase>2, ...
+      $postfix++;
+      $outdir = $outbase.$postfix;
+    }
+    # A sub that ends on a loop has an unspecified return value, so state
+    # success explicitly for the exit-status computation in the caller.
+    return 1;
+}
+exit (&main ? 0 : -1);
+#------------------------------------------------------------
+;

xapian-omega-1.0.7a/msgconvert.pl

diff -u  xapian-omega-1.0.7a/msgconvert.pl.orig

-              old
+              new
+#!/usr/bin/perl -w
+#
+# msgconvert.pl:
+#
+# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
+#
+# Copyright 2002, 2004, 2006 Matijs van Zuijlen
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# CHANGES:
+# 20020715  Recognize new items 'Cc', mime type of attachment, long
+#           filename of attachment, and full headers. Attachments turn out
+#           to be numbered, so a regexp is now used to recognize label of
+#           items that are attachments.
+# 20020831  long file name will definitely be used if present. Full headers
+#           and mime type information are used when present. Created
+#           generic system for specifying known items to be skipped.
+#           Unexpected contents is never reason to bail out anymore. Added
+#           support for usage message and option processing (--verbose).
+# 20040104  Handle address data slightly better, make From line less fake,
+#           make $verbose and $skippable_entries global vars, handle HTML
+#           variant of body text if present (though not optimally).
+# 20040214  Fix typos and incorrect comments.
+# 20040307  - Complete rewrite: All functional parts are now in the package
+#             MSGParser;
+#           - Creation of MIME::Entity object is delayed until the output
+#             routines, which means all data is known; This means I can
+#             create a multipart/alternative body.
+#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
+#             the information).
+# 20040514  Check if $self->{HEAD} actually exists before trying to add its
+#           contents to the output Mime object's header data.
+#           (Bug reported by Thomas Ng).
+#           Don't produce multipart messages if not needed.
+#           (Bug reported by Justin B. Scout).
+# 20040529  Correctly format OLEDATE.
+# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
+#           - Use address data to make To: and Cc: lines complete
+#           - Use the in-reply-to property
+#           - More unknown properties named.
+#           - Found another property containing an SMTP address.
+#           - Put non-SMTP type addresses back in output.
+# 20040825  Replace 'our' to declare globals with 'use vars'. This means
+#           the globals are now properly scoped inside the package and not
+#           the file.
+#           This also fixes the bug that this program did not work on perl
+#           versions below 5.6. (Bug reported by Tim Gustafson)
+# 20060218  More sensible encoding warnings.
+# 20060219  Move OLE parsing to main program.
+#           Parse nested MSG files (Bug reported by Christof Lukas).
+# 20060225  Simplify code.
+#
+#
+# Import modules.
+#
+package MSGParser;
+use strict;
+use OLE::Storage_Lite;
+use MIME::Entity;
+use MIME::Parser;
+use Date::Format;
+use POSIX qw(mktime);
+use constant DIR_TYPE => 1;
+use constant FILE_TYPE => 2;
+use vars qw($skipproperties $skipheaders);
+#
+# Descriptions partially based on mapitags.h
+#
+# Property ids (four hex digits, as they appear in item names) that are
+# recognised but deliberately not converted. _UnknownFile consults this
+# table: a listed property is skipped, and its description is only
+# printed when verbose output is enabled.
+$skipproperties = {
+  # Envelope properties
+  '000B' => "Conversation key?",
+  '001A' => "Type of message",
+  '003B' => "Sender address variant",
+  '003D' => "Contains 'Re: '",
+  '003F' => "'recieved by' id",
+  '0040' => "'recieved by' name",
+  '0041' => "Sender variant address id",
+  '0042' => "Sender variant name",
+  '0043' => "'recieved representing' id",
+  '0044' => "'recieved representing' name",
+  '0046' => "Read receipt address id",
+  '0051' => "'recieved by' search key",
+  '0052' => "'recieved representing' search key",
+  '0053' => "Read receipt search key",
+  '0064' => "Sender variant address type",
+  '0065' => "Sender variant address",
+  '0070' => "Conversation topic",
+  '0071' => "Conversation index",
+  '0075' => "'recieved by' address type",
+  '0076' => "'recieved by' email address",
+  '0077' => "'recieved representing' address type",
+  '0078' => "'recieved representing' email address",
+  '007F' => "something like a message id",
+  # Recipient properties
+  '0C19' => "Reply address variant",
+  '0C1D' => "Reply address variant",
+  '0C1E' => "Reply address type",
+  # Non-transmittable properties
+  '0E02' => "?Should BCC be displayed",
+  '0E0A' => "sent mail id",
+  '0E1D' => "Subject w/o Re",
+  '0E27' => "64 bytes: Unknown",
+  '0FF6' => "Index",
+  '0FF9' => "Index",
+  '0FFF' => "Address variant",
+  # Content properties
+  '1008' => "Summary or something",
+  '1009' => "RTF Compressed",
+  # 'Common property'
+  '3001' => "Display name",
+  '3002' => "Address Type",
+  '300B' => "'Search key'",
+  # Attachment properties
+  '3702' => "Attachment encoding",
+  '3703' => "Attachment extension",
+  '3709' => "'Attachment rendering'", # Maybe an icon or something?
+  '3713' => "Icon URL?",
+  # 'Mail user'
+  '3A20' => "Address variant",
+  # 3900 -- 39FF: 'Address book'
+  '39FF' => "7 bit display name",
+  # 'Display table properties'
+  '3FF8' => "Routing data?",
+  '3FF9' => "Routing data?",
+  '3FFA' => "Routing data?",
+  '3FFB' => "Routing data?",
+  # 'Transport-defined envelope property'
+  '4029' => "Sender variant address type",
+  '402A' => "Sender variant address",
+  '402B' => "Sender variant name",
+  '5FF6' => "Recipient name",
+  '5FF7' => "Recipient address variant",
+  # 'Provider-defined internal non-transmittable property'
+  '6740' => "Unknown, binary data",
+  # User defined id's
+  '8000' => "Content Class",
+  '8002' => "Unknown, binary data",
+};
+# Header tags that _CopyHeaderData does NOT copy from the original full
+# headers into the generated MIME object (any tag with a true value here
+# is filtered out).
+$skipheaders = {
+  "MIME-Version" => 1,
+  "Content-Type" => 1,
+  "Content-Transfer-Encoding" => 1,
+  "X-Mailer" => 1,
+  "X-Msgconvert" => 1,
+  "X-MS-Tnef-Correlator" => 1,
+  "X-MS-Has-Attach" => 1,
+};
+# Encoding id (second half of an item name) for which _ParseItemName
+# emits its one-time "Unicode fields ... unsupported" warning.
+use constant ENCODING_UNICODE => '001F';
+# Encoding ids that _ParseItemName accepts without an "Unknown encoding"
+# warning.
+use constant KNOWN_ENCODINGS => {
+    '000D' => 'Directory',
+    '001F' => 'Unicode',
+    '001E' => 'Ascii?',
+    '0102' => 'Binary',
+};
+# The three MAP_* tables below drive _MapProperty. Each maps a property id
+# to [ KEY, STRIP ]: the data is stored under KEY in the target hash, and
+# when STRIP is true all NUL bytes are removed from the data first.
+#
+# Properties of an attachment directory entry.
+use constant MAP_ATTACHMENT_FILE => {
+  '3701' => ["DATA",        0], # Data
+  '3704' => ["SHORTNAME",   1], # Short file name
+  '3707' => ["LONGNAME",    1], # Long file name
+  '370E' => ["MIMETYPE",    1], # mime type
+  '3716' => ["DISPOSITION", 1], # disposition
+};
+# Properties found directly below the root entry (stored on the parser
+# object itself).
+use constant MAP_SUBITEM_FILE => {
+  '1000' => ["BODY_PLAIN",      0], # Body
+  '1013' => ["BODY_HTML",       0], # HTML Version of body
+  '0037' => ["SUBJECT",         1], # Subject
+  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
+  '007D' => ["HEAD",            1], # Full headers
+  '0C1A' => ["FROM",            1], # Reply-To: Name
+  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
+  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
+  '0E04' => ["TO",              1], # To: Names
+  '0E03' => ["CC",              1], # Cc: Names
+  '1035' => ["MESSAGEID",       1], # Message-Id
+  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
+};
+# Properties of a recipient (address) directory entry. Note that two ids
+# each map to TYPE and to ADDRESS.
+use constant MAP_ADDRESSITEM_FILE => {
+  '3001' => ["NAME",            1], # Real name
+  '3002' => ["TYPE",            1], # Address type
+  '403D' => ["TYPE",            1], # Address type
+  '3003' => ["ADDRESS",         1], # Address
+  '403E' => ["ADDRESS",         1], # Address
+  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
+};
+#
+# Main body of module
+#
+# Constructor. May be invoked on a class name or on an existing object
+# (whose class is then reused). Returns a blessed hash with empty
+# attachment/address lists and verbosity switched off.
+sub new {
+  my ($proto) = @_;
+  my %state = (
+    ATTACHMENTS => [],
+    ADDRESSES => [],
+    VERBOSE => 0,
+    HAS_UNICODE => 0,
+    FROM_ADDR_TYPE => "",
+  );
+  return bless \%state, (ref($proto) || $proto);
+}
+#
+# Main sub: parse the PPS tree, and return
+#
+# Entry point: walk the given PPS tree and collect its contents in $self.
+# Dies when no tree is passed.
+sub parse {
+  my ($self, $tree) = @_;
+  $tree or die "Internal error: No PPS tree";
+  $self->_RootDir($tree);
+}
+# Build and return a MIME::Entity for the parsed message.
+#
+# A multipart/mixed entity is produced when _IsMultiPart says so (both
+# body variants present, or at least one attachment); otherwise a single
+# text/plain or text/html entity is built from whichever body exists.
+# When the message has no body and no attachments at all, an empty
+# text/plain entity is returned so that the header routines always have
+# an object to work on.
+sub mime_object {
+  my $self = shift;
+  my $bodymime;
+  my $mime;
+  if ($self->_IsMultiPart) {
+    # Construct a multipart message object
+    $mime = MIME::Entity->build(Type => "multipart/mixed");
+    # Set the entity that we'll save the body parts to. If there's more than
+    # one part, it's a new entity, otherwise, it's the main $mime object.
+    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
+      $bodymime = MIME::Entity->build(
+        Type => "multipart/alternative",
+        Encoding => "8bit",
+      );
+      $mime->add_part($bodymime);
+    } else {
+      $bodymime = $mime;
+    }
+    if ($self->{BODY_PLAIN}) {
+      $self->_SaveAttachment($bodymime, {
+        MIMETYPE => 'text/plain; charset=ISO-8859-1',
+        ENCODING => '8bit',
+        DATA => $self->{BODY_PLAIN},
+        DISPOSITION => 'inline',
+      });
+    }
+    if ($self->{BODY_HTML}) {
+      $self->_SaveAttachment($bodymime, {
+        MIMETYPE => 'text/html',
+        ENCODING => '8bit',
+        DATA => $self->{BODY_HTML},
+        DISPOSITION => 'inline',
+      });
+    }
+    # Real attachments always hang off the outer multipart/mixed entity.
+    foreach my $att (@{$self->{ATTACHMENTS}}) {
+      $self->_SaveAttachment($mime, $att);
+    }
+  } elsif ($self->{BODY_PLAIN}) {
+    # Construct a single part message object with a plain text body
+    $mime = MIME::Entity->build(
+      Type => "text/plain",
+      Data => $self->{BODY_PLAIN}
+    );
+  } elsif ($self->{BODY_HTML}) {
+    # Construct a single part message object with an HTML body
+    $mime = MIME::Entity->build(
+      Type => "text/html",
+      Data => $self->{BODY_HTML}
+    );
+  } else {
+    # Nothing usable was found (no body, no attachments). Without this
+    # branch $mime would stay undefined and the header routines below
+    # would fail when calling ->head on it.
+    $mime = MIME::Entity->build(
+      Type => "text/plain",
+      Data => ""
+    );
+  }
+  $self->_CopyHeaderData($mime);
+  $self->_SetHeaderFields($mime);
+  return $mime;
+}
+# Actually output the message in mbox format
+# Write the message to STDOUT in mbox format: a "From " separator line,
+# the MIME object, and a trailing blank line.
+sub print {
+  my ($self) = @_;
+  my $mime = $self->mime_object;
+  # Construct From line from whatever we know: the sender address is only
+  # used when its type is SMTP, otherwise a placeholder stands in.
+  my $sender = 'someone@somewhere';
+  if ($self->{FROM_ADDR_TYPE} eq "SMTP") {
+    $sender = $self->{FROM_ADDR};
+  }
+  $sender =~ s/\n//g;
+  # The date used here is not really important.
+  print "From ", $sender, " ", scalar localtime, "\n";
+  $mime->print(\*STDOUT);
+  print "\n";
+}
+# Store the verbosity level on the object; dies when the level is undefined.
+sub set_verbosity {
+  my $self = shift;
+  my $level = shift;
+  die "Internal error: no verbosity level" unless defined $level;
+  $self->{VERBOSE} = $level;
+}
+#
+# Below are functions that walk the PPS tree. The *Dir functions handle
+# processing the directory nodes of the tree (mainly, iterating over the
+# children), whereas the *Item functions handle processing the items in the
+# directory (if such an item is itself a directory, it will in turn be
+# processed by the relevant *Dir function).
+#
+#
+# _RootDir: Check Root Entry, parse sub-entries.
+# The OLE file consists of a single entry called Root Entry, which has
+# several children. These children are parsed in the sub _SubItem.
+#
+# Hand every child of the given (root) entry to _SubItem, in order.
+sub _RootDir {
+  my ($self, $PPS) = @_;
+  my $children = $PPS->{Child};
+  foreach my $entry (@$children) {
+    $self->_SubItem($entry);
+  }
+}
+# Dispatch one child of the root entry by its type: directories go to
+# _SubItemDir, files to _SubItemFile; anything else only draws a warning.
+sub _SubItem {
+  my ($self, $PPS) = @_;
+  my $type = $PPS->{Type};
+  return $self->_SubItemDir($PPS) if $type == DIR_TYPE;
+  return $self->_SubItemFile($PPS) if $type == FILE_TYPE;
+  warn "Unknown entry type: $PPS->{Type}";
+}
+# Handle a directory entry below the root: record the OLE date (first one
+# wins, see _GetOLEDate), then treat the directory as a recipient, an
+# attachment, or an unknown directory depending on its name.
+sub _SubItemDir {
+  my ($self, $PPS) = @_;
+  $self->_GetOLEDate($PPS);
+  my $dirname = $self->_GetName($PPS);
+  if ($dirname =~ /__recip_version1 0_ /) {
+    # Address of one recipient
+    return $self->_AddressDir($PPS);
+  }
+  if ($dirname =~ /__attach_version1 0_ /) {
+    # Attachment
+    return $self->_AttachmentDir($PPS);
+  }
+  $self->_UnknownDir($dirname);
+}
+# Handle a file entry below the root: store its data on $self according to
+# MAP_SUBITEM_FILE, or report it through _UnknownFile when not mapped.
+sub _SubItemFile {
+  my ($self, $PPS) = @_;
+  my $itemname = $self->_GetName($PPS);
+  my ($property) = $self->_ParseItemName($itemname);
+  my $mapped = $self->_MapProperty($self, $PPS->{Data}, $property,
+    MAP_SUBITEM_FILE);
+  $self->_UnknownFile($itemname) unless $mapped;
+}
+# Collect the properties of one recipient directory into a fresh record
+# and append that record to the ADDRESSES list.
+sub _AddressDir {
+  my ($self, $PPS) = @_;
+  my %recipient = (
+    NAME        => undef,
+    ADDRESS     => undef,
+    TYPE        => "",
+  );
+  foreach my $entry (@{$PPS->{Child}}) {
+    $self->_AddressItem($entry, \%recipient);
+  }
+  push @{$self->{ADDRESSES}}, \%recipient;
+}
+# Process one child of a recipient directory. File entries are stored in
+# $addr_info according to MAP_ADDRESSITEM_FILE; directory entries are not
+# expected here and are reported as unknown.
+sub _AddressItem {
+  my ($self, $PPS, $addr_info) = @_;
+  my $entry = $self->_GetName($PPS);
+  my $type = $PPS->{Type};
+  if ($type == DIR_TYPE) {
+    # DIR Entries: There should be none.
+    $self->_UnknownDir($entry);
+    return;
+  }
+  if ($type != FILE_TYPE) {
+    warn "Unknown entry type: $PPS->{Type}";
+    return;
+  }
+  my ($property) = $self->_ParseItemName($entry);
+  $self->_MapProperty($addr_info, $PPS->{Data}, $property,
+    MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($entry);
+}
+# Collect the properties of one attachment directory into a fresh record
+# (pre-filled with generic binary-attachment defaults) and append it to
+# the ATTACHMENTS list.
+sub _AttachmentDir {
+  my ($self, $PPS) = @_;
+  my %record = (
+    SHORTNAME   => undef,
+    LONGNAME    => undef,
+    MIMETYPE    => 'application/octet-stream',
+    ENCODING    => 'base64',
+    DISPOSITION => 'attachment',
+    DATA        => undef
+  );
+  foreach my $entry (@{$PPS->{Child}}) {
+    $self->_AttachmentItem($entry, \%record);
+  }
+  push @{$self->{ATTACHMENTS}}, \%record;
+}
+# Process one child of an attachment directory.
+#  - file entry: stored in $att_info according to MAP_ATTACHMENT_FILE;
+#  - directory entry with property 3701: a nested MSG file, which is parsed
+#    by a separate MSGParser and attached as message/rfc822;
+#  - any other directory or entry type is reported.
+sub _AttachmentItem {
+  my ($self, $PPS, $att_info) = @_;
+  my $entry = $self->_GetName($PPS);
+  my ($property) = $self->_ParseItemName($entry);
+  my $type = $PPS->{Type};
+  if ($type == FILE_TYPE) {
+    $self->_MapProperty($att_info, $PPS->{Data}, $property,
+      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($entry);
+  } elsif ($type == DIR_TYPE) {
+    if ($property eq '3701') {  # Nested MSG file
+      my $nested = new MSGParser();
+      $nested->parse($PPS);
+      $att_info->{DATA} = $nested->mime_object->as_string;
+      $att_info->{MIMETYPE} = 'message/rfc822';
+      $att_info->{ENCODING} = '8bit';
+    } else {
+      $self->_UnknownDir($entry);
+    }
+  } else {
+    warn "Unknown entry type: $PPS->{Type}";
+  }
+}
+# Store $data in %$hash under the key that $map assigns to $property.
+# A $map entry is [ KEY, STRIP ]; with a true STRIP all NUL bytes are
+# removed from the data first.
+# Returns 1 when the property was mapped, 0 when $property is undefined
+# or not present in $map.
+sub _MapProperty {
+  my ($self, $hash, $data, $property, $map) = @_;
+  return 0 unless defined $property;
+  my $entry = $map->{$property};
+  return 0 unless $entry;
+  my ($key, $strip_nuls) = @$entry;
+  $data =~ s/\000//g if $strip_nuls;
+  $hash->{$key} = $data;
+  return 1;
+}
+# Report a directory entry that is not converted. The well-known
+# "__nameid_version1 0" directory is only mentioned in verbose mode; any
+# other name always produces a warning.
+sub _UnknownDir {
+  my ($self, $name) = @_;
+  unless ($name eq '__nameid_version1 0') {
+    warn "Unknown DIR entry $name\n";
+    return;
+  }
+  warn "Skipping DIR entry $name (Introductory stuff)\n"
+    if $self->{VERBOSE};
+  return;
+}
+# Report a file entry that was not mapped to any field.
+# Known-but-ignored items (the properties stream, ids listed in
+# $skipproperties, ids starting with "80") are only mentioned in verbose
+# mode; everything else always produces a warning.
+sub _UnknownFile {
+  my ($self, $name) = @_;
+  my $chatty = $self->{VERBOSE};
+  if ($name eq '__properties_version1 0') {
+    warn "Skipping FILE entry $name (Properties)\n" if $chatty;
+    return;
+  }
+  my ($property) = $self->_ParseItemName($name);
+  if (!defined $property) {
+    warn "Unknown FILE entry $name\n";
+    return;
+  }
+  my $reason = $skipproperties->{$property};
+  if ($reason) {
+    warn "Skipping property $property ($reason)\n" if $chatty;
+  } elsif ($property =~ /^80/) {
+    warn "Skipping property $property (user-defined property)\n" if $chatty;
+  } else {
+    warn "Unknown property $property\n";
+  }
+  return;
+}
+#
+# Helper functions
+#
+# Return the entry's name, converted with OLE::Storage_Lite::Ucs2Asc and
+# then passed through _NormalizeWhiteSpace.
+sub _GetName {
+  my ($self, $PPS) = @_;
+  my $ascii = OLE::Storage_Lite::Ucs2Asc($PPS->{Name});
+  return $self->_NormalizeWhiteSpace($ascii);
+}
+# Return $name with every non-word character (\W) replaced by one space.
+sub _NormalizeWhiteSpace {
+  my $self = shift;
+  (my $cleaned = shift) =~ s/\W/ /g;
+  return $cleaned;
+}
+# Remember a formatted date taken from the entry's time stamps, unless one
+# has been stored already. Time2nd is preferred, Time1st is the fallback;
+# nothing is stored when neither is set.
+sub _GetOLEDate {
+  my ($self, $PPS) = @_;
+  return if defined $self->{OLEDATE};
+  # Make Date
+  my $stamp = $PPS->{Time2nd} || $PPS->{Time1st};
+  $self->{OLEDATE} = $self->_FormatDate($stamp) if $stamp;
+}
+# Turn an array ref of mktime() arguments into a date string suitable for
+# a mail header.
+sub _FormatDate {
+  my ($self, $datearr) = @_;
+  # TODO: This is a little convoluted. Directly using strftime didn't seem
+  # to work.
+  my $epoch = mktime(@$datearr);
+  my $formatted = time2str("%a, %d %h %Y %X %z", $epoch);
+  return $formatted;
+}
+# If we didn't get the date from the original header data, we may be able
+# to get it from the SUBMISSION_ID:
+# It seems to have the format of a semicolon-separated list of key=value
+# pairs. The key l has a value with the format:
+# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
+# the format YYMMDDHHMMSS.
+# Extract the submission time from SUBMISSION_ID and return it as a
+# formatted date string, or undef when there is no submission id or it
+# does not contain a YYMMDDHHMMSS time stamp in the expected place.
+sub _SubmissionIdDate {
+  my $self = shift;
+  my $submission_id = $self->{SUBMISSION_ID} or return undef;
+  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
+    or return undef;
+  my $year = $1;
+  # mktime() takes the year as an offset from 1900. Use the customary
+  # two-digit-year window (00-68 => 20xx, 69-99 => 19xx); the previous
+  # cut-off of 20 turned every date from 2020 onwards into 1920 onwards.
+  $year += 100 if $year < 69;
+  # Captures are year, month, day, hour, minute, second; mktime() wants
+  # them in the opposite order and with a zero-based month.
+  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
+}
+# Split an item name of the form "__substg1 0_PPPPEEEE" into its property
+# id (PPPP) and encoding id (EEEE) and return both. Returns (undef, undef)
+# for any other name.
+# Side effects: warns once per object about Unicode-encoded fields, and
+# warns about every encoding id missing from KNOWN_ENCODINGS.
+sub _ParseItemName {
+  my ($self, $name) = @_;
+  unless ($name =~ /^__substg1 0_(....)(....)$/) {
+    return (undef, undef);
+  }
+  my $property = $1;
+  my $encoding = $2;
+  if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
+    warn "This MSG file contains Unicode fields."
+      . " This is currently unsupported.\n";
+    # Remember that the warning has been issued.
+    $self->{HAS_UNICODE} = 1;
+  } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
+    warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
+  }
+  return ($property, $encoding);
+}
+# Attach one part described by %$att (MIMETYPE, ENCODING, DISPOSITION,
+# LONGNAME/SHORTNAME, DATA) to the entity $mime. The part is created with
+# an empty body first and the data is then written through its handle.
+sub _SaveAttachment {
+  my ($self, $mime, $att) = @_;
+  # The long file name wins whenever it is set.
+  my $filename = $att->{LONGNAME} || $att->{SHORTNAME};
+  my $part = $mime->attach(
+    Type => $att->{MIMETYPE},
+    Encoding => $att->{ENCODING},
+    Data => [],
+    Filename => $filename,
+    Disposition => $att->{DISPOSITION}
+  );
+  my $out = $part->open("w");
+  unless ($out) {
+    warn "Could not write data!";
+    return;
+  }
+  $out->print($att->{DATA});
+  $out->close;
+}
+# Record one piece ($partname) of address information for the address
+# identified by $adrname, after removing NUL bytes from $data. An already
+# recorded value is never overwritten: an identical duplicate is skipped
+# (mentioned only in verbose mode) and a differing value is reported.
+#
+# NOTE(review): this looks up ADDRESSES as a hash keyed by $adrname, but
+# new() initialises ADDRESSES as an array ref and _AddressDir pushes onto
+# it, so this access would fail as written. No caller is visible in this
+# file section - confirm whether this sub is still needed.
+sub _SetAddressPart {
+  my ($self, $adrname, $partname, $data) = @_;
+  my $address = $self->{ADDRESSES}->{$adrname};
+  $data =~ s/\000//g;
+  #warn "Processing address data part $partname : $data\n";
+  if (defined ($address->{$partname})) {
+    if ($address->{$partname} eq $data) {
+      warn "Skipping duplicate but identical address information for"
+      . " $partname\n" if $self->{VERBOSE};
+    } else {
+      warn "Address information $partname inconsistent:\n";
+      warn "    Original data: $address->{$partname}\n";
+      warn "    New data: $data\n";
+    }
+  } else {
+    $address->{$partname} = $data;
+  }
+}
+# Set header fields
+# Add header $fieldname with $value to $mime, but only when the header has
+# no (true) value yet and $value itself is true.
+sub _AddHeaderField {
+  my ($self, $mime, $fieldname, $value) = @_;
+  my $head = $mime->head;
+  my $existing = $head->get($fieldname);
+  if ($existing) {
+    return;
+  }
+  if ($value) {
+    $mime->head->add($fieldname, $value);
+  }
+}
+# Return "NAME <ADDRESS>" built from the fields $tag and "${tag}_ADDR" of
+# $self; a missing field contributes an empty string.
+sub _Address {
+  my ($self, $tag) = @_;
+  my $display = $self->{$tag};
+  my $addr = $self->{$tag . "_ADDR"};
+  $display = "" unless $display;
+  $addr = "" unless $addr;
+  return $display . " <" . $addr . ">";
+}
+# Find SMTP addresses for the given list of names
+# Turn a "; "-separated list of display names into a ", "-separated list
+# in which every name that matches a collected recipient record is
+# expanded to "NAME <address>". The address is the record's SMTPADDRESS if
+# present, else its ADDRESS when the record's TYPE is "SMTP", else empty.
+# Names without a matching record are passed through unchanged.
+# Returns the empty string when $names is undefined.
+sub _ExpandAddressList {
+  my ($self, $names) = @_;
+  # The message may simply have no such list (e.g. no Cc); avoid
+  # splitting an undefined value.
+  defined $names or return "";
+  my $addresspool = $self->{ADDRESSES};
+  my @namelist = split /; */, $names;
+  my @result;
+  name: foreach my $name (@namelist) {
+    foreach my $address (@$addresspool) {
+      # Recipient records start out with an undefined NAME, so guard the
+      # comparison.
+      next unless defined $address->{NAME} and $name eq $address->{NAME};
+      my $addresstext = $address->{NAME} . " <";
+      if (defined ($address->{SMTPADDRESS})) {
+        $addresstext .= $address->{SMTPADDRESS};
+      } elsif (defined $address->{TYPE} and $address->{TYPE} eq "SMTP"
+               and defined $address->{ADDRESS}) {
+        $addresstext .= $address->{ADDRESS};
+      }
+      $addresstext .= ">";
+      push @result, $addresstext;
+      # First matching record wins; continue with the next name.
+      next name;
+    }
+    push @result, $name;
+  }
+  return join ", ", @result;
+}
+# Parse the original full header block and return its (unfolded) head
+# object. Returns undef when no header data is available or when the data
+# cannot be parsed; in the latter case a warning is issued.
+sub _ParseHead {
+  my ($self, $data) = @_;
+  defined $data or return undef;
+  # Parse full header data if we got that.
+  my $parser = new MIME::Parser();
+  $parser->output_to_core(1);
+  $parser->decode_headers(1);
+  # Replace a leading "Microsoft Mail..." line by a regular header line.
+  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
+  # A parse failure may be signalled by an exception or by a false return
+  # value; treat both as "no usable headers" rather than going on to call
+  # a method on an undefined entity.
+  my $entity = eval { $parser->parse_data($data) };
+  unless ($entity) {
+    warn "Couldn't parse full headers!";
+    return undef;
+  }
+  my $head = $entity->head;
+  $head->unfold;
+  return $head;
+}
+# Find out if we need to construct a multipart message
+# True when the message needs a multipart container: both an HTML and a
+# plain body are present, or there is at least one attachment.
+sub _IsMultiPart {
+  my ($self) = @_;
+  my $both_bodies = ($self->{BODY_HTML} and $self->{BODY_PLAIN});
+  return $both_bodies || (scalar(@{$self->{ATTACHMENTS}}) > 0);
+}
+# Copy original header data.
+# Note: This should contain the Date: header.
+# Copy every header of the original message (HEAD) into $mime, except the
+# tags listed in $skipheaders. Does nothing when no head could be parsed.
+sub _CopyHeaderData {
+  my ($self, $mime) = @_;
+  my $head = $self->_ParseHead($self->{HEAD});
+  return unless $head;
+  my @wanted = grep { !$skipheaders->{$_} } $head->tags;
+  foreach my $tag (@wanted) {
+    my @values = $head->get_all($tag);
+    foreach my $value (@values) {
+      $mime->head->add($tag, $value);
+    }
+  }
+}
+# Set header fields
+# Fill in headers from the parsed properties. _AddHeaderField never
+# replaces a header that already has a value, so the order below is a
+# priority order (notably for Date).
+sub _SetHeaderFields {
+  my ($self, $mime) = @_;
+  my @fields = (
+    # If we didn't get the date from the original header data, we may be
+    # able to get it from the SUBMISSION_ID:
+    [ 'Date',        $self->_SubmissionIdDate() ],
+    # Third and last chance to set the Date: header; this uses the date
+    # the MSG file was saved.
+    [ 'Date',        $self->{OLEDATE} ],
+    [ 'Subject',     $self->{SUBJECT} ],
+    [ 'From',        $self->_Address("FROM") ],
+    #[ 'Reply-To',   $self->_Address("REPLYTO") ],
+    [ 'To',          $self->_ExpandAddressList($self->{TO}) ],
+    [ 'Cc',          $self->_ExpandAddressList($self->{CC}) ],
+    [ 'Message-Id',  $self->{MESSAGEID} ],
+    [ 'In-Reply-To', $self->{INREPLYTO} ],
+  );
+  foreach my $field (@fields) {
+    $self->_AddHeaderField($mime, $field->[0], $field->[1]);
+  }
+}
+package main;
+use Getopt::Long;
+use Pod::Usage;
+# Setup command line processing.
+# Command line flags: --verbose, and --help / -? (print help and exit).
+my ($verbose, $help) = ('', '');
+pod2usage(2)
+  unless GetOptions('verbose' => \$verbose, 'help|?' => \$help);
+if ($help) {
+  pod2usage(1);
+}
+# The first remaining argument is the MSG file to convert.
+my $file = $ARGV[0];
+pod2usage(2) unless defined $file;
+if ($verbose) {
+  warn "Will parse file: $file\n";
+}
+# Load and parse MSG file (is OLE)
+my $Msg = OLE::Storage_Lite->new($file);
+my $PPS = $Msg->getPpsTree(1);
+die "$file must be an OLE file" unless $PPS;
+# Walk the PPS tree and write the result to stdout in mbox format.
+my $parser = new MSGParser();
+if ($verbose) {
+  $parser->set_verbosity(1);
+}
+$parser->parse($PPS);
+$parser->print();
+#
+# Usage info follows.
+#
+__END__
+=head1 NAME
+msgconvert.pl - Convert Outlook .msg files to mbox format
+=head1 SYNOPSIS
+msgconvert.pl [options] <file.msg>
+  Options:
+    --verbose   be verbose
+    --help      help message
+=head1 OPTIONS
+=over 8
+=item B<--verbose>
+    Print information about skipped parts of the .msg file.
+=item B<--help>
+    Print a brief help message.
+=back
+=head1 DESCRIPTION
+This program will output the message contained in file.msg in mbox format
+on stdout. It will complain about unrecognized OLE parts on
+stderr.
+=head1 BUGS
+Not all data that's in the .MSG file is converted. There simply are some
+parts whose meaning escapes me. One of these must contain the date the
+message was sent, for example. Formatting of text messages will also be
+lost. YMMV.
+=cut

xapian-omega-1.0.7a/omega.cc

diff -u  xapian-omega-1.0.7a/omega.cc.orig

-              old
+              new
+        }
+    }
+    // filter by URL substring
+    val = cgi_params.find("U");
+    if (val != cgi_params.end()) {
+        string url = val->second;
+        if (!url.empty()) {
+            filters += ("U" + url + "*");
+            filters += filter_sep;
+        }
+    }
     // date range filters
     val = cgi_params.find("START");
     if (val != cgi_params.end()) date_start = val->second;

xapian-omega-1.0.7a/omega.conf.in

diff -u  xapian-omega-1.0.7a/omega.conf.in.orig

-              old
+              new
+# Directory containing Xapian databases:
+database_dir @localstatedir@/omega/data
+# Directory containing OmegaScript templates:
+template_dir @localstatedir@/omega/templates
+# Directory to write Omega logs to:
+log_dir      /var/log/omega
+# Directory containing any cdb files for the $lookup OmegaScript command:
+cdb_dir      @localstatedir@/omega/cdb
+# Directory containing extracted archives:
+cache_dir    @localstatedir@/omega/cache

xapian-omega-1.0.7a/omindex.cc

diff -u  xapian-omega-1.0.7a/omindex.cc.orig

-              old
+              new
  * Copyright 2001,2005 James Aylett
  * Copyright 2001,2002 Ananova Ltd
  * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts
+ * Copyright 2006,2007,2008 AVL List GesmbH
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
 …
 #include <xapian.h>
 #include "commonhelp.h"
+#include "configfile.h"
 #include "diritor.h"
 #include "hashterm.h"
 #include "loadfile.h"
 …
 extern char * mkdtemp(char *);
 #endif
+#ifndef LIBEXECDIR
+// must have ending slash
+//# define LIBEXECDIR "/usr/lib/omega/bin/"
+# define LIBEXECDIR ""
+#endif
+#ifndef PKGDATADIR
+// must have ending slash
+# define PKGDATADIR "/usr/share/omega/"
+#endif
 using namespace std;
 #define TITLE_SIZE 128
 …
 #define PROG_NAME "omindex"
 #define PROG_DESC "Index static website data via the filesystem"
+/* used in runfilter.cc */
+bool verbose = false;
+string error_log;
 static bool skip_duplicates = false;
 static bool follow_symlinks = false;
+static bool nocleanup = false;
+static bool silent = false;
 static string dbpath;
 static string root;
 static string indexroot;
 …
+static void
+index_cached_directory(size_t depth_limit,
+                       const string &file,
+                       const string &url,
+                       const string &ext,
+                       const string &cmd,
+                       map<string, string>& mime_map);
+static
+int mkdir_p(const string &path, mode_t mode);
 inline static bool
 p_notalnum(unsigned int c)
+{
 …
+    cout << "Indexing \"" << url << "\" as " << mimetype << " ... " << flush;
+    if (!silent)
+        cout << "Indexing \"" << url.substr(1) << "\" as " << mimetype << " ... " << flush;
     string urlterm("U");
     urlterm += baseurl;
 …
             // indexing is disallowed
+        }
         if (!p.indexing_allowed) {
+            cout << "indexing disallowed by meta tag - skipping" << endl;
+            if (!silent)
+                cout << "indexing disallowed by meta tag - skipping" << endl;
             return;
+        }
         dump = p.dump;
 …
             return;
+        }
         md5_string(dump, md5);
+#if 0 // FIXME: this won't work as omindex will have the database locked...
+    } else if (mimetype == "message/rfc822") { // // => mbox2script
+        //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla)
+        string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| "
+            "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script";
+        try {
+            dump = stdout_to_string(cmd);
+        } catch (ReadError) {
+            cout << "\"" << cmd << "\" failed - skipping" << endl;
+            return;
+        }
+#endif
     } else if (mimetype == "application/pdf") {
         string safefile = shell_protect(file);
 …
     } else if (mimetype == "text/rtf") {
         // The --text option unhelpfully converts all non-ASCII characters to
         // "?" so we use --html instead, which produces HTML entities.
         string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file);
+        string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) + error_log;
         MyHtmlParser p;
         try {
             p.parse_html(stdout_to_string(cmd));
-…
+               (this hunk was shorter than expected)
             Xapian::docid did = db.replace_document(urlterm, newdocument);
             if (did < updated.size()) {
                 updated[did] = true;
+                if (!silent)
                 cout << "updated." << endl;
             } else {
+                if (!silent)
                 cout << "added." << endl;
+            }
         } catch (...) {
             // FIXME: is this ever actually needed?
             db.add_document(newdocument);
+            if (!silent)
             cout << "added (failed re-seek for duplicate)." << endl;
+        }
     } else {
         // If this were a duplicate, we'd have skipped it above.
         db.add_document(newdocument);
+        if (!silent)
         cout << "added." << endl;
+    }
+}
+/* Note: switched to cache_dir as root for virtual directories,
+   because /srcdir/.zip might not be creatable. */
 static void
 index_directory(size_t depth_limit, const string &dir,
                 map<string, string>& mime_map)
+{
     string path = root + indexroot + dir;
+    cout << "[Entering directory " << dir << "]" << endl;
+    if (!silent)
+        cout << "[Entering directory " << dir.substr(1) << "]" << endl;
     DirectoryIterator d(follow_symlinks);
     try {
+        d.start(path);
+        while (d.next()) try {
+            string url = dir;
+            if (!url.empty() && url[url.size() - 1] != '/') url += '/';
+            url += d.leafname();
+            string file = root + indexroot + url;
+            switch (d.get_type()) {
+                case DirectoryIterator::DIRECTORY:
+                    if (depth_limit == 1) continue;
+                    try {
+                        size_t new_limit = depth_limit;
+                        if (new_limit) --new_limit;
+                        index_directory(new_limit, url, mime_map);
+                    } catch (...) {
+                        cout << "Caught unknown exception in index_directory, rethrowing" << endl;
+                        throw;
+        d.start(root + indexroot + dir);
+    } catch (const std::string & error) {
+        cout << error << " - skipping" << endl;
+        return;
+    }
+    while (d.next()) try {
+        struct stat statbuf;
+        string url = dir;
+        if (!url.empty() && url[url.size() - 1] != '/') url += '/';
+        url += d.leafname();
+        string file = root + indexroot + url;
+        switch (d.get_type()) {
+            case DirectoryIterator::DIRECTORY:
+                if (depth_limit == 1) continue;
+                try {
+                    size_t new_limit = depth_limit;
+                    if (new_limit) --new_limit;
+                    index_directory(new_limit, url, mime_map);
+                } catch (...) {
+                    cout << "Caught unknown exception in index_directory, rethrowing" << endl;
+                    throw;
+                }
+                continue;
+            case DirectoryIterator::REGULAR_FILE: {
+                string ext;
+                string::size_type dot = url.find_last_of('.');
+                if (dot != string::npos) ext = url.substr(dot + 1);
+                if (!ext.empty()) {
+                    ext = string(ext); // lowercase ext
+                    for (unsigned int i=0; i<ext.length(); i++) {
+                        ext[i] = tolower(ext[i]);
+                    }
+                case DirectoryIterator::REGULAR_FILE: {
+                    string ext;
+                    string::size_type dot = url.find_last_of('.');
+                    if (dot != string::npos) ext = url.substr(dot + 1);
+                    map<string,string>::iterator mt = mime_map.find(ext);
+                    if (mt == mime_map.end()) {
+                        // If the extension isn't found, see if the lower-cased
+                        // version (if different) is found.
+                        bool changed = false;
+                        string::iterator i;
+                        for (i = ext.begin(); i != ext.end(); ++i) {
+                            if (*i >= 'A' && *i <= 'Z') {
+                                *i = tolower(*i);
+                                changed = true;
+                            }
+                }
+                if (strcmp(d.leafname(), "mbox") == 0) {
+                    // Special filename.
+                    off_t size = d.get_size();
+                    time_t mtime = d.get_mtime();
+                    index_file(indexroot + url, "message/rfc822", mtime, size);
+                    continue;
+                }
+                map<string,string>::iterator mt = mime_map.find(ext);
+                if (mt == mime_map.end()) {
+                    // If the extension isn't found, see if the lower-cased
+                    // version (if different) is found.
+                    bool changed = false;
+                    string::iterator i;
+                    for (i = ext.begin(); i != ext.end(); ++i) {
+                        if (*i >= 'A' && *i <= 'Z') {
+                            *i = tolower(*i);
+                            changed = true;
+                        }
+                    }
+                    if (changed) mt = mime_map.find(ext);
+                }
+                if (mt != mime_map.end()) {
+                    string oldroot = root;
+                    // Only check the file size if we recognise the
+                    // extension to avoid a call to stat()/lstat() for
+                    // files we can't handle when readdir() tells us the
+                    // file type.
+                    off_t size = d.get_size();
+                    if (size == 0) {
+                        cout << "Skipping empty file: \"" << file << "\""
+                             << endl;
+                        continue;
+                    }
+#ifndef _MSC_VER
+                    // NOTE: unpacking does not work on MSWin32 this way!
+                    // we'd really have to pull in utils.cc:rmdir from xapian-core
+                    if (ext == "zip") {
+                        if (depth_limit == 1) {
+                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
+                            continue;
+                        }
+                        // overwrite
+                        string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+indexroot+url+"/");
+                        try {
+                            size_t new_limit = depth_limit;
+                            if (new_limit) --new_limit;
+                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
+                        } catch (ReadError) {
+                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
+                            root = oldroot;
+                        } catch (...) {
+                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
+                            root = oldroot;
+                            throw;
+                        }
                         if (changed) mt = mime_map.find(ext);
+                        continue;
+                    }
+                    if (mt != mime_map.end()) {
+                        // Only check the file size if we recognise the
+                        // extension to avoid a call to stat()/lstat() for
+                        // files we can't handle when readdir() tells us the
+                        // file type.
+                        off_t size = d.get_size();
+                        if (size == 0) {
+                            cout << "Skipping empty file: \"" << file << "\""
+                                 << endl;
+                    else if (ext == "rar") {
+                        if (depth_limit == 1) {
+                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
                             continue;
+                        }
+                        // It's in our MIME map so we know how to index it.
+                        const string & mimetype = mt->second;
+                        // overwrite
+                        string cmd = "unrar x -o+ " +shell_protect(file) + " "
+                            + shell_protect(cache_dir+"/.rar"+indexroot+url+"/");
+                        try {
+                            size_t new_limit = depth_limit;
+                            if (new_limit) --new_limit;
+                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
+                        } catch (ReadError) {
+                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
+                            root = oldroot;
+                        } catch (...) {
+                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
+                            root = oldroot;
+                            throw;
+                        }
+                        continue;
+                    }
+#ifdef HAVE_MSGCONVERT
+                    else if (ext == "msg") {
+                        struct stat statcache;
+                        char olddir[256];
+                        if (depth_limit == 1) {
+                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
+                            continue;
+                        }
+                        string cmd = LIBEXECDIR"outlook2text "+shell_protect(file);
+                        // unpack multiparts and attachments. so we have to chdir first
+                        string fulldir = cache_dir+"/.msg"+indexroot+url;
+                        getcwd(olddir,256);
+#ifdef HAVE_LSTAT
+                        lstat(fulldir.c_str(), &statcache);
+#else
+                        stat(fulldir.c_str(), &statcache);
+#endif
+                        if (!S_ISDIR(statcache.st_mode)) {
+                            mkdir_p(fulldir, 0755);
+                        }
                         try {
+                            time_t mtime = d.get_mtime();
+                            index_file(indexroot + url, mimetype, mtime, size);
+                        } catch (NoSuchFilter) {
+                            // FIXME: we ought to ignore by mime-type not
+                            // extension.
+                            cout << "Filter for \"" << mimetype
+                                 << "\" not installed - ignoring extension \""
+                                 << ext << "\"" << endl;
+                            mime_map.erase(mt);
+                        }
+                    } else {
+                        cout << "Unknown extension: \"" << file
+                             << "\" - skipping" << endl;
+                            chdir (fulldir.c_str());
+                            size_t new_limit = depth_limit;
+                            if (new_limit) --new_limit;
+                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
+                            chdir (olddir);
+                        } catch (ReadError) {
+                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
+                            chdir (olddir);
+                            root = oldroot;
+                        } catch (...) {
+                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
+                            chdir (olddir);
+                            root = oldroot;
+                            throw;
+                        }
+                        continue;
+                    }
+                    continue;
+                }
+                default:
+                    cout << "Not a regular file \"" << file
+#endif
+#ifdef HAVE_READPST
+                    else if (ext == "pst") {
+                        if (depth_limit == 1) {
+                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
+                            continue;
+                        }
+                        // unpack attachments also, together with mbox files
+                        string cmd = "readpst -r -cv -w -o "
+                            + shell_protect(cache_dir+"/.pst"+indexroot+url+"/")+" "+shell_protect(file);
+                        try {
+                            size_t new_limit = depth_limit;
+                            if (new_limit) --new_limit;
+                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
+                        } catch (ReadError) {
+                            root = oldroot;
+                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
+                        } catch (...) {
+                            root = oldroot;
+                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
+                            throw;
+                        }
+                        continue;
+                    }
+#endif
+#endif
+                    // It's in our MIME map so we know how to index it.
+                    const string & mimetype = mt->second;
+                    try {
+                        time_t mtime = d.get_mtime();
+                        index_file(indexroot + url, mimetype, mtime, size);
+                    } catch (NoSuchFilter) {
+                        // FIXME: we ought to ignore by mime-type not
+                        // extension.
+                        cout << "Filter for \"" << mimetype
+                             << "\" not installed - ignoring extension \""
+                             << ext << "\"" << endl;
+                        mime_map.erase(mt);
+                    }
+                } else {
+                    cout << "Unknown extension: \"" << file
                          << "\" - skipping" << endl;
+                }
+                continue;
+            }
         } catch (const std::string & error) {
             cout << error << " - skipping" << endl;
             continue;
+            default:
+                cout << "Not a regular file \"" << file
+                     << "\" - skipping" << endl;
+        }
     } catch (const std::string & error) {
+        cout << error << " - skipping directory" << endl;
+        return;
+        cout << error << " - skipping" << endl;
+        continue;
+    }
+}
+// Create directory `path` by running the shell's mkdir through
+// stdout_to_string(); on non-Windows builds "mkdir -p" is used, so missing
+// parent directories are created as well.
+// NOTE(review): `mode` is not used and the function always returns 0, so
+// the return value carries no information - confirm no caller relies on it.
+// NOTE(review): presumably a failing command surfaces as an exception from
+// stdout_to_string() rather than as a return code - verify.
+static
+int mkdir_p(const string &path, mode_t mode) {
+#ifdef __WIN32__
+    stdout_to_string("mkdir \""+shell_protect(path)+"\"");
+#else
+    stdout_to_string("mkdir -p "+shell_protect(path));
+#endif
+    return 0;
+}
+/*
+ * Unpack a container file (.msg/.pst/.rar/.zip) into the local cache dir
+ * and recurse there.
+ *
+ * depth_limit: recursion budget; 1 means the limit has been reached.
+ * file:        path of the container file on disk.
+ * url:         path of the container below the index root.
+ * ext:         extension of the container; names the cache subtree
+ *              (cache_dir + "/." + ext + ...).
+ * cmd:         shell command which extracts the container into the cache.
+ * mime_map:    extension to MIME type map, passed on to index_directory().
+ *
+ * The global `root` is switched to cache_dir while working and restored
+ * before every normal return.  It is not restored here when an exception
+ * propagates (e.g. from stdout_to_string()); the callers reset it in their
+ * catch handlers.
+ */
+static void
+index_cached_directory(size_t depth_limit,
+                       const string &file,
+                       const string &url,
+                       const string &ext,
+                       const string &cmd,
+                       map<string, string>& mime_map)
+{
+    string oldroot = root;
+    root = cache_dir;
+    string cache = root+"/."+ext+indexroot;
+    string cachedir = cache+url;
+    struct stat statfile, statcache;
+    bool extract_cache;
+    // A failed (l)stat() leaves its buffer unset, so clear st_mode on
+    // failure: S_ISDIR() and S_ISREG() are then both false for a missing
+    // path instead of testing indeterminate memory.
+#ifdef HAVE_LSTAT
+    if (lstat(file.c_str(), &statfile) != 0) statfile.st_mode = 0;
+    if (lstat(cachedir.c_str(), &statcache) != 0) statcache.st_mode = 0;
+#else
+    if (stat(file.c_str(), &statfile) != 0) statfile.st_mode = 0;
+    if (stat(cachedir.c_str(), &statcache) != 0) statcache.st_mode = 0;
+#endif
+    extract_cache = true;
+    // If cachedir exists, the file is older than cachedir, and cachedir was
+    // last modified more than 500 seconds ago, then it was already extracted.
+    // NOTE(review): this comment used to say "5 secs" while the code tests
+    // 500 - confirm which threshold is intended.
+    if (S_ISDIR(statcache.st_mode)
+        && S_ISREG(statfile.st_mode)
+        && (statfile.st_mtime < statcache.st_mtime)
+        && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call
+    {
+        // but is it in the database also? prevent from deleting skipped files
+        if (!silent)
+            cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction "
+                 // << statfile.st_mtime << " < " << statcache.st_mtime
+                 << endl;
+        extract_cache = false;
+    }
+    if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) )
+    {
+        // check database timestamp for cached container, esp. for cleaned up caches.
+        // if already in db we need not to extract again
+        string urlterm("U");
+        urlterm += baseurl;
+        urlterm += "/."+ext+indexroot+url;
+        if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
+            urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
+        {
+            // at first find the docid with the beginning urlterm and check its timestamp
+            Xapian::docid docid = 0;
+            Xapian::PostingIterator p = db.postlist_begin(urlterm);
+            if (p != db.postlist_end(urlterm)) {
+                docid = *p;
+            }
+            if (docid && !ignore_time) {
+                // new: first search value (1)
+                Xapian::Document doc = db.get_document(docid);
+                string lastmod;
+                if (doc.values_count())
+                    lastmod = doc.get_value(VALUE_LASTMOD);
+                if (!lastmod.empty()) {
+                    if (string_to_int(lastmod) >= statfile.st_mtime) {
+                        if (!silent)
+                            cout << "Cache "<< "."+ext+indexroot+url << " not newer. Ignored." << endl;
+                        // Mark the stored document as still current and stop;
+                        // nothing needs to be extracted or re-indexed.
+                        if (docid < updated.size()) {
+                            updated[docid] = true;
+                            root = oldroot;
+                            return;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (extract_cache) {
+        if (!silent)
+            cout << "[EXTRACT into cache " << cachedir << "]" << endl;
+        if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode))
+            cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" "
+                 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL)
+                 << endl;
+        if (!S_ISDIR(statcache.st_mode))
+            mkdir_p(cachedir, 0755);
+        stdout_to_string(cmd);
+#ifndef __WIN32__
+        stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir));
+#endif
+        // Re-check the cache dir after extraction; treat a failed (l)stat()
+        // as "not a directory" (see above).
+#ifdef HAVE_LSTAT
+        if (lstat(cachedir.c_str(), &statcache) != 0) statcache.st_mode = 0;
+#else
+        if (stat(cachedir.c_str(), &statcache) != 0) statcache.st_mode = 0;
+#endif
+    }
+    if (S_ISDIR(statcache.st_mode)) {
+        if (depth_limit == 1) {
+            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
+        } else {
+            // max loop 5, magic start: /.ext+file
+            index_directory(depth_limit+5, "/."+ext+url, mime_map);
+            if (!nocleanup) {
+                if (!silent)
+                    cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl;
+                rm_rf(cachedir);
+            }
+        }
+    }
+    else { // no -p would be fatal here
+        cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl;
+    }
+    root = oldroot;
+}
 int
 …
     static const struct option longopts[] = {
         { "help",       no_argument,            NULL, 'h' },
         { "version",    no_argument,            NULL, 'v' },
+        { "verbose",    no_argument,            NULL, 'V' },
+        { "silent",     no_argument,            NULL, 'S' },
         { "overwrite",  no_argument,            NULL, 'o' },
         { "duplicates", required_argument,      NULL, 'd' },
         { "preserve-nonduplicates",     no_argument,    NULL, 'p' },
 …
         { "depth-limit",required_argument,      NULL, 'l' },
         { "follow",     no_argument,            NULL, 'f' },
         { "stemmer",    required_argument,      NULL, 's' },
+        { "nocleanup",  no_argument,            NULL, 'c' },
+        { "cachedir",   required_argument,      NULL, 'C' },
         { 0, 0, NULL, 0 }
     };
 …
     mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template
     mime_map["ppt"] = "application/vnd.ms-powerpoint";
     mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
+#ifdef HAVE_READPST
+    //  Outlook messager folder
+    mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst)
+#endif
+#ifdef HAVE_MSGCONVERT
+    mime_map["msg"] = "application/vnd.ms-outlook";     // outlook2text via msgconvert.pl
+#endif
+    mime_map["mbox"] = "message/rfc822";                // => mbox2omega
     // Perl:
     mime_map["pl"] = "text/x-perl";
     mime_map["pm"] = "text/x-perl";
 …
     // DjVu:
     mime_map["djv"] = "image/vnd.djvu";
     mime_map["djvu"] = "image/vnd.djvu";
+#ifndef _MSC_VER
+    mime_map["zip"] = "application/x-zip"; // recursive scanning
+#  ifdef HAVE_UNRAR
+    mime_map["rar"] = "application/x-rar"; // recursive scanning
+#  endif
+#endif
+    read_config_file();
     while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpf", longopts, NULL)) != -1) {
+    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:C:lpfc", longopts, NULL))!=EOF) {
         switch (getopt_ret) {
         case 'h': {
             cout << PROG_NAME" - "PROG_DESC"\n\n"
 …
 "                                duplicate replace mode\n"
 "  -D, --db                 path to database to use\n"
 "  -U, --url                base url DIRECTORY represents (default: /)\n"
+"  -C, --cachedir           path to local cache to use (default from omega.conf)\n"
 "  -M, --mime-type          additional MIME mapping ext:type\n"
 "  -l, --depth-limit=LIMIT  set recursion limit (0 = unlimited)\n"
 "  -f, --follow             follow symbolic links\n"
+"  -c, --nocleanup          keep cache, don't delete temporary .zip,.rar,.pst,.msg cache folders\n"
 "      --overwrite          create the database anew (the default is to update\n"
+"                           if the database already exists)" << endl;
+"                           if the database already exists)"
+"      --verbose            Print commands also\n"
+"      --silent             Print only errors\n";
             print_stemmer_help("     ");
             print_help_and_version_help("     ");
             return 0;
 …
         case 'p': // don't delete unupdated documents
             preserve_unupdated = true;
             break;
+        case 'V':
+            verbose = true;
+            break;
+        case 'c':
+            nocleanup = true;
+            break;
         case 'l': { // Set recursion limit
             int arg = atoi(optarg);
             if (arg < 0) arg = 0;
 …
         case 'U':
             baseurl = optarg;
             break;
+        case 'C':
+            cache_dir = optarg;
+            break;
         case 'o': // --overwrite
             overwrite = true;
             break;
 …
     if (baseurl.empty()) {
         cerr << PROG_NAME": --url not specified, assuming `/'.\n";
+    }
+    error_log = " 2>>"+log_dir+"omindex-error.log";
     // baseurl mustn't end '/' or you end up with the wrong URL
     // (//thing is different to /thing). We could probably make this
     // safe a different way, by ensuring that we don't put a leading '/'
 …
     } else {
         indexroot = ""; // index the whole of root
+    }
+    // add the db basename to cache_dir
+    {
+        const char *p = strrchr(dbpath.c_str(), '/');
+        // on windows only
+        if (!p) p = strrchr(dbpath.c_str(), '\\');
+        if (p) { p++; } else { p = dbpath.c_str(); }
+        cache_dir += p;
+    }
     int exitcode = 1;
     try {

xapian-omega-1.0.7a/outlook2text.in

diff -u  xapian-omega-1.0.7a/outlook2text.in.orig

-              old
+              new
+#! /bin/sh
+# Converts a .msg file to mbox and extracts its attachments.
+# Usage: outlook2text FILE.msg [DIR]
+# If DIR is given it is passed to @MIMEEXPLODE@ -d; otherwise the caller
+# must already be in the cache dir and the basename of FILE.msg is used.
+# "$2" has to be quoted: with an empty, unquoted $2 the test becomes
+# "[ -n ]", which is always true, so the else branch was never taken.
+if [ -n "$2" ]; then
+  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
+else
+  # already is in the cache dir
+  base=`basename "$1" .msg`
+  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
+fi

xapian-omega-1.0.7a/query.cc

diff -u  xapian-omega-1.0.7a/query.cc.orig

-              old
+              new
         switch (t[0]) {
             case 'a':
                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
+                        t == "are" || t == "as" || t == "at");
+                    t == "are" || t == "as" || t == "at" || t == "according" ||
+                    t == "again"  || t == "against"  || t == "ah"  || t == "all" ||
+                    t == "although"  || t == "always" || t == "anyone" || t == "after" ||
+                    t == "also"  || t == "any");
             case 'b':
                 return (t == "be" || t == "by");
             case 'e':

xapian-omega-1.0.7a/runfilter.cc

diff -u  xapian-omega-1.0.7a/runfilter.cc.orig

-              old
+              new
 using namespace std;
+extern string error_log;
+extern bool verbose;
 string
 stdout_to_string(const string &cmd)
+{
 …
             setrlimit(RLIMIT_AS, &ram_limit);
+        }
+        execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);
+        string tmp;
+        tmp = cmd + error_log;
+        if (verbose) {
+            cout << " Executing '" << tmp << "'..." << endl;
+        }
+        execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL);
         _exit(-1);
+    }
 …
         throw ReadError();
+    }
 #else
+    FILE * fh = popen(cmd.c_str(), "r");
+    string tmp;
+    tmp = cmd + error_log;
+    if (verbose) {
+        cout << " Executing '" << tmp << "'..." << endl;
+    }
+    FILE * fh = popen(tmp.c_str(), "r");
     if (fh == NULL) throw ReadError();
     while (!feof(fh)) {
         char buf[4096];

xapian-omega-1.0.7a/scriptindex.cc

diff -u  xapian-omega-1.0.7a/scriptindex.cc.orig

-              old
+              new
  * Copyright 2001 Sam Liddicott
  * Copyright 2001,2002 Ananova Ltd
  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
+ * Copyright 2006,2007 AVL List GesmbH
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
 …
 #include <stdio.h>
 #include <time.h>
 #include "safeunistd.h"
+#include <sys/stat.h>
 #include "commonhelp.h"
+#include "configfile.h"
 #include "hashterm.h"
 #include "loadfile.h"
 #include "myhtmlparse.h"
 #include "stringutils.h"
 #include "utf8truncate.h"
 #include "utils.h"
+#include "values.h"
 #include "gnu_getopt.h"
 …
+{
     string line;
     size_t line_no = 0;
+    time_t last_mod = 0;
+    long   file_size = 0;
+    if (strcmp(fname,"<stdin>") != 0) {
+        struct stat statbuf;
+        stat(fname, &statbuf);
+        if (! statbuf.st_size) {
+            cout << "Empty \"" << fname << "\" - skipping\n";
+            return false;
+        }
+        file_size = statbuf.st_size;
+        last_mod = statbuf.st_mtime;
+    }
     while (!stream.eof() && getline(stream, line)) {
         ++line_no;
         Xapian::Document doc;
 …
             for (i = fields.begin(); i != fields.end(); ++i) {
                 list<string>::const_iterator j;
                 for (j = i->second.begin(); j != i->second.end(); j++) {
+                    if (i->first == "lastmod")  last_mod = 0;
+                    if (i->first == "size")     file_size = 0;
                     data += i->first;
                     data += '=';
                     data += *j;
                     data += '\n';
+                }
+            }
+            // provide some extra fields if not already provided by the script
+            if (last_mod) {        // if indexed per filename
+                data += "lastmod="+int_to_string(last_mod)+'\n';
+                doc.add_value(VALUE_LASTMOD, int_to_string(last_mod));
+            }
+            if (file_size) {        // if indexed per filename
+                data += "size="+int_to_string(file_size)+'\n';
+                doc.add_value(VALUE_FILESIZE, int_to_string(file_size));
+            }
             // Put the data in the document
             doc.set_data(data);

xapian-omega-1.0.7a/utils.cc

diff -u  xapian-omega-1.0.7a/utils.cc.orig

-              old
+              new
 using namespace std;
+#ifdef __WIN32__
+#include "safewindows.h"
+#endif
 // This ought to be enough for any of the conversions below.
 #define BUFSIZE 100
+/** Allow system to work directly on C++ strings.
+ *
+ *  Forwards command.c_str() to the const char * version of system() and
+ *  returns that call's result unchanged.
+ */
+inline int system(const string &command) { return system(command.c_str()); }
+/// Remove a directory and contents.
+void
+rm_rf(const string &filename)
+{
+    // Check filename exists and is actually a directory
+    struct stat sb;
+    if (stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode)) return;
+    string safefile = shell_protect(filename);
+#ifdef __WIN32__
+# if 1
+    static int win95 = -1;
+    if (win95 == -1) {
+        OSVERSIONINFO info;
+        memset(&info, 0, sizeof(OSVERSIONINFO));
+        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+        if (GetVersionEx(&info)) {
+            win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
+        }
+    }
+    if (win95) {
+        // for 95 like systems:
+        system("deltree /y \"" + safefile + "\"");
+    } else {
+        // for NT like systems:
+        system("rd /s /q \"" + safefile + "\"");
+    }
+# else
+    safefile.append("\0", 2);
+    SHFILEOPSTRUCT shfo;
+    memset((void*)&shfo, 0, sizeof(shfo));
+    shfo.hwnd = 0;
+    shfo.wFunc = FO_DELETE;
+    shfo.pFrom = safefile.data();
+    shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT;
+    (void)SHFileOperation(&shfo);
+# endif
+#else
+    system("rm -rf " + safefile);
+#endif
+}

xapian-omega-1.0.7a/utils.h

diff -u  xapian-omega-1.0.7a/utils.h.orig

-              old
+              new
 #include <string>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef _MSC_VER
+# include <direct.h>
+# include <io.h>
+#else
+# include <unistd.h>
+#endif
+#include <ctype.h>
+#include <fcntl.h>
 /** Converts year, month, day into an 8 character string like: "20061031". */
 std::string date_to_string(int year, int month, int day);
 …
 /** Converts a string to an int. */
 int string_to_int(const std::string & s);
+void rm_rf(const std::string &filename);
 #endif

xapian-omega-1.0.7a/xapian-omega.spec.in

diff -u  xapian-omega-1.0.7a/xapian-omega.spec.in.orig

-              old
+              new
 # Create /var directories
 mkdir -p %{buildroot}%{contentdir}/omega/data
 mkdir -p %{buildroot}%{contentdir}/omega/cdb
+mkdir -p %{buildroot}%{contentdir}/omega/cache
 mkdir -p %{buildroot}%{logdir}/omega
 # Default templates
 mkdir -p %{buildroot}%{contentdir}/omega/templates
 …
 /var/www/icons/omega
 %{_datadir}/%{name}
 %config(noreplace) /etc/omega.conf
+%doc %{_datadir}/doc/%{name}-%{version}
+%docdir /usr/share/doc/%{name}-%{version}
+%doc AUTHORS ChangeLog COPYING NEWS README TODO
 # man pages may be gzipped, hence the trailing wildcard.
 %{_mandir}/man1/omindex.1*
 %{_mandir}/man1/scriptindex.1*

Download in other formats:

Original Format