Ticket #282: xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-29.patch

File xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-29.patch, 48.2 KB (added by Olly Betts, 14 years ago)

further updated patch

  • xapian-omega-1.0.7a/ChangeLog

    diff -u  xapian-omega-1.0.7a/ChangeLog.orig
    old new  
     15262006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com>
     1527
     1528        omega-0.9.6c:
     1529        * omindex.cc: Fix wrong timestamp comparison in cache logic
     1530        * scriptindex.cc: Add lastmod and size records and values.
     1531        * excel2text, outlook2text.in: New scripts
     1532
     15332006-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com>
     1534
     1535        omega-0.9.6b:
     1536        * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks.
     1537        Add options --silent
     1538       
     15392006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com>
     1540
     1541        omega-0.9.6a:
     1542        * omindex.cc: Added cached virtual directories zip,msg,pst,...).
     1543        Consistently log stderr to /var/log/omega/omindex-error.log.
     1544        * configure.ac: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST.
     1545
    15261546Sun Jul 09 01:40:09 BST 2006  Olly Betts <olly@survex.com>
    15271547
    15281548        * docs/omegascript.txt: Note that (by design) an omegascript template
  • xapian-omega-1.0.7a/Makefile.am

    diff -u  xapian-omega-1.0.7a/Makefile.am.orig
    old new  
    6161pkglibbindir = $(pkglibdir)/bin
    6262pkglibbin_PROGRAMS = omega
    6363bin_PROGRAMS = omindex scriptindex
     64dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl
    6465dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
    6566
    6667check_PROGRAMS = htmlparsetest md5test utf8converttest
     
    9293        common/safewindows.h\
    9394        common/stringutils.h
    9495
    95 AM_LDFLAGS = $(ICONV_LDFLAGS)
     96AM_LDFLAGS = -no-undefined $(ICONV_LDFLAGS)
    9697
    9798omega_SOURCES = omega.cc query.cc cgiparam.cc utils.cc configfile.cc date.cc\
    9899 cdb_init.cc cdb_find.cc cdb_hash.cc cdb_unpack.cc loadfile.cc\
     
    127128MAINTAINERCLEANFILES = $(dist_man_MANS)
    128129endif
    129130
     131CLEANFILES = $(dist_libexec_SCRIPTS) $(dist_bin_SCRIPTS)
     132
     133omega.conf: $(srcdir)/omega.conf.in Makefile
     134        sed "s,@localstatedir@,$(localstatedir)," $(srcdir)/omega.conf.in > $@
     135outlook2text: $(srcdir)/outlook2text.in mimeexplode Makefile
     136        sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@
     137
    130138if DOCUMENTATION_RULES
  • xapian-omega-1.0.7a/excel2text

    diff -u  xapian-omega-1.0.7a/excel2text.orig
    old new  
     1#! /bin/sh
     2# strip numbers, to stdout
     3xls2csv -q0 "$1" | sed -re's/[0123456789.]+,//g'
  • xapian-omega-1.0.7a/mimeexplode

    diff -u  xapian-omega-1.0.7a/mimeexplode.orig
    old new  
     1#!/usr/bin/perl -w
     2
     3=head1 NAME
     4
     5mimeexplode - explode one or more MIME messages
     6
     7=head1 SYNOPSIS
     8
     9    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...
     10
     11    someprocess | mimeexplode -
     12
     13=head1 DESCRIPTION
     14
     15Takes one or more files from the command line that contain MIME
     16messages, and explodes their contents out into subdirectories
     17of the current working directory.  The first message goes into the
     18directory given with C<-d> (default C<msg0>); each later message goes into
     19that name with a running number appended.  Existing directories are reused.
     20
     21The message information is output to the stdout, like this:
     22
     23    Message: msg3 (inputfile1.msg)
     24        Part: msg3/filename-1.dat (text/plain)
     25        Part: msg3/filename-2.dat (text/plain)
     26    Message: msg5 (input-file2.msg)
     27        Part: msg5/dir.gif (image/gif)
     28        Part: msg5/face.jpg (image/jpeg)
     29    Message: msg6 (infile3)
     30        Part: msg6/filename-1.dat (text/plain)
     31
     32This was written as an example of the MIME:: modules in the
     33MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
     34way of splitting a MIME message if you need to decode something, and
     35you don't have a MIME mail reader on hand.
     36
     37=head1 COMMAND LINE OPTIONS
     38
     39-d outdir
     40
     41=head1 AUTHOR
     42
     43Eryq C<eryq@zeegee.com>, in a big hurry...
     44Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir
     45
     46=cut
     47
     48#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!
     49
     50require 5.001;
     51
     52use strict;
     53use vars;
     54
     55use MIME::Parser;
     56use Getopt::Std;
     57my %opts;
     58my $outbase = '';
     59my $postfix = '';
     60
     61#------------------------------------------------------------
     62# make_msg - make and return the name of a msgXXX directory
     63#------------------------------------------------------------
     64
     65#ignored
     66#sub make_msg {
     67#    while (-d "msg$Msgno") {
     68#       ++$Msgno;
     69#       die "self-imposed limit reached" if $Msgno == 256;
     70#    }
     71#   mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!";
     72#    "msg$Msgno";
     73#}
     74
     75#------------------------------------------------------------
     76# dump_entity - dump an entity's file info
     77#------------------------------------------------------------
     78sub dump_entity {
     79    my $ent = shift;
     80    my @parts = $ent->parts;
     81
     82    if (@parts) {        # multipart...
     83        map { dump_entity($_) } @parts;
     84    }
     85    else {               # single part...
     86        print "    Part: ", $ent->bodyhandle->path,
     87              " (", scalar($ent->head->mime_type), ")\n";
     88    }
     89}
     90
     91#------------------------------------------------------------
     92# main
     93#------------------------------------------------------------
     94sub main {
     95    my $file;
     96    my $entity;
     97
     98    # make sure the same message gets exploded into the same dir
     99    getopts('d:', \%opts);
     100    $outbase = $opts{d} ? $opts{d} : "msg0";
     101    my $outdir = $outbase;
     102
     103    # Go through messages:
     104    @ARGV or unshift @ARGV, "-";
     105    while (defined($file = shift @ARGV)) {
     106
     107      # Sanity:
     108      (-d $outdir) or mkdir "$outdir",0755;
     109      (-w "$outdir") or die "cwd $outdir not writable!";
     110      #my $msgdir = make_msg();
     111      #print "Message: $msgdir ($file)\n";
     112
     113      # Create a new parser object:
     114      my $parser = new MIME::Parser;
     115      ### $parser->parse_nested_messages('REPLACE');
     116
     117      # Optional: set up parameters that will affect how it extracts
     118      #   documents from the input stream:
     119      $parser->output_dir($outdir);
     120
     121      # Parse an input stream:
     122      open FILE, $file or die "couldn't open $file";
     123      $entity = $parser->read(\*FILE) or
     124        print STDERR "Couldn't parse MIME in $file; continuing...\n";
     125      close FILE;
     126
     127      # Congratulations: you now have a (possibly multipart) MIME entity!
     128      dump_entity($entity) if $entity;
     129      ### $entity->dump_skeleton if $entity;
     130
     131      $postfix++;
     132      $outdir = $outbase.$postfix;
     133    }
     134    1;
     135}
     136
     137exit (&main ? 0 : -1);
     138#------------------------------------------------------------
     1391;
     140
  • xapian-omega-1.0.7a/msgconvert.pl

    diff -u  xapian-omega-1.0.7a/msgconvert.pl.orig
    old new  
     1#!/usr/bin/perl -w
     2#
     3# msgconvert.pl:
     4#
     5# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
     6#
     7# Copyright 2002, 2004, 2006 Matijs van Zuijlen
     8#
     9# This program is free software; you can redistribute it and/or modify it
     10# under the terms of the GNU General Public License as published by the
     11# Free Software Foundation; either version 2 of the License, or (at your
     12# option) any later version.
     13#
     14# This program is distributed in the hope that it will be useful, but
     15# WITHOUT ANY WARRANTY; without even the implied warranty of
     16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
     17# Public License for more details.
     18#
     19# CHANGES:
     20# 20020715  Recognize new items 'Cc', mime type of attachment, long
     21#           filename of attachment, and full headers. Attachments turn out
     22#           to be numbered, so a regexp is now used to recognize label of
     23#           items that are attachments.
     24# 20020831  long file name will definitely be used if present. Full headers
     25#           and mime type information are used when present. Created
     26#           generic system for specifying known items to be skipped.
     27#           Unexpected contents is never reason to bail out anymore. Added
     28#           support for usage message and option processing (--verbose).
     29# 20040104  Handle address data slightly better, make From line less fake,
     30#           make $verbose and $skippable_entries global vars, handle HTML
     31#           variant of body text if present (though not optimally).
     32# 20040214  Fix typos and incorrect comments.
     33# 20040307  - Complete rewrite: All functional parts are now in the package
     34#             MSGParser;
     35#           - Creation of MIME::Entity object is delayed until the output
     36#             routines, which means all data is known; This means I can
     37#             create a multipart/alternative body.
     38#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
     39#             the information).
     40# 20040514  Check if $self->{HEAD} actually exists before trying to add its
     41#           contents to the output Mime object's header data.
     42#           (Bug reported by Thomas Ng).
     43#           Don't produce multipart messages if not needed.
     44#           (Bug reported by Justin B. Scout).
     45# 20040529  Correctly format OLEDATE.
     46# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
     47#           - Use address data to make To: and Cc: lines complete
     48#           - Use the in-reply-to property
     49#           - More unknown properties named.
     50#           - Found another property containing an SMTP address.
     51#           - Put non-SMTP type addresses back in output.
     52# 20040825  Replace 'our' to declare globals with 'use vars'. This means
     53#           the globals are now properly scoped inside the package and not
     54#           the file.
     55#           This also fixes the bug that this program did not work on perl
     56#           versions below 5.6. (Bug reported by Tim Gustafson)
     57# 20060218  More sensible encoding warnings.
     58# 20060219  Move OLE parsing to main program.
     59#           Parse nested MSG files (Bug reported by Christof Lukas).
     60# 20060225  Simplify code.
     61#
     62
     63#
     64# Import modules.
     65#
     66package MSGParser;
     67use strict;
     68use OLE::Storage_Lite;
     69use MIME::Entity;
     70use MIME::Parser;
     71use Date::Format;
     72use POSIX qw(mktime);
     73use constant DIR_TYPE => 1;
     74use constant FILE_TYPE => 2;
     75
     76use vars qw($skipproperties $skipheaders);
     77#
     78# Descriptions partially based on mapitags.h
     79#
     80$skipproperties = {
     81  # Envelope properties
     82  '000B' => "Conversation key?",
     83  '001A' => "Type of message",
     84  '003B' => "Sender address variant",
     85  '003D' => "Contains 'Re: '",
     86  '003F' => "'recieved by' id",
     87  '0040' => "'recieved by' name",
     88  '0041' => "Sender variant address id",
     89  '0042' => "Sender variant name",
     90  '0043' => "'recieved representing' id",
     91  '0044' => "'recieved representing' name",
     92  '0046' => "Read receipt address id",
     93  '0051' => "'recieved by' search key",
     94  '0052' => "'recieved representing' search key",
     95  '0053' => "Read receipt search key",
     96  '0064' => "Sender variant address type",
     97  '0065' => "Sender variant address",
     98  '0070' => "Conversation topic",
     99  '0071' => "Conversation index",
     100  '0075' => "'recieved by' address type",
     101  '0076' => "'recieved by' email address",
     102  '0077' => "'recieved representing' address type",
     103  '0078' => "'recieved representing' email address",
     104  '007F' => "something like a message id",
     105  # Recipient properties
     106  '0C19' => "Reply address variant",
     107  '0C1D' => "Reply address variant",
     108  '0C1E' => "Reply address type",
     109  # Non-transmittable properties
     110  '0E02' => "?Should BCC be displayed",
     111  '0E0A' => "sent mail id",
     112  '0E1D' => "Subject w/o Re",
     113  '0E27' => "64 bytes: Unknown",
     114  '0FF6' => "Index",
     115  '0FF9' => "Index",
     116  '0FFF' => "Address variant",
     117  # Content properties
     118  '1008' => "Summary or something",
     119  '1009' => "RTF Compressed",
     120  # 'Common property'
     121  '3001' => "Display name",
     122  '3002' => "Address Type",
     123  '300B' => "'Search key'",
     124  # Attachment properties
     125  '3702' => "Attachment encoding",
     126  '3703' => "Attachment extension",
     127  '3709' => "'Attachment rendering'", # Maybe an icon or something?
     128  '3713' => "Icon URL?",
     129  # 'Mail user'
     130  '3A20' => "Address variant",
     131  # 3900 -- 39FF: 'Address book'
     132  '39FF' => "7 bit display name",
     133  # 'Display table properties'
     134  '3FF8' => "Routing data?",
     135  '3FF9' => "Routing data?",
     136  '3FFA' => "Routing data?",
     137  '3FFB' => "Routing data?",
     138  # 'Transport-defined envelope property'
     139  '4029' => "Sender variant address type",
     140  '402A' => "Sender variant address",
     141  '402B' => "Sender variant name",
     142  '5FF6' => "Recipient name",
     143  '5FF7' => "Recipient address variant",
     144  # 'Provider-defined internal non-transmittable property'
     145  '6740' => "Unknown, binary data",
     146  # User defined id's
     147  '8000' => "Content Class",
     148  '8002' => "Unknown, binary data",
     149};
     150
     151$skipheaders = {
     152  "MIME-Version" => 1,
     153  "Content-Type" => 1,
     154  "Content-Transfer-Encoding" => 1,
     155  "X-Mailer" => 1,
     156  "X-Msgconvert" => 1,
     157  "X-MS-Tnef-Correlator" => 1,
     158  "X-MS-Has-Attach" => 1,
     159};
     160
     161use constant ENCODING_UNICODE => '001F';
     162use constant KNOWN_ENCODINGS => {
     163    '000D' => 'Directory',
     164    '001F' => 'Unicode',
     165    '001E' => 'Ascii?',
     166    '0102' => 'Binary',
     167};
     168
     169use constant MAP_ATTACHMENT_FILE => {
     170  '3701' => ["DATA",        0], # Data
     171  '3704' => ["SHORTNAME",   1], # Short file name
     172  '3707' => ["LONGNAME",    1], # Long file name
     173  '370E' => ["MIMETYPE",    1], # mime type
     174  '3716' => ["DISPOSITION", 1], # disposition
     175};
     176
     177use constant MAP_SUBITEM_FILE => {
     178  '1000' => ["BODY_PLAIN",      0], # Body
     179  '1013' => ["BODY_HTML",       0], # HTML Version of body
     180  '0037' => ["SUBJECT",         1], # Subject
     181  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
     182  '007D' => ["HEAD",            1], # Full headers
     183  '0C1A' => ["FROM",            1], # Reply-To: Name
     184  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
     185  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
     186  '0E04' => ["TO",              1], # To: Names
     187  '0E03' => ["CC",              1], # Cc: Names
     188  '1035' => ["MESSAGEID",       1], # Message-Id
     189  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
     190};
     191
     192use constant MAP_ADDRESSITEM_FILE => {
     193  '3001' => ["NAME",            1], # Real name
     194  '3002' => ["TYPE",            1], # Address type
     195  '403D' => ["TYPE",            1], # Address type
     196  '3003' => ["ADDRESS",         1], # Address
     197  '403E' => ["ADDRESS",         1], # Address
     198  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
     199};
     200
     201#
     202# Main body of module
     203#
     204
     205sub new {
     206  my $that = shift;
     207  my $class = ref $that || $that;
     208
     209  my $self = {
     210    ATTACHMENTS => [],
     211    ADDRESSES => [],
     212    VERBOSE => 0,
     213    HAS_UNICODE => 0,
     214    FROM_ADDR_TYPE => "",
     215  };
     216  bless $self, $class;
     217}
     218
     219#
     220# Main sub: parse the PPS tree, and return
     221#
     222sub parse {
     223  my $self = shift;
     224  my $PPS = shift or die "Internal error: No PPS tree";
     225  $self->_RootDir($PPS);
     226}
     227
     228sub mime_object {
     229  my $self = shift;
     230
     231  my $bodymime;
     232  my $mime;
     233
     234  if ($self->_IsMultiPart) {
     235    # Construct a multipart message object
     236
     237    $mime = MIME::Entity->build(Type => "multipart/mixed");
     238
     239    # Set the entity that we'll save the body parts to. If there's more than
     240    # one part, it's a new entity, otherwise, it's the main $mime object.
     241    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
     242      $bodymime = MIME::Entity->build(
     243        Type => "multipart/alternative",
     244        Encoding => "8bit",
     245      );
     246      $mime->add_part($bodymime);
     247    } else {
     248      $bodymime = $mime;
     249    }
     250    if ($self->{BODY_PLAIN}) {
     251      $self->_SaveAttachment($bodymime, {
     252        MIMETYPE => 'text/plain; charset=ISO-8859-1',
     253        ENCODING => '8bit',
     254        DATA => $self->{BODY_PLAIN},
     255        DISPOSITION => 'inline',
     256      });
     257    }
     258    if ($self->{BODY_HTML}) {
     259      $self->_SaveAttachment($bodymime, {
     260        MIMETYPE => 'text/html',
     261        ENCODING => '8bit',
     262        DATA => $self->{BODY_HTML},
     263        DISPOSITION => 'inline',
     264      });
     265    }
     266    foreach my $att (@{$self->{ATTACHMENTS}}) {
     267      $self->_SaveAttachment($mime, $att);
     268    }
     269  } elsif ($self->{BODY_PLAIN}) {
     270    # Construct a single part message object with a plain text body
     271    $mime = MIME::Entity->build(
     272      Type => "text/plain",
     273      Data => $self->{BODY_PLAIN}
     274    );
     275  } elsif ($self->{BODY_HTML}) {
     276    # Construct a single part message object with an HTML body
     277    $mime = MIME::Entity->build(
     278      Type => "text/html",
     279      Data => $self->{BODY_HTML}
     280    );
     281  }
     282
     283  $self->_CopyHeaderData($mime);
     284
     285  $self->_SetHeaderFields($mime);
     286
     287  return $mime;
     288}
     289
     290# Actually output the message in mbox format
     291sub print {
     292  my $self = shift;
     293
     294  my $mime = $self->mime_object;
     295
     296  # Construct From line from whatever we know.
     297  my $string = "";
     298  $string = (
     299    $self->{FROM_ADDR_TYPE} eq "SMTP" ?
     300    $self->{FROM_ADDR} :
     301    'someone@somewhere'
     302  );
     303  $string =~ s/\n//g;
     304
     305  # The date used here is not really important.
     306  print "From ", $string, " ", scalar localtime, "\n";
     307  $mime->print(\*STDOUT);
     308  print "\n";
     309}
     310
     311sub set_verbosity {
     312  my ($self, $verbosity) = @_;
     313  defined $verbosity or die "Internal error: no verbosity level";
     314  $self->{VERBOSE} = $verbosity;
     315}
     316
     317#
     318# Below are functions that walk the PPS tree. The *Dir functions handle
     319# processing the directory nodes of the tree (mainly, iterating over the
     320# children), whereas the *Item functions handle processing the items in the
     321# directory (if such an item is itself a directory, it will in turn be
     322# processed by the relevant *Dir function).
     323#
     324
     325#
     326# _RootDir: Check Root Entry, parse sub-entries.
     327# The OLE file consists of a single entry called Root Entry, which has
     328# several children. These children are parsed in the sub _SubItem.
     329#
     330sub _RootDir {
     331  my ($self, $PPS) = @_;
     332
     333  foreach my $child (@{$PPS->{Child}}) {
     334    $self->_SubItem($child);
     335  }
     336}
     337
     338sub _SubItem {
     339  my ($self, $PPS) = @_;
     340 
     341  if ($PPS->{Type} == DIR_TYPE) {
     342    $self->_SubItemDir($PPS);
     343  } elsif ($PPS->{Type} == FILE_TYPE) {
     344    $self->_SubItemFile($PPS);
     345  } else {
     346    warn "Unknown entry type: $PPS->{Type}";
     347  }
     348}
     349
     350sub _SubItemDir {
     351  my ($self, $PPS) = @_;
     352
     353  $self->_GetOLEDate($PPS);
     354
     355  my $name = $self->_GetName($PPS);
     356
     357  if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
     358    $self->_AddressDir($PPS);
     359  } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
     360    $self->_AttachmentDir($PPS);
     361  } else {
     362    $self->_UnknownDir($self->_GetName($PPS));
     363  }
     364}
     365
     366sub _SubItemFile {
     367  my ($self, $PPS) = @_;
     368
     369  my $name = $self->_GetName($PPS);
     370  my ($property, $encoding) = $self->_ParseItemName($name);
     371
     372  $self->_MapProperty($self, $PPS->{Data}, $property,
     373    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
     374}
     375
     376sub _AddressDir {
     377  my ($self, $PPS) = @_;
     378
     379  my $address = {
     380    NAME        => undef,
     381    ADDRESS     => undef,
     382    TYPE        => "",
     383  };
     384  foreach my $child (@{$PPS->{Child}}) {
     385    $self->_AddressItem($child, $address);
     386  }
     387  push @{$self->{ADDRESSES}}, $address;
     388}
     389
     390sub _AddressItem {
     391  my ($self, $PPS, $addr_info) = @_;
     392
     393  my $name = $self->_GetName($PPS);
     394
     395  # DIR Entries: There should be none.
     396  if ($PPS->{Type} == DIR_TYPE) {
     397    $self->_UnknownDir($name);
     398  } elsif ($PPS->{Type} == FILE_TYPE) {
     399    my ($property, $encoding) = $self->_ParseItemName($name);
     400    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
     401      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
     402  } else {
     403    warn "Unknown entry type: $PPS->{Type}";
     404  }
     405}
     406
     407sub _AttachmentDir {
     408  my ($self, $PPS) = @_;
     409
     410  my $attachment = {
     411    SHORTNAME   => undef,
     412    LONGNAME    => undef,
     413    MIMETYPE    => 'application/octet-stream',
     414    ENCODING    => 'base64',
     415    DISPOSITION => 'attachment',
     416    DATA        => undef
     417  };
     418  foreach my $child (@{$PPS->{Child}}) {
     419    $self->_AttachmentItem($child, $attachment);
     420  }
     421  push @{$self->{ATTACHMENTS}}, $attachment;
     422}
     423
     424sub _AttachmentItem {
     425  my ($self, $PPS, $att_info) = @_;
     426
     427  my $name = $self->_GetName($PPS);
     428
     429  my ($property, $encoding) = $self->_ParseItemName($name);
     430
     431  if ($PPS->{Type} == DIR_TYPE) {
     432
     433    if ($property eq '3701') {  # Nested MSG file
     434      my $msgp = new MSGParser();
     435      $msgp->parse($PPS);
     436      my $data = $msgp->mime_object->as_string;
     437      $att_info->{DATA} = $data;
     438      $att_info->{MIMETYPE} = 'message/rfc822';
     439      $att_info->{ENCODING} = '8bit';
     440    } else {
     441      $self->_UnknownDir($name);
     442    }
     443
     444  } elsif ($PPS->{Type} == FILE_TYPE) {
     445    $self->_MapProperty($att_info, $PPS->{Data}, $property,
     446      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
     447  } else {
     448    warn "Unknown entry type: $PPS->{Type}";
     449  }
     450}
     451
     452sub _MapProperty {
     453  my ($self, $hash, $data, $property, $map) = @_;
     454
     455  defined $property or return 0;
     456  my $arr = $map->{$property} or return 0;
     457
     458  $arr->[1] and $data =~ s/\000//g;
     459  $hash->{$arr->[0]} = $data;
     460
     461  return 1;
     462}
     463
     464sub _UnknownDir {
     465  my ($self, $name) = @_;
     466
     467  if ($name eq '__nameid_version1 0') {
     468    $self->{VERBOSE}
     469      and warn "Skipping DIR entry $name (Introductory stuff)\n";
     470    return;
     471  }
     472  warn "Unknown DIR entry $name\n";
     473}
     474
     475sub _UnknownFile {
     476  my ($self, $name) = @_;
     477
     478  if ($name eq '__properties_version1 0') {
     479    $self->{VERBOSE}
     480      and warn "Skipping FILE entry $name (Properties)\n";
     481    return;
     482  }
     483
     484  my ($property, $encoding) = $self->_ParseItemName($name);
     485  unless (defined $property) {
     486    warn "Unknown FILE entry $name\n";
     487    return;
     488  }
     489  if ($skipproperties->{$property}) {
     490    $self->{VERBOSE}
     491      and warn "Skipping property $property ($skipproperties->{$property})\n";
     492    return;
     493  } elsif ($property =~ /^80/) {
     494    $self->{VERBOSE}
     495      and warn "Skipping property $property (user-defined property)\n";
     496    return;
     497  } else {
     498    warn "Unknown property $property\n";
     499    return;
     500  }
     501}
     502
     503#
     504# Helper functions
     505#
     506
     507sub _GetName {
     508  my ($self, $PPS) = @_;
     509  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
     510}
     511
     512sub _NormalizeWhiteSpace {
     513  my ($self, $name) = @_;
     514  $name =~ s/\W/ /g;
     515  return $name;
     516}
     517
     518sub _GetOLEDate {
     519  my ($self, $PPS) = @_;
     520  unless (defined ($self->{OLEDATE})) {
     521    # Make Date
     522    my $datearr;
     523    $datearr = $PPS->{Time2nd};
     524    $datearr = $PPS->{Time1st} unless($datearr);
     525    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
     526  }
     527}
     528
     529sub _FormatDate {
     530  my ($self, $datearr) = @_;
     531
     532  # TODO: This is a little convoluted. Directly using strftime didn't seem
     533  # to work.
     534  my $datetime = mktime(@$datearr);
     535  return time2str("%a, %d %h %Y %X %z", $datetime);
     536}
     537
     538# If we didn't get the date from the original header data, we may be able
     539# to get it from the SUBMISSION_ID:
     540# It seems to have the format of a semicolon-separated list of key=value
     541# pairs. The key l has a value with the format:
     542# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
     543# the format YYMMDDHHMMSS.
     544sub _SubmissionIdDate {
     545  my $self = shift;
     546
     547  my $submission_id = $self->{SUBMISSION_ID} or return undef;
     548  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
     549    or return undef;
     550  my $year = $1;
     551  $year += 100 if $year < 20;
     552  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
     553}
     554
     555sub _ParseItemName {
     556  my ($self, $name) = @_;
     557
     558  if ($name =~ /^__substg1 0_(....)(....)$/) {
     559    my ($property, $encoding) = ($1, $2);
     560    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
     561      warn "This MSG file contains Unicode fields."
     562        . " This is currently unsupported.\n";
     563      $self->{HAS_UNICODE} = 1;
     564    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
     565      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
     566    }
     567    return ($property, $encoding);
     568  } else {
     569    return (undef, undef);
     570  }
     571}
     572
     573sub _SaveAttachment {
     574  my ($self, $mime, $att) = @_;
     575
     576  my $ent = $mime->attach(
     577    Type => $att->{MIMETYPE},
     578    Encoding => $att->{ENCODING},
     579    Data => [],
     580    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
     581    Disposition => $att->{DISPOSITION}
     582  );
     583
     584  my $handle;
     585  if ($handle = $ent->open("w")) {
     586    $handle->print($att->{DATA});
     587    $handle->close;
     588  } else {
     589    warn "Could not write data!";
     590  }
     591}
     592
     593sub _SetAddressPart {
     594  my ($self, $adrname, $partname, $data) = @_;
     595
     596  my $address = $self->{ADDRESSES}->{$adrname};
     597  $data =~ s/\000//g;
     598  #warn "Processing address data part $partname : $data\n";
     599  if (defined ($address->{$partname})) {
     600    if ($address->{$partname} eq $data) {
     601      warn "Skipping duplicate but identical address information for"
     602      . " $partname\n" if $self->{VERBOSE};
     603    } else {
     604      warn "Address information $partname inconsistent:\n";
     605      warn "    Original data: $address->{$partname}\n";
     606      warn "    New data: $data\n";
     607    }
     608  } else {
     609    $address->{$partname} = $data;
     610  }
     611}
     612
     613# Add a header field, unless the header already has a value for it
     614sub _AddHeaderField {
     615  my ($self, $mime, $fieldname, $value) = @_;
     616
     617  my $oldvalue = $mime->head->get($fieldname);
     618  return if $oldvalue;
     619  $mime->head->add($fieldname, $value) if $value;
     620}
     621
     622sub _Address {
     623  my ($self, $tag) = @_;
     624  my $name = $self->{$tag} || "";
     625  my $address = $self->{$tag . "_ADDR"} || "";
     626  return "$name <$address>";
     627}
     628
     629# Find SMTP addresses for the given list of names
     630sub _ExpandAddressList {
     631  my ($self, $names) = @_;
     632
     633  my $addresspool = $self->{ADDRESSES};
     634  my @namelist = split /; */, $names;
     635  my @result;
     636  name: foreach my $name (@namelist) {
     637    foreach my $address (@$addresspool) {
     638      if ($name eq $address->{NAME}) {
     639        my $addresstext = $address->{NAME} . " <";
     640        if (defined ($address->{SMTPADDRESS})) {
     641          $addresstext .= $address->{SMTPADDRESS};
     642        } elsif ($address->{TYPE} eq "SMTP") {
     643          $addresstext .= $address->{ADDRESS};
     644        }
     645        $addresstext .= ">";
     646        push @result, $addresstext;
     647        next name;
     648      }
     649    }
     650    push @result, $name;
     651  }
     652  return join ", ", @result;
     653}
     654
     655sub _ParseHead {
     656  my ($self, $data) = @_;
     657  defined $data or return undef;
     658  # Parse full header date if we got that.
     659  my $parser = new MIME::Parser();
     660  $parser->output_to_core(1);
     661  $parser->decode_headers(1);
     662  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
     663  my $entity = $parser->parse_data($data)
     664    or warn "Couldn't parse full headers!";
     665  my $head = $entity->head;
     666  $head->unfold;
     667  return $head;
     668}
     669
     670# Find out if we need to construct a multipart message
     671sub _IsMultiPart {
     672  my $self = shift;
     673
     674  return (
     675    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
     676      or @{$self->{ATTACHMENTS}}>0
     677  );
     678}
     679
     680# Copy original header data.
     681# Note: This should contain the Date: header.
     682sub _CopyHeaderData {
     683  my ($self, $mime) = @_;
     684
     685  my $head = $self->_ParseHead($self->{HEAD}) or return;
     686
     687  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
     688    foreach my $value ($head->get_all($tag)) {
     689      $mime->head->add($tag, $value);
     690    }
     691  }
     692}
     693
     694# Set header fields
     695sub _SetHeaderFields {
     696  my ($self, $mime) = @_;
     697
     698  # If we didn't get the date from the original header data, we may be able
     699  # to get it from the SUBMISSION_ID:
     700  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());
     701
     702  # Third and last chance to set the Date: header; this uses the date the
     703  # MSG file was saved.
     704  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
     705  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
     706  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
     707  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
     708  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
     709  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
     710  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
     711  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
     712}
     713
     714package main;
     715use Getopt::Long;
     716use Pod::Usage;
     717
     718# Setup command line processing.
     719my $verbose = '';
     720my $help = '';      # Print help message and exit.
     721GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
     722pod2usage(1) if $help;
     723
     724# Get file name
     725my $file = $ARGV[0];
     726defined $file or pod2usage(2);
     727warn "Will parse file: $file\n" if $verbose;
     728
     729# Load and parse MSG file (is OLE)
     730my $Msg = OLE::Storage_Lite->new($file);
     731my $PPS = $Msg->getPpsTree(1);
     732$PPS or die "$file must be an OLE file";
     733
     734# parse PPS tree
     735my $parser = new MSGParser();
     736$parser->set_verbosity(1) if $verbose;
     737$parser->parse($PPS);
     738$parser->print();
     739
     740#
     741# Usage info follows.
     742#
     743__END__
     744
     745=head1 NAME
     746
     747msgconvert.pl - Convert Outlook .msg files to mbox format
     748
     749=head1 SYNOPSIS
     750
     751msgconvert.pl [options] <file.msg>
     752
     753  Options:
     754    --verbose   be verbose
     755    --help      help message
     756
     757=head1 OPTIONS
     758
     759=over 8
     760
     761=item B<--verbose>
     762
     763    Print information about skipped parts of the .msg file.
     764
     765=item B<--help>
     766
         Print a brief help message.

     =back

     =head1 DESCRIPTION
     770
     771This program will output the message contained in file.msg in mbox format
     772on stdout. It will complain about unrecognized OLE parts on
     773stderr.
     774
     775=head1 BUGS
     776
     777Not all data that's in the .MSG file is converted. There simply are some
     778parts whose meaning escapes me. One of these must contain the date the
     779message was sent, for example. Formatting of text messages will also be
     780lost. YMMV.
     781
     782=cut
  • xapian-omega-1.0.7a/omindex.cc

    diff -u  xapian-omega-1.0.7a/omindex.cc.orig
    old new  
    44 * Copyright 2001,2005 James Aylett
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts
     7 * Copyright 2006,2007,2008 AVL List GesmbH
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
     
    6263extern char * mkdtemp(char *);
    6364#endif
    6465
     66#ifndef LIBEXECDIR
     67// must have ending slash
     68//# define LIBEXECDIR "/usr/lib/omega/bin/"
     69# define LIBEXECDIR ""
     70#endif
     71#ifndef PKGDATADIR
     72// must have ending slash
     73# define PKGDATADIR "/usr/share/omega/"
     74#endif
     75
    6576using namespace std;
    6677
    6778#define TITLE_SIZE 128
     
    6980
    7081#define PROG_NAME "omindex"
    7182#define PROG_DESC "Index static website data via the filesystem"
     83
     84/* used in runfilter.cc */
     85bool verbose = false;
     86string error_log;
    7287
    7388static bool skip_duplicates = false;
    7489static bool follow_symlinks = false;
     90static bool silent = false;
     91static string cache_dir;
    7592static string dbpath;
    7693static string root;
    7794static string indexroot;
     
    136153
     154static void
     155index_cached_directory(size_t depth_limit,
     156                       const string &file,
     157                       const string &url,
     158                       const string &ext,
     159                       const string &cmd,
     160                       map<string, string>& mime_map);
     161static
     162int mkdir_p(const string &path, mode_t mode);
     163
    137164inline static bool
    138165p_notalnum(unsigned int c)
    139166{
     
    217244            // indexing is disallowed
    218245        }
    219246        if (!p.indexing_allowed) {
    220             cout << "indexing disallowed by meta tag - skipping" << endl;
     247            if (!silent)
     248                cout << "indexing disallowed by meta tag - skipping" << endl;
    221249            return;
    222250        }
    223251        dump = p.dump;
     
    245273            return;
    246274        }
    247275        md5_string(dump, md5);
     276#if 0 // FIXME: this won't work as omindex will have the database locked...
     277    } else if (mimetype == "message/rfc822") { // // => mbox2script
     278        //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla)
     279        string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| "
     280            "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script";
     281        try {
     282            dump = stdout_to_string(cmd);
     283        } catch (ReadError) {
     284            cout << "\"" << cmd << "\" failed - skipping" << endl;
     285            return;
     286        }
     287#endif
    248288    } else if (mimetype == "application/pdf") {
    249289        string safefile = shell_protect(file);
     
    383423    } else if (mimetype == "text/rtf") {
    384424        // The --text option unhelpfully converts all non-ASCII characters to
    385425        // "?" so we use --html instead, which produces HTML entities.
    386         string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file);
     426        string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) + error_log;
    387427        MyHtmlParser p;
    388428        try {
    389429            p.parse_html(stdout_to_string(cmd));
     
    566606                continue;
    567607            }
    568608            case DirectoryIterator::REGULAR_FILE: {
     609                if (strcasecmp(d.leafname(), "mbox") == 0) {
     610                    // Special filename.
     611                    index_file(url, "message/rfc822", d);
     612                    continue;
     613                }
    569614
    570615                string ext;
    571616                string::size_type dot = url.find_last_of('.');
     
    610655                        continue;
    611656                    }
    612657
     658                    string oldroot = root;
     659#ifndef _MSC_VER
     660                    // NOTE: unpacking does not work on MSWin32 this way!
     661                    if (ext == "zip") {
     662                        if (depth_limit == 1) {
     663                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     664                            continue;
     665                        }
     666                        // overwrite
     667                        string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+indexroot+url+"/");
     668                        try {
     669                            size_t new_limit = depth_limit;
     670                            if (new_limit) --new_limit;
     671                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     672                        } catch (ReadError) {
     673                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
     674                            root = oldroot;
     675                        } catch (...) {
     676                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     677                            root = oldroot;
     678                            throw;
     679                        }
     680                        continue;
     681                    }
     682                    else if (ext == "rar") {
     683                        if (depth_limit == 1) {
     684                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     685                            continue;
     686                        }
     687
     688                        // overwrite
     689                        string cmd = "unrar x -o+ " +shell_protect(file) + " "
     690                            + shell_protect(cache_dir+"/.rar"+indexroot+url+"/");
     691                        try {
     692                            size_t new_limit = depth_limit;
     693                            if (new_limit) --new_limit;
     694                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     695                        } catch (ReadError) {
     696                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
     697                            root = oldroot;
     698                        } catch (...) {
     699                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     700                            root = oldroot;
     701                            throw;
     702                        }
     703                        continue;
     704                    }
     705#ifdef HAVE_MSGCONVERT
     706                    else if (ext == "msg") {
     707                        struct stat statcache;
     708                        char olddir[256];
     709                       
     710                        if (depth_limit == 1) {
     711                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     712                            continue;
     713                        }
     714                        string cmd = LIBEXECDIR"outlook2text "+shell_protect(file);
     715                        // unpack multiparts and attachments. so we have to chdir first
     716                        string fulldir = cache_dir+"/.msg"+indexroot+url;
     717                        getcwd(olddir,256);
     718#ifdef HAVE_LSTAT
     719                        lstat(fulldir.c_str(), &statcache);
     720#else
     721                        stat(fulldir.c_str(), &statcache);
     722#endif
     723                        if (!S_ISDIR(statcache.st_mode)) {
     724                            mkdir_p(fulldir, 0755);
     725                        }
     726                        try {
     727                            chdir (fulldir.c_str());
     728                            size_t new_limit = depth_limit;
     729                            if (new_limit) --new_limit;
     730                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     731                            chdir (olddir);
     732                        } catch (ReadError) {
     733                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
     734                            chdir (olddir);
     735                            root = oldroot;
     736                        } catch (...) {
     737                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     738                            chdir (olddir);
     739                            root = oldroot;
     740                            throw;
     741                        }
     742                        continue;
     743                    }
     744#endif
     745#ifdef HAVE_READPST
     746                    else if (ext == "pst") {
     747                        if (depth_limit == 1) {
     748                            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     749                            continue;
     750                        }
     751                        // unpack attachments also, together with mbox files
     752                        string cmd = "readpst -r -cv -w -o "
     753                            + shell_protect(cache_dir+"/.pst"+indexroot+url+"/")+" "+shell_protect(file);
     754                        try {
     755                            size_t new_limit = depth_limit;
     756                            if (new_limit) --new_limit;
     757                            index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     758                        } catch (ReadError) {
     759                            root = oldroot;
     760                            cout << "failed " << cmd << " << in index_cached_directory" << endl;
     761                        } catch (...) {
     762                            root = oldroot;
     763                            cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     764                            throw;
     765                        }
     766                        continue;
     767                    }
     768#endif
     769#endif
    613770                    // It's in our MIME map so we know how to index it.
    614771                    const string & mimetype = mt->second;
    615772                    try {
     
    640797}
     798
     799static
     800int mkdir_p(const string &path, mode_t mode) {
     801#ifdef __WIN32__
     802    system(("mkdir \"" + shell_protect(path) + "\"").c_str());
     803#else
     804    system(("mkdir -p " + shell_protect(path)).c_str());
     805#endif
     806    return 0;
     807}
     808
     809/*
     810 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there
     811 */
     812static void
     813index_cached_directory(size_t depth_limit,
     814                       const string &file,
     815                       const string &url,
     816                       const string &ext,
     817                       const string &cmd,
     818                       map<string, string>& mime_map)
     819{
     820    string oldroot = root;
     821    root = cache_dir;
     822    string cache = root+"/."+ext+indexroot;
     823    string cachedir = cache+url;
     824    struct stat statfile, statcache;
     825    bool extract_cache;
     826#ifdef HAVE_LSTAT
     827    lstat(file.c_str(), &statfile);
     828    lstat(cachedir.c_str(), &statcache);
     829#else
     830    stat(file.c_str(), &statfile);
     831    stat(cachedir.c_str(), &statcache);
     832#endif
     833    extract_cache = true;
     834    // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago,
     835    // then it was already extracted.
     836    if (S_ISDIR(statcache.st_mode)
     837        && S_ISREG(statfile.st_mode)
     838        && (statfile.st_mtime < statcache.st_mtime)
     839        && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call
     840    {
     841        // but is it in the database also? prevent from deleting skipped files
     842        if (!silent)
     843            cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction "
     844                 // << statfile.st_mtime << " < " << statcache.st_mtime
     845                 << endl;
     846        extract_cache = false;
     847    }
     848    if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) )
     849    {
     850        // check database timestamp for cached container, esp. for cleaned up caches.
     851        // if already in db we need not to extract again
     852        string urlterm("U");
     853        urlterm += baseurl;
     854        urlterm += "/."+ext+indexroot+url;
     855        if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
     856            urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
     857       
     858        {
     859            // at first find the docid with the beginning urlterm and check its timestamp
     860            Xapian::docid docid = 0;
     861            Xapian::PostingIterator p = db.postlist_begin(urlterm);
     862            if (p != db.postlist_end(urlterm)) {
     863                docid = *p;
     864            }
     865            if (docid && !ignore_time) {
     866                // new: first search value (1)
     867                Xapian::Document doc = db.get_document(docid);
     868                string lastmod;
     869                if (doc.values_count())
     870                    lastmod = doc.get_value(VALUE_LASTMOD);
     871                if (!lastmod.empty()) {
     872                    if (string_to_int(lastmod) >= statfile.st_mtime) {
     873                        if (!silent)
     874                            cout << "Cache "<< "."+ext+indexroot+url << " not newer. Ignored." << endl;
     875                        if (docid < updated.size()) {
     876                            updated[docid] = true;
     877                            root = oldroot;
     878                            return;
     879                        }
     880                    }
     881                }
     882            }
     883        }
     884    }
     885
     886    if (extract_cache) {
     887        if (!silent)
     888            cout << "[EXTRACT into cache " << cachedir << "]" << endl;
     889        if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode))
     890            cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" "
     891                 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL)
     892                 << endl;
     893        if (!S_ISDIR(statcache.st_mode))
     894            mkdir_p(cachedir, 0755);
     895        stdout_to_string(cmd);
     896#ifndef __WIN32__
     897        stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir));
     898#endif
     899#ifdef HAVE_LSTAT
     900        lstat(cachedir.c_str(), &statcache);
     901#else
     902        stat(cachedir.c_str(), &statcache);
     903#endif
     904    }
     905
     906    if (S_ISDIR(statcache.st_mode)) {
     907        if (depth_limit == 1) {
     908            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     909        } else {
     910            // max loop 5, magic start: /.ext+file
     911            index_directory(depth_limit+5, "/."+ext+url, mime_map);
     912            if (!silent)
     913                cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl;
     914            rm_rf(cachedir);
     915        }
     916    }
     917    else { // no -p would be fatal here
     918        cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl;
     919    }
     920    root = oldroot;
     921}
    641922
    642923int
     
    653934        { "version",    no_argument,            NULL, 'v' },
     935        { "silent",     no_argument,            NULL, 'S' },
    654936        { "overwrite",  no_argument,            NULL, 'o' },
     
    717999    mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template
    7181000    mime_map["ppt"] = "application/vnd.ms-powerpoint";
    7191001    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
     1002#ifdef HAVE_READPST
     1003    //  Outlook messager folder
     1004    mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst)
     1005#endif
     1006#ifdef HAVE_MSGCONVERT
     1007    mime_map["msg"] = "application/vnd.ms-outlook";     // outlook2text via msgconvert.pl
     1008#endif
     1009    mime_map["mbox"] = "message/rfc822";                // => mbox2omega
     1010    mime_map["mbx"] = "message/rfc822";                // => mbox2omega
    7201011    // Perl:
    7211012    mime_map["pl"] = "text/x-perl";
    7221013    mime_map["pm"] = "text/x-perl";
     
    7271018    // DjVu:
    7281019    mime_map["djv"] = "image/vnd.djvu";
    7291020    mime_map["djvu"] = "image/vnd.djvu";
     1021#ifndef _MSC_VER
     1022    mime_map["zip"] = "application/x-zip"; // recursive scanning
     1023#  ifdef HAVE_UNRAR
     1024    mime_map["rar"] = "application/x-rar"; // recursive scanning
     1025#  endif
     1026#endif
    7301027 
     
    7531050"  -f, --follow             follow symbolic links\n"
     1051"      --silent             Print only errors\n"
    7541052"      --overwrite          create the database anew (the default is to update\n"
     
    8441142    if (baseurl.empty()) {
    8451143        cerr << PROG_NAME": --url not specified, assuming `/'.\n";
    8461144    }
     1145    // FIXME: need to set log_dir!
     1146    error_log = " 2>>"+log_dir+"omindex-error.log";
     
    8691169    }
     1170    // add the db basename to cache_dir
     1171    {
     1172        ensure_tmpdir(); // FIXME: be lazy!
     1173        cache_dir = tmpdir;
     1174        const char *p = strrchr(dbpath.c_str(), '/');
     1175        // on windows only
     1176        if (!p) p = strrchr(dbpath.c_str(), '\\');
     1177        if (p) { p++; } else { p = dbpath.c_str(); }
     1178        cache_dir += p;
     1179    }
    8701180
    8711181    int exitcode = 1;
    8721182    try {
  • xapian-omega-1.0.7a/outlook2text.in

    diff -u  xapian-omega-1.0.7a/outlook2text.in.orig
    old new  
     1#! /bin/sh
     2# converts msg to mbox and extract attachments
     3# either be in the cache dir, or accept it as 2nd arg
     4if [ -n $2 ]; then
     5  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
     6else
     7  # already is in the cache dir
     8  base=`basename "$1" .msg`
     9  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
     10fi
  • xapian-omega-1.0.7a/query.cc

    diff -u  xapian-omega-1.0.7a/query.cc.orig
    old new  
    141141        switch (t[0]) {
    142142            case 'a':
    143143                return (t == "a" || t == "about" || t == "an" || t == "and" ||
    144                         t == "are" || t == "as" || t == "at");
     144                    t == "are" || t == "as" || t == "at" || t == "according" ||
     145                    t == "again"  || t == "against"  || t == "ah"  || t == "all" ||
     146                    t == "although"  || t == "always" || t == "anyone" || t == "after" ||
     147                    t == "also"  || t == "any");
    145148            case 'b':
    146149                return (t == "be" || t == "by");
    147150            case 'e':
  • xapian-omega-1.0.7a/runfilter.cc

    diff -u  xapian-omega-1.0.7a/runfilter.cc.orig
    old new  
    6060
    6161using namespace std;
    6262
     63extern string error_log;
     64extern bool verbose;
     65
    6366string
    6467stdout_to_string(const string &cmd)
    6568{
     
    97100            setrlimit(RLIMIT_AS, &ram_limit);
    98101        }
    99102
    100         execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);
     103        string tmp;
     104        tmp = cmd + error_log;
     105        if (verbose) {
     106            cout << " Executing '" << tmp << "'..." << endl;
     107        }
     108
     109        execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL);
    101110        _exit(-1);
    102111    }
    103112
     
    134143        throw ReadError();
    135144    }
    136145#else
    137     FILE * fh = popen(cmd.c_str(), "r");
     146    string tmp = cmd;
     147    tmp += error_log;
     148    if (verbose) {
     149        cout << " Executing '" << tmp << "'..." << endl;
     150    }
     151    FILE * fh = popen(tmp.c_str(), "r");
    138152    if (fh == NULL) throw ReadError();
    139153    while (!feof(fh)) {
    140154        char buf[4096];
  • xapian-omega-1.0.7a/scriptindex.cc

    diff -u  xapian-omega-1.0.7a/scriptindex.cc.orig
    old new  
    44 * Copyright 2001 Sam Liddicott
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
     7 * Copyright 2006,2007 AVL List GesmbH
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
     
    3839#include <stdio.h>
    3940#include <time.h>
    4041#include "safeunistd.h"
     42#include <sys/stat.h>
    4143
     
    5860#include "utf8truncate.h"
    5961#include "utils.h"
     62#include "values.h"
    6063
    6164#include "gnu_getopt.h"
     
    422425{
    423426    string line;
    424427    size_t line_no = 0;
     428    time_t last_mod = 0;
     429    long   file_size = 0;
     430
     431    if (strcmp(fname,"<stdin>") != 0) {
     432        struct stat statbuf;
     433        stat(fname, &statbuf);
     434        if (! statbuf.st_size) {
     435            cout << "Empty \"" << fname << "\" - skipping\n";
     436            return false;
     437        }
     438        file_size = statbuf.st_size;
     439        last_mod = statbuf.st_mtime;
     440    }
    425441    while (!stream.eof() && getline(stream, line)) {
    426442        ++line_no;
    427443        Xapian::Document doc;
     
    638654            for (i = fields.begin(); i != fields.end(); ++i) {
    639655                list<string>::const_iterator j;
    640656                for (j = i->second.begin(); j != i->second.end(); j++) {
     657                    if (i->first == "lastmod")  last_mod = 0;
     658                    if (i->first == "size")     file_size = 0;
    641659                    data += i->first;
    642660                    data += '=';
    643661                    data += *j;
    644662                    data += '\n';
    645663                }
    646664            }
     665            // provide some extra fields if not already provided by the script
     666            if (last_mod) {        // if indexed per filename
     667                data += "lastmod="+int_to_string(last_mod)+'\n';
     668                doc.add_value(VALUE_LASTMOD, int_to_string(last_mod));
     669            }
     670            if (file_size) {        // if indexed per filename
     671                data += "size="+int_to_string(file_size)+'\n';
     672                doc.add_value(VALUE_FILESIZE, int_to_string(file_size));
     673            }
    647674
    648675            // Put the data in the document
    649676            doc.set_data(data);
  • xapian-omega-1.0.7a/utils.cc

    diff -u  xapian-omega-1.0.7a/utils.cc.orig
    old new  
    3030
    3131using namespace std;
    3232
     33#ifdef __WIN32__
     34#include "safewindows.h"
     35#endif
     36
    3337// This ought to be enough for any of the conversions below.
     
    4044#define BUFSIZE 100
    4145
     46/// Allow system to work directly on C++ strings.
     47inline int system(const string &command) { return system(command.c_str()); }
     48
     49/// Remove a directory and contents.
     50void
     51rm_rf(const string &filename)
     52{
     53    // Check filename exists and is actually a directory
     54    struct stat sb;
     55    if (stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode)) return;
     56
     57    string safefile = shell_protect(filename);
     58#ifdef __WIN32__
     59# if 1
     60    static int win95 = -1;
     61    if (win95 == -1) {
     62        OSVERSIONINFO info;
     63        memset(&info, 0, sizeof(OSVERSIONINFO));
     64        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
     65        if (GetVersionEx(&info)) {
     66            win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
     67        }
     68    }
     69
     70    if (win95) {
     71        // for 95 like systems:
     72        system("deltree /y \"" + safefile + "\"");
     73    } else {
     74        // for NT like systems:
     75        system("rd /s /q \"" + safefile + "\"");
     76    }
     77# else
     78    safefile.append("\0", 2);
     79    SHFILEOPSTRUCT shfo;
     80    memset((void*)&shfo, 0, sizeof(shfo));
     81    shfo.hwnd = 0;
     82    shfo.wFunc = FO_DELETE;
     83    shfo.pFrom = safefile.data();
     84    shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT;
     85    (void)SHFileOperation(&shfo);
     86# endif
     87#else
     88    system("rm -rf " + safefile);
     89#endif
     90}
  • xapian-omega-1.0.7a/utils.h

    diff -u  xapian-omega-1.0.7a/utils.h.orig
    old new  
    3749/** Converts a string to an int. */
    3850int string_to_int(const std::string & s);
    3951
     52void rm_rf(const std::string &filename);
     53
    4054#endif
  • xapian-omega-1.0.7a/xapian-omega.spec.in

    diff -u  xapian-omega-1.0.7a/xapian-omega.spec.in.orig
    old new  
    7778/var/www/icons/omega
    7879%{_datadir}/%{name}
    7980%config(noreplace) /etc/omega.conf
    80 %doc %{_datadir}/doc/%{name}-%{version}
     81%docdir /usr/share/doc/%{name}-%{version}
     82%doc AUTHORS ChangeLog COPYING NEWS README TODO
    8183# man pages may be gzipped, hence the trailing wildcard.
    8284%{_mandir}/man1/omindex.1*
    8385%{_mandir}/man1/scriptindex.1*