Ticket #282: xapian-omega-trunk-r16058-from-ticket-285-and-cleaned-up-updated-2011-12-06.patch

File xapian-omega-trunk-r16058-from-ticket-285-and-cleaned-up-updated-2011-12-06.patch, 47.9 KB (added by Olly Betts, 12 years ago)

Patch against trunk r16062

  • xapian-applications/omega/ChangeLog

    diff --git a/xapian-applications/omega/ChangeLog b/xapian-applications/omega/ChangeLog
    index 598304a..9c0abdc 100644
    a b  
     12006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com>
     2
     3        omega-0.9.6c:
     4        * omindex.cc: Fix wrong timestamp comparison in cache logic
     5        * outlook2text.in: New script
     6
     72006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com>
     8
     9        omega-0.9.6a:
     10        * omindex.cc: Added cached virtual directories zip,msg,pst,...).
     11        Consistently log stderr to /var/log/omega/omindex-error.log.
     12
    113Sat Oct 29 14:49:40 GMT 2011  Olly Betts <olly@survex.com>
    214
    315        * docs/omegascript.rst: Add note to discourage use of percentage
  • xapian-applications/omega/Makefile.am

    diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
    index 6ee0a07..2505a5a 100644
    a b pkglibbindir = $(pkglibdir)/bin  
    7878pkglibbin_PROGRAMS = omega
    7979dist_pkglibbin_SCRIPTS = outlookmsg2html
    8080bin_PROGRAMS = omindex scriptindex
     81dist_libexec_SCRIPTS = outlook2text mimeexplode msgconvert.pl
    8182dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
    8283
    8384check_PROGRAMS = htmlparsetest md5test utf8converttest
    dist_man_MANS = omindex.1 scriptindex.1  
    160161MAINTAINERCLEANFILES = $(dist_man_MANS)
    161162endif
    162163
     164CLEANFILES = outlook2text
     165
     166outlook2text: $(srcdir)/outlook2text.in Makefile
     167        sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@
     168
    163169if DOCUMENTATION_RULES
    164170omindex.1: omindex$(EXEEXT) makemanpage
    165171        ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1
  • new file xapian-applications/omega/mimeexplode

    diff --git a/xapian-applications/omega/mimeexplode b/xapian-applications/omega/mimeexplode
    new file mode 100644
    index 0000000..70743ab
    - +  
     1#!/usr/bin/perl -w
     2
     3=head1 NAME
     4
     5mimeexplode - explode one or more MIME messages
     6
     7=head1 SYNOPSIS
     8
     9    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...
     10
     11    someprocess | mimeexplode -
     12
     13=head1 DESCRIPTION
     14
     15Takes one or more files from the command line that contain MIME
     16messages, and explodes their contents out into subdirectories
     17of the current working directory.  The subdirectories are
     18just called C<msg0>, C<msg1>, C<msg2>, etc.  Existing directories are
     19skipped over.
     20
     21The message information is output to the stdout, like this:
     22
     23    Message: msg3 (inputfile1.msg)
     24        Part: msg3/filename-1.dat (text/plain)
     25        Part: msg3/filename-2.dat (text/plain)
     26    Message: msg5 (input-file2.msg)
     27        Part: msg5/dir.gif (image/gif)
     28        Part: msg5/face.jpg (image/jpeg)
     29    Message: msg6 (infile3)
     30        Part: msg6/filename-1.dat (text/plain)
     31
     32This was written as an example of the MIME:: modules in the
     33MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
     34way of splitting a MIME message if you need to decode something, and
     35you don't have a MIME mail reader on hand.
     36
     37=head1 COMMAND LINE OPTIONS
     38
     39-d outdir
     40
     41=head1 AUTHOR
     42
     43Eryq C<eryq@zeegee.com>, in a big hurry...
     44Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir
     45
     46=cut
     47
     48#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!
     49
     50require 5.001;
     51
     52use strict;
     53use vars;
     54
     55use MIME::Parser;
     56use Getopt::Std;
     57my %opts;
     58my $outbase = '';
     59my $postfix = '';
     60
     61#------------------------------------------------------------
     62# make_msg - make and return the name of a msgXXX directory
     63#------------------------------------------------------------
     64
     65#ignored
     66#sub make_msg {
     67#    while (-d "msg$Msgno") {
     68#       ++$Msgno;
     69#       die "self-imposed limit reached" if $Msgno == 256;
     70#    }
     71#   mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!";
     72#    "msg$Msgno";
     73#}
     74
     75#------------------------------------------------------------
     76# dump_entity - dump an entity's file info
     77#------------------------------------------------------------
     78sub dump_entity {
     79    my $ent = shift;
     80    my @parts = $ent->parts;
     81
     82    if (@parts) {        # multipart...
     83        map { dump_entity($_) } @parts;
     84    }
     85    else {               # single part...
     86        print "    Part: ", $ent->bodyhandle->path,
     87              " (", scalar($ent->head->mime_type), ")\n";
     88    }
     89}
     90
     91#------------------------------------------------------------
     92# main
     93#------------------------------------------------------------
     94sub main {
     95    my $file;
     96    my $entity;
     97
     98    # make sure the same message gets exploded into the same dir
     99    getopts('d:', \%opts);
     100    $outbase = $opts{d} ? $opts{d} : "msg0";
     101    my $outdir = $outbase;
     102
     103    # Go through messages:
     104    @ARGV or unshift @ARGV, "-";
     105    while (defined($file = shift @ARGV)) {
     106
     107      # Sanity:
     108      (-d $outdir) or mkdir "$outdir",0755;
     109      (-w "$outdir") or die "cwd $outdir not writable!";
     110      #my $msgdir = make_msg();
     111      #print "Message: $msgdir ($file)\n";
     112
     113      # Create a new parser object:
     114      my $parser = new MIME::Parser;
     115      ### $parser->parse_nested_messages('REPLACE');
     116
     117      # Optional: set up parameters that will affect how it extracts
     118      #   documents from the input stream:
     119      $parser->output_dir($outdir);
     120
     121      # Parse an input stream:
     122      open FILE, $file or die "couldn't open $file";
     123      $entity = $parser->read(\*FILE) or
     124        print STDERR "Couldn't parse MIME in $file; continuing...\n";
     125      close FILE;
     126
     127      # Congratulations: you now have a (possibly multipart) MIME entity!
     128      dump_entity($entity) if $entity;
     129      ### $entity->dump_skeleton if $entity;
     130
     131      $postfix++;
     132      $outdir = $outbase.$postfix;
     133    }
     134    1;
     135}
     136
     137exit (&main ? 0 : -1);
     138#------------------------------------------------------------
     1391;
     140
  • new file xapian-applications/omega/msgconvert.pl

    diff --git a/xapian-applications/omega/msgconvert.pl b/xapian-applications/omega/msgconvert.pl
    new file mode 100644
    index 0000000..cf32079
    - +  
     1#!/usr/bin/perl -w
     2#
     3# msgconvert.pl:
     4#
     5# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
     6#
     7# Copyright 2002, 2004, 2006 Matijs van Zuijlen
     8#
     9# This program is free software; you can redistribute it and/or modify it
     10# under the terms of the GNU General Public License as published by the
     11# Free Software Foundation; either version 2 of the License, or (at your
     12# option) any later version.
     13#
     14# This program is distributed in the hope that it will be useful, but
     15# WITHOUT ANY WARRANTY; without even the implied warranty of
     16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
     17# Public License for more details.
     18#
     19# CHANGES:
     20# 20020715  Recognize new items 'Cc', mime type of attachment, long
     21#           filename of attachment, and full headers. Attachments turn out
     22#           to be numbered, so a regexp is now used to recognize label of
     23#           items that are attachments.
     24# 20020831  long file name will definitely be used if present. Full headers
     25#           and mime type information are used when present. Created
     26#           generic system for specifying known items to be skipped.
     27#           Unexpected contents is never reason to bail out anymore. Added
     28#           support for usage message and option processing (--verbose).
     29# 20040104  Handle address data slightly better, make From line less fake,
     30#           make $verbose and $skippable_entries global vars, handle HTML
     31#           variant of body text if present (though not optimally).
     32# 20040214  Fix typos and incorrect comments.
     33# 20040307  - Complete rewrite: All functional parts are now in the package
     34#             MSGParser;
     35#           - Creation of MIME::Entity object is delayed until the output
     36#             routines, which means all data is known; This means I can
     37#             create a multipart/alternative body.
     38#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
     39#             the information).
     40# 20040514  Check if $self->{HEAD} actually exists before trying to add its
     41#           contents to the output Mime object's header data.
     42#           (Bug reported by Thomas Ng).
     43#           Don't produce multipart messages if not needed.
     44#           (Bug reported by Justin B. Scout).
     45# 20040529  Correctly format OLEDATE.
     46# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
     47#           - Use address data to make To: and Cc: lines complete
     48#           - Use the in-reply-to property
     49#           - More unknown properties named.
     50#           - Found another property containing an SMTP address.
     51#           - Put non-SMTP type addresses back in output.
     52# 20040825  Replace 'our' to declare globals with 'use vars'. This means
     53#           the globals are now properly scoped inside the package and not
     54#           the file.
     55#           This also fixes the bug that this program did not work on perl
     56#           versions below 5.6. (Bug reported by Tim Gustafson)
     57# 20060218  More sensible encoding warnings.
     58# 20060219  Move OLE parsing to main program.
     59#           Parse nested MSG files (Bug reported by Christof Lukas).
     60# 20060225  Simplify code.
     61#
     62
     63#
     64# Import modules.
     65#
     66package MSGParser;
     67use strict;
     68use OLE::Storage_Lite;
     69use MIME::Entity;
     70use MIME::Parser;
     71use Date::Format;
     72use POSIX qw(mktime);
     73use constant DIR_TYPE => 1;
     74use constant FILE_TYPE => 2;
     75
     76use vars qw($skipproperties $skipheaders);
     77#
     78# Descriptions partially based on mapitags.h
     79#
     80$skipproperties = {
     81  # Envelope properties
     82  '000B' => "Conversation key?",
     83  '001A' => "Type of message",
     84  '003B' => "Sender address variant",
     85  '003D' => "Contains 'Re: '",
     86  '003F' => "'recieved by' id",
     87  '0040' => "'recieved by' name",
     88  '0041' => "Sender variant address id",
     89  '0042' => "Sender variant name",
     90  '0043' => "'recieved representing' id",
     91  '0044' => "'recieved representing' name",
     92  '0046' => "Read receipt address id",
     93  '0051' => "'recieved by' search key",
     94  '0052' => "'recieved representing' search key",
     95  '0053' => "Read receipt search key",
     96  '0064' => "Sender variant address type",
     97  '0065' => "Sender variant address",
     98  '0070' => "Conversation topic",
     99  '0071' => "Conversation index",
     100  '0075' => "'recieved by' address type",
     101  '0076' => "'recieved by' email address",
     102  '0077' => "'recieved representing' address type",
     103  '0078' => "'recieved representing' email address",
     104  '007F' => "something like a message id",
     105  # Recipient properties
     106  '0C19' => "Reply address variant",
     107  '0C1D' => "Reply address variant",
     108  '0C1E' => "Reply address type",
     109  # Non-transmittable properties
     110  '0E02' => "?Should BCC be displayed",
     111  '0E0A' => "sent mail id",
     112  '0E1D' => "Subject w/o Re",
     113  '0E27' => "64 bytes: Unknown",
     114  '0FF6' => "Index",
     115  '0FF9' => "Index",
     116  '0FFF' => "Address variant",
     117  # Content properties
     118  '1008' => "Summary or something",
     119  '1009' => "RTF Compressed",
     120  # 'Common property'
     121  '3001' => "Display name",
     122  '3002' => "Address Type",
     123  '300B' => "'Search key'",
     124  # Attachment properties
     125  '3702' => "Attachment encoding",
     126  '3703' => "Attachment extension",
     127  '3709' => "'Attachment rendering'", # Maybe an icon or something?
     128  '3713' => "Icon URL?",
     129  # 'Mail user'
     130  '3A20' => "Address variant",
     131  # 3900 -- 39FF: 'Address book'
     132  '39FF' => "7 bit display name",
     133  # 'Display table properties'
     134  '3FF8' => "Routing data?",
     135  '3FF9' => "Routing data?",
     136  '3FFA' => "Routing data?",
     137  '3FFB' => "Routing data?",
     138  # 'Transport-defined envelope property'
     139  '4029' => "Sender variant address type",
     140  '402A' => "Sender variant address",
     141  '402B' => "Sender variant name",
     142  '5FF6' => "Recipient name",
     143  '5FF7' => "Recipient address variant",
     144  # 'Provider-defined internal non-transmittable property'
     145  '6740' => "Unknown, binary data",
     146  # User defined id's
     147  '8000' => "Content Class",
     148  '8002' => "Unknown, binary data",
     149};
     150
     151$skipheaders = {
     152  "MIME-Version" => 1,
     153  "Content-Type" => 1,
     154  "Content-Transfer-Encoding" => 1,
     155  "X-Mailer" => 1,
     156  "X-Msgconvert" => 1,
     157  "X-MS-Tnef-Correlator" => 1,
     158  "X-MS-Has-Attach" => 1,
     159};
     160
     161use constant ENCODING_UNICODE => '001F';
     162use constant KNOWN_ENCODINGS => {
     163    '000D' => 'Directory',
     164    '001F' => 'Unicode',
     165    '001E' => 'Ascii?',
     166    '0102' => 'Binary',
     167};
     168
     169use constant MAP_ATTACHMENT_FILE => {
     170  '3701' => ["DATA",        0], # Data
     171  '3704' => ["SHORTNAME",   1], # Short file name
     172  '3707' => ["LONGNAME",    1], # Long file name
     173  '370E' => ["MIMETYPE",    1], # mime type
     174  '3716' => ["DISPOSITION", 1], # disposition
     175};
     176
     177use constant MAP_SUBITEM_FILE => {
     178  '1000' => ["BODY_PLAIN",      0], # Body
     179  '1013' => ["BODY_HTML",       0], # HTML Version of body
     180  '0037' => ["SUBJECT",         1], # Subject
     181  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
     182  '007D' => ["HEAD",            1], # Full headers
     183  '0C1A' => ["FROM",            1], # Reply-To: Name
     184  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
     185  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
     186  '0E04' => ["TO",              1], # To: Names
     187  '0E03' => ["CC",              1], # Cc: Names
     188  '1035' => ["MESSAGEID",       1], # Message-Id
     189  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
     190};
     191
     192use constant MAP_ADDRESSITEM_FILE => {
     193  '3001' => ["NAME",            1], # Real name
     194  '3002' => ["TYPE",            1], # Address type
     195  '403D' => ["TYPE",            1], # Address type
     196  '3003' => ["ADDRESS",         1], # Address
     197  '403E' => ["ADDRESS",         1], # Address
     198  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
     199};
     200
     201#
     202# Main body of module
     203#
     204
     205sub new {
     206  my $that = shift;
     207  my $class = ref $that || $that;
     208
     209  my $self = {
     210    ATTACHMENTS => [],
     211    ADDRESSES => [],
     212    VERBOSE => 0,
     213    HAS_UNICODE => 0,
     214    FROM_ADDR_TYPE => "",
     215  };
     216  bless $self, $class;
     217}
     218
     219#
     220# Main sub: parse the PPS tree, and return
     221#
     222sub parse {
     223  my $self = shift;
     224  my $PPS = shift or die "Internal error: No PPS tree";
     225  $self->_RootDir($PPS);
     226}
     227
     228sub mime_object {
     229  my $self = shift;
     230
     231  my $bodymime;
     232  my $mime;
     233
     234  if ($self->_IsMultiPart) {
     235    # Construct a multipart message object
     236
     237    $mime = MIME::Entity->build(Type => "multipart/mixed");
     238
     239    # Set the entity that we'll save the body parts to. If there's more than
     240    # one part, it's a new entity, otherwise, it's the main $mime object.
     241    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
     242      $bodymime = MIME::Entity->build(
     243        Type => "multipart/alternative",
     244        Encoding => "8bit",
     245      );
     246      $mime->add_part($bodymime);
     247    } else {
     248      $bodymime = $mime;
     249    }
     250    if ($self->{BODY_PLAIN}) {
     251      $self->_SaveAttachment($bodymime, {
     252        MIMETYPE => 'text/plain; charset=ISO-8859-1',
     253        ENCODING => '8bit',
     254        DATA => $self->{BODY_PLAIN},
     255        DISPOSITION => 'inline',
     256      });
     257    }
     258    if ($self->{BODY_HTML}) {
     259      $self->_SaveAttachment($bodymime, {
     260        MIMETYPE => 'text/html',
     261        ENCODING => '8bit',
     262        DATA => $self->{BODY_HTML},
     263        DISPOSITION => 'inline',
     264      });
     265    }
     266    foreach my $att (@{$self->{ATTACHMENTS}}) {
     267      $self->_SaveAttachment($mime, $att);
     268    }
     269  } elsif ($self->{BODY_PLAIN}) {
     270    # Construct a single part message object with a plain text body
     271    $mime = MIME::Entity->build(
     272      Type => "text/plain",
     273      Data => $self->{BODY_PLAIN}
     274    );
     275  } elsif ($self->{BODY_HTML}) {
     276    # Construct a single part message object with an HTML body
     277    $mime = MIME::Entity->build(
     278      Type => "text/html",
     279      Data => $self->{BODY_HTML}
     280    );
     281  }
     282
     283  $self->_CopyHeaderData($mime);
     284
     285  $self->_SetHeaderFields($mime);
     286
     287  return $mime;
     288}
     289
     290# Actually output the message in mbox format
     291sub print {
     292  my $self = shift;
     293
     294  my $mime = $self->mime_object;
     295
     296  # Construct From line from whatever we know.
     297  my $string = "";
     298  $string = (
     299    $self->{FROM_ADDR_TYPE} eq "SMTP" ?
     300    $self->{FROM_ADDR} :
     301    'someone@somewhere'
     302  );
     303  $string =~ s/\n//g;
     304
     305  # The date used here is not really important.
     306  print "From ", $string, " ", scalar localtime, "\n";
     307  $mime->print(\*STDOUT);
     308  print "\n";
     309}
     310
     311sub set_verbosity {
     312  my ($self, $verbosity) = @_;
     313  defined $verbosity or die "Internal error: no verbosity level";
     314  $self->{VERBOSE} = $verbosity;
     315}
     316
     317#
     318# Below are functions that walk the PPS tree. The *Dir functions handle
     319# processing the directory nodes of the tree (mainly, iterating over the
     320# children), whereas the *Item functions handle processing the items in the
     321# directory (if such an item is itself a directory, it will in turn be
     322# processed by the relevant *Dir function).
     323#
     324
     325#
     326# RootItem: Check Root Entry, parse sub-entries.
     327# The OLE file consists of a single entry called Root Entry, which has
     328# several children. These children are parsed in the sub SubItem.
     329#
     330sub _RootDir {
     331  my ($self, $PPS) = @_;
     332
     333  foreach my $child (@{$PPS->{Child}}) {
     334    $self->_SubItem($child);
     335  }
     336}
     337
     338sub _SubItem {
     339  my ($self, $PPS) = @_;
     340 
     341  if ($PPS->{Type} == DIR_TYPE) {
     342    $self->_SubItemDir($PPS);
     343  } elsif ($PPS->{Type} == FILE_TYPE) {
     344    $self->_SubItemFile($PPS);
     345  } else {
     346    warn "Unknown entry type: $PPS->{Type}";
     347  }
     348}
     349
     350sub _SubItemDir {
     351  my ($self, $PPS) = @_;
     352
     353  $self->_GetOLEDate($PPS);
     354
     355  my $name = $self->_GetName($PPS);
     356
     357  if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
     358    $self->_AddressDir($PPS);
     359  } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
     360    $self->_AttachmentDir($PPS);
     361  } else {
     362    $self->_UnknownDir($self->_GetName($PPS));
     363  }
     364}
     365
     366sub _SubItemFile {
     367  my ($self, $PPS) = @_;
     368
     369  my $name = $self->_GetName($PPS);
     370  my ($property, $encoding) = $self->_ParseItemName($name);
     371
     372  $self->_MapProperty($self, $PPS->{Data}, $property,
     373    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
     374}
     375
     376sub _AddressDir {
     377  my ($self, $PPS) = @_;
     378
     379  my $address = {
     380    NAME        => undef,
     381    ADDRESS     => undef,
     382    TYPE        => "",
     383  };
     384  foreach my $child (@{$PPS->{Child}}) {
     385    $self->_AddressItem($child, $address);
     386  }
     387  push @{$self->{ADDRESSES}}, $address;
     388}
     389
     390sub _AddressItem {
     391  my ($self, $PPS, $addr_info) = @_;
     392
     393  my $name = $self->_GetName($PPS);
     394
     395  # DIR Entries: There should be none.
     396  if ($PPS->{Type} == DIR_TYPE) {
     397    $self->_UnknownDir($name);
     398  } elsif ($PPS->{Type} == FILE_TYPE) {
     399    my ($property, $encoding) = $self->_ParseItemName($name);
     400    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
     401      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
     402  } else {
     403    warn "Unknown entry type: $PPS->{Type}";
     404  }
     405}
     406
     407sub _AttachmentDir {
     408  my ($self, $PPS) = @_;
     409
     410  my $attachment = {
     411    SHORTNAME   => undef,
     412    LONGNAME    => undef,
     413    MIMETYPE    => 'application/octet-stream',
     414    ENCODING    => 'base64',
     415    DISPOSITION => 'attachment',
     416    DATA        => undef
     417  };
     418  foreach my $child (@{$PPS->{Child}}) {
     419    $self->_AttachmentItem($child, $attachment);
     420  }
     421  push @{$self->{ATTACHMENTS}}, $attachment;
     422}
     423
     424sub _AttachmentItem {
     425  my ($self, $PPS, $att_info) = @_;
     426
     427  my $name = $self->_GetName($PPS);
     428
     429  my ($property, $encoding) = $self->_ParseItemName($name);
     430
     431  if ($PPS->{Type} == DIR_TYPE) {
     432
     433    if ($property eq '3701') {  # Nested MSG file
     434      my $msgp = new MSGParser();
     435      $msgp->parse($PPS);
     436      my $data = $msgp->mime_object->as_string;
     437      $att_info->{DATA} = $data;
     438      $att_info->{MIMETYPE} = 'message/rfc822';
     439      $att_info->{ENCODING} = '8bit';
     440    } else {
     441      $self->_UnknownDir($name);
     442    }
     443
     444  } elsif ($PPS->{Type} == FILE_TYPE) {
     445    $self->_MapProperty($att_info, $PPS->{Data}, $property,
     446      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
     447  } else {
     448    warn "Unknown entry type: $PPS->{Type}";
     449  }
     450}
     451
     452sub _MapProperty {
     453  my ($self, $hash, $data, $property, $map) = @_;
     454
     455  defined $property or return 0;
     456  my $arr = $map->{$property} or return 0;
     457
     458  $arr->[1] and $data =~ s/\000//g;
     459  $hash->{$arr->[0]} = $data;
     460
     461  return 1;
     462}
     463
     464sub _UnknownDir {
     465  my ($self, $name) = @_;
     466
     467  if ($name eq '__nameid_version1 0') {
     468    $self->{VERBOSE}
     469      and warn "Skipping DIR entry $name (Introductory stuff)\n";
     470    return;
     471  }
     472  warn "Unknown DIR entry $name\n";
     473}
     474
     475sub _UnknownFile {
     476  my ($self, $name) = @_;
     477
     478  if ($name eq '__properties_version1 0') {
     479    $self->{VERBOSE}
     480      and warn "Skipping FILE entry $name (Properties)\n";
     481    return;
     482  }
     483
     484  my ($property, $encoding) = $self->_ParseItemName($name);
     485  unless (defined $property) {
     486    warn "Unknown FILE entry $name\n";
     487    return;
     488  }
     489  if ($skipproperties->{$property}) {
     490    $self->{VERBOSE}
     491      and warn "Skipping property $property ($skipproperties->{$property})\n";
     492    return;
     493  } elsif ($property =~ /^80/) {
     494    $self->{VERBOSE}
     495      and warn "Skipping property $property (user-defined property)\n";
     496    return;
     497  } else {
     498    warn "Unknown property $property\n";
     499    return;
     500  }
     501}
     502
     503#
     504# Helper functions
     505#
     506
     507sub _GetName {
     508  my ($self, $PPS) = @_;
     509  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
     510}
     511
     512sub _NormalizeWhiteSpace {
     513  my ($self, $name) = @_;
     514  $name =~ s/\W/ /g;
     515  return $name;
     516}
     517
     518sub _GetOLEDate {
     519  my ($self, $PPS) = @_;
     520  unless (defined ($self->{OLEDATE})) {
     521    # Make Date
     522    my $datearr;
     523    $datearr = $PPS->{Time2nd};
     524    $datearr = $PPS->{Time1st} unless($datearr);
     525    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
     526  }
     527}
     528
     529sub _FormatDate {
     530  my ($self, $datearr) = @_;
     531
     532  # TODO: This is a little convoluted. Directly using strftime didn't seem
     533  # to work.
     534  my $datetime = mktime(@$datearr);
     535  return time2str("%a, %d %h %Y %X %z", $datetime);
     536}
     537
     538# If we didn't get the date from the original header data, we may be able
     539# to get it from the SUBMISSION_ID:
     540# It seems to have the format of a semicolon-separated list of key=value
     541# pairs. The key l has a value with the format:
     542# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
     543# the format YYMMDDHHMMSS.
     544sub _SubmissionIdDate {
     545  my $self = shift;
     546
     547  my $submission_id = $self->{SUBMISSION_ID} or return undef;
     548  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
     549    or return undef;
     550  my $year = $1;
     551  $year += 100 if $year < 20;
     552  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
     553}
     554
     555sub _ParseItemName {
     556  my ($self, $name) = @_;
     557
     558  if ($name =~ /^__substg1 0_(....)(....)$/) {
     559    my ($property, $encoding) = ($1, $2);
     560    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
     561      warn "This MSG file contains Unicode fields."
     562        . " This is currently unsupported.\n";
     563      $self->{HAS_UNICODE} = 1;
     564    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
     565      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
     566    }
     567    return ($property, $encoding);
     568  } else {
     569    return (undef, undef);
     570  }
     571}
     572
     573sub _SaveAttachment {
     574  my ($self, $mime, $att) = @_;
     575
     576  my $ent = $mime->attach(
     577    Type => $att->{MIMETYPE},
     578    Encoding => $att->{ENCODING},
     579    Data => [],
     580    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
     581    Disposition => $att->{DISPOSITION}
     582  );
     583
     584  my $handle;
     585  if ($handle = $ent->open("w")) {
     586    $handle->print($att->{DATA});
     587    $handle->close;
     588  } else {
     589    warn "Could not write data!";
     590  }
     591}
     592
     # Set one part ($partname) of the address record named $adrname to $data
     # (with NUL bytes removed).  A value that is already set is never
     # overwritten: an identical value is only mentioned in verbose mode, a
     # different one is reported as inconsistent.
     #
     # NOTE(review): this looks up $self->{ADDRESSES} as a hash keyed by
     # $adrname, but new() initialises ADDRESSES as an array reference and
     # _AddressDir pushes onto it, so the lookup below would fail with
     # "Not a HASH reference".  No caller is visible in this file; this is
     # presumably a leftover from an earlier design -- confirm before use.
     sub _SetAddressPart {
       my ($self, $adrname, $partname, $data) = @_;

       my $address = $self->{ADDRESSES}->{$adrname};
       $data =~ s/\000//g;
       #warn "Processing address data part $partname : $data\n";
       if (defined ($address->{$partname})) {
         if ($address->{$partname} eq $data) {
           warn "Skipping duplicate but identical address information for"
           . " $partname\n" if $self->{VERBOSE};
         } else {
           warn "Address information $partname inconsistent:\n";
           warn "    Original data: $address->{$partname}\n";
           warn "    New data: $data\n";
         }
       } else {
         $address->{$partname} = $data;
       }
     }
     612
     613# Set header fields
     614sub _AddHeaderField {
     615  my ($self, $mime, $fieldname, $value) = @_;
     616
     617  my $oldvalue = $mime->head->get($fieldname);
     618  return if $oldvalue;
     619  $mime->head->add($fieldname, $value) if $value;
     620}
     621
     622sub _Address {
     623  my ($self, $tag) = @_;
     624  my $name = $self->{$tag} || "";
     625  my $address = $self->{$tag . "_ADDR"} || "";
     626  return "$name <$address>";
     627}
     628
     629# Find SMTP addresses for the given list of names
     630sub _ExpandAddressList {
     631  my ($self, $names) = @_;
     632
     633  my $addresspool = $self->{ADDRESSES};
     634  my @namelist = split /; */, $names;
     635  my @result;
     636  name: foreach my $name (@namelist) {
     637    foreach my $address (@$addresspool) {
     638      if ($name eq $address->{NAME}) {
     639        my $addresstext = $address->{NAME} . " <";
     640        if (defined ($address->{SMTPADDRESS})) {
     641          $addresstext .= $address->{SMTPADDRESS};
     642        } elsif ($address->{TYPE} eq "SMTP") {
     643          $addresstext .= $address->{ADDRESS};
     644        }
     645        $addresstext .= ">";
     646        push @result, $addresstext;
     647        next name;
     648      }
     649    }
     650    push @result, $name;
     651  }
     652  return join ", ", @result;
     653}
     654
     655sub _ParseHead {
     656  my ($self, $data) = @_;
     657  defined $data or return undef;
     658  # Parse full header date if we got that.
     659  my $parser = new MIME::Parser();
     660  $parser->output_to_core(1);
     661  $parser->decode_headers(1);
     662  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
     663  my $entity = $parser->parse_data($data)
     664    or warn "Couldn't parse full headers!";
     665  my $head = $entity->head;
     666  $head->unfold;
     667  return $head;
     668}
     669
     670# Find out if we need to construct a multipart message
     671sub _IsMultiPart {
     672  my $self = shift;
     673
     674  return (
     675    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
     676      or @{$self->{ATTACHMENTS}}>0
     677  );
     678}
     679
     680# Copy original header data.
     681# Note: This should contain the Date: header.
     682sub _CopyHeaderData {
     683  my ($self, $mime) = @_;
     684
     685  my $head = $self->_ParseHead($self->{HEAD}) or return;
     686
     687  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
     688    foreach my $value ($head->get_all($tag)) {
     689      $mime->head->add($tag, $value);
     690    }
     691  }
     692}
     693
     694# Set header fields
     695sub _SetHeaderFields {
     696  my ($self, $mime) = @_;
     697
     698  # If we didn't get the date from the original header data, we may be able
     699  # to get it from the SUBMISSION_ID:
     700  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());
     701
     702  # Third and last chance to set the Date: header; this uses the date the
     703  # MSG file was saved.
     704  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
     705  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
     706  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
     707  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
     708  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
     709  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
     710  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
     711  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
     712}
     713
     714package main;
     715use Getopt::Long;
     716use Pod::Usage;
     717
     718# Setup command line processing.
     719my $verbose = '';
     720my $help = '';      # Print help message and exit.
     721GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
     722pod2usage(1) if $help;
     723
     724# Get file name
     725my $file = $ARGV[0];
     726defined $file or pod2usage(2);
     727warn "Will parse file: $file\n" if $verbose;
     728
     729# Load and parse MSG file (is OLE)
     730my $Msg = OLE::Storage_Lite->new($file);
     731my $PPS = $Msg->getPpsTree(1);
     732$PPS or die "$file must be an OLE file";
     733
     734# parse PPS tree
     735my $parser = new MSGParser();
     736$parser->set_verbosity(1) if $verbose;
     737$parser->parse($PPS);
     738$parser->print();
     739
     740#
     741# Usage info follows.
     742#
     743__END__
     744
     745=head1 NAME
     746
     747msgconvert.pl - Convert Outlook .msg files to mbox format
     748
     749=head1 SYNOPSIS
     750
     751msgconvert.pl [options] <file.msg>
     752
     753  Options:
     754    --verbose   be verbose
     755    --help      help message
     756
     757=head1 OPTIONS
     758
     759=over 8
     760
     761=item B<--verbose>
     762
     763    Print information about skipped parts of the .msg file.
     764
     765=item B<--help>
     766
         Print a brief help message.

     =back

     769=head1 DESCRIPTION
     770
     771This program will output the message contained in file.msg in mbox format
     772on stdout. It will complain about unrecognized OLE parts on
     773stderr.
     774
     775=head1 BUGS
     776
     777Not all data that's in the .MSG file is converted. There simply are some
     778parts whose meaning escapes me. One of these must contain the date the
     779message was sent, for example. Formatting of text messages will also be
     780lost. YMMV.
     781
     782=cut
  • xapian-applications/omega/omindex.cc

    diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
    index 5d91036..90224fb 100644
    a b  
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
    77 * Copyright 2009 Frank J Bruzzaniti
     8 * Copyright 2006,2007,2008 AVL List GesmbH
    89 *
    910 * This program is free software; you can redistribute it and/or
    1011 * modify it under the terms of the GNU General Public License as
     
    6566
    6667#include "gnu_getopt.h"
    6768
     69#ifndef LIBEXECDIR
     70// must have ending slash
     71//# define LIBEXECDIR "/usr/lib/omega/bin/"
     72# define LIBEXECDIR ""
     73#endif
     74#ifndef PKGDATADIR
     75// must have ending slash
     76# define PKGDATADIR "/usr/share/omega/"
     77#endif
     78
    6879using namespace std;
    6980
    7081#define TITLE_SIZE 128
    static bool follow_symlinks = false;  
    7889static bool ignore_exclusions = false;
    7990static bool spelling = false;
    8091static bool verbose = false;
     92string error_log; /* used in runfilter.cc */
     93static string baseurl;
     94static string dbpath;
     95static string cache_dir;
    8196static enum {
    8297    EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP
    8398} empty_body = EMPTY_BODY_WARN;
    static time_t last_mod_max;  
    98113// text are common, so we handle these with a std::map.
    99114static map<string, string> commands;
    100115
     116static void
     117index_directory(const string &path, const string &url_, size_t depth_limit,
     118                map<string, string>& mime_map);
     119
    101120inline static bool
    102121p_notalnum(unsigned int c)
    103122{
    void  
    310329index_mimetype(const string & file, const string & url, const string & ext,
    311330               const string &mimetype, DirectoryIterator &d);
    312331
     332static
     333void mkdir_p(const string &path, mode_t mode) {
     334    (void)mode; // FIXME
     335#ifdef __WIN32__
     336    system(("mkdir \"" + shell_protect(path) + "\"").c_str());
     337#else
     338    if (system(("mkdir -p " + shell_protect(path)).c_str()) < 0) { /* FIXME */ }
     339#endif
     340}
     341
     342/*
     343 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there
     344 */
     345static void
     346index_cached_directory(size_t depth_limit,
     347                       const string &file,
     348                       const string &url,
     349                       const string &ext,
     350                       const string &cmd,
     351                       map<string, string>& mime_map)
     352{
     353    string oldroot = root;
     354    root = cache_dir;
     355    string cache = root+"/."+ext;
     356    string cachedir = cache+url;
     357    struct stat statfile, statcache;
     358    bool extract_cache;
     359#ifdef HAVE_LSTAT
     360    lstat(file.c_str(), &statfile);
     361    lstat(cachedir.c_str(), &statcache);
     362#else
     363    stat(file.c_str(), &statfile);
     364    stat(cachedir.c_str(), &statcache);
     365#endif
     366    extract_cache = true;
     367    // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago,
     368    // then it was already extracted.
     369    if (S_ISDIR(statcache.st_mode)
     370        && S_ISREG(statfile.st_mode)
     371        && (statfile.st_mtime < statcache.st_mtime)
     372        && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call
     373    {
     374        // but is it in the database also? prevent from deleting skipped files
     375        if (verbose)
     376            cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction "
     377                 // << statfile.st_mtime << " < " << statcache.st_mtime
     378                 << endl;
     379        extract_cache = false;
     380    }
     381    if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) {
     382        // If last_mod > last_mod_max, we know for sure that the file is new
     383        // or updated.
     384        if (statfile.st_mtime <= last_mod_max) {
     385            // check database timestamp for cached container, esp. for cleaned up caches.
     386            // if already in db we need not to extract again
     387            string urlterm("U");
     388            urlterm += baseurl;
     389            urlterm += "/."+ext+url;
     390            if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
     391                urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
     392       
     393            Xapian::PostingIterator p = db.postlist_begin(urlterm);
     394            if (p != db.postlist_end(urlterm)) {
     395                Xapian::docid docid = *p;
     396                Xapian::Document doc = db.get_document(docid);
     397                string value = doc.get_value(VALUE_LASTMOD);
     398                time_t old_last_mod = binary_string_to_int(value);
     399                if (statfile.st_mtime <= old_last_mod) {
     400                    if (verbose)
     401                        cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl;
     402                    // The docid should be in updated - the only valid
     403                    // exception is if the URL was long and hashed to the
     404                    // same URL as an existing document indexed in the same
     405                    // batch.
     406                    if (usual(docid < updated.size() && !updated[docid])) {
     407                        updated[docid] = true;
     408                        --old_docs_not_seen;
     409                    }
     410                    root = oldroot;
     411                    return;
     412                }
     413            }
     414        }
     415    }
     416
     417    if (extract_cache) {
     418        if (verbose)
     419            cout << "[EXTRACT into cache " << cachedir << "]" << endl;
     420        if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode))
     421            cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" "
     422                 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL)
     423                 << endl;
     424        if (!S_ISDIR(statcache.st_mode))
     425            mkdir_p(cachedir, 0755);
     426        stdout_to_string(cmd);
     427#ifndef __WIN32__
     428        stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir));
     429#endif
     430#ifdef HAVE_LSTAT
     431        lstat(cachedir.c_str(), &statcache);
     432#else
     433        stat(cachedir.c_str(), &statcache);
     434#endif
     435    }
     436
     437    if (S_ISDIR(statcache.st_mode)) {
     438        if (depth_limit == 1) {
     439            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     440        } else {
     441            // max loop 5, magic start: /.ext+file
     442            index_directory(file + "/."+ext+url, url, depth_limit + 5, mime_map);
     443            if (verbose)
     444                cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl;
     445            rm_rf(cachedir);
     446        }
     447    }
     448    else { // no -p would be fatal here
     449        cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl;
     450    }
     451    root = oldroot;
     452}
     453
    313454static void
    314455index_file(const string &file, const string &url, DirectoryIterator & d,
    315            map<string, string>& mime_map)
     456           map<string, string>& mime_map, size_t depth_limit)
    316457{
    317458    string ext;
    318459    const char * dot_ptr = strrchr(d.leafname(), '.');
    index_file(const string &file, const string &url, DirectoryIterator & d,  
    340481
    341482    string mimetype;
    342483    if (mt == mime_map.end()) {
     484        if (strcasecmp(d.leafname(), "mbox") == 0) {
     485            // Special filename.
     486            mimetype = "message/rfc822";
     487            goto got_mimetype;
     488        }
     489
    343490        mimetype = d.get_magic_mimetype();
    344491        if (mimetype.empty()) {
    345492            skip(file, "Unknown extension and unrecognised format",
    index_file(const string &file, const string &url, DirectoryIterator & d,  
    352499        mimetype = mt->second;
    353500    }
    354501
     502got_mimetype:
     503
    355504    if (verbose)
    356505        cout << "Indexing \"" << file.substr(root.size()) << "\" as "
    357506             << mimetype << " ... ";
    index_mimetype(const string & file, const string & url, const string & ext,  
    418567            }
    419568        }
    420569    }
     570    // add the db basename to cache_dir
     571    {
     572        ensure_tmpdir(); // FIXME: be lazy!
     573        cache_dir = tmpdir;
     574        const char *p = strrchr(dbpath.c_str(), '/');
     575        // on windows only
     576        if (!p) p = strrchr(dbpath.c_str(), '\\');
     577        if (p) { p++; } else { p = dbpath.c_str(); }
     578        cache_dir += p;
     579    }
    421580
    422581    if (verbose) cout << flush;
    423582
    index_mimetype(const string & file, const string & url, const string & ext,  
    482641            } else {
    483642                // FIXME: What charset is the file?  Look at contents?
    484643            }
     644#if 0 // FIXME: this won't work as omindex will have the database locked...
     645    } else if (mimetype == "message/rfc822") { // // => mbox2script
     646        //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla)
     647        string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| "
     648            "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script";
     649        try {
     650            dump = stdout_to_string(cmd);
     651        } catch (ReadError) {
     652            cout << "\"" << cmd << "\" failed - skipping" << endl;
     653            return;
     654        }
     655#endif
    485656        } else if (mimetype == "application/pdf") {
    486657            string safefile = shell_protect(file);
    487658            string cmd = "pdftotext -enc UTF-8 " + safefile + " -";
    index_mimetype(const string & file, const string & url, const string & ext,  
    711882
    712883            generate_sample_from_csv(dump, sample);
    713884        } else if (mimetype == "application/vnd.ms-outlook") {
    714             string cmd = get_pkglibbindir() + "/outlookmsg2html " + shell_protect(file);
    715             MyHtmlParser p;
    716             p.ignore_metarobots();
     885            string oldroot = root;
     886            struct stat statcache;
     887            char olddir[256];
     888           
     889            if (depth_limit == 1) {
     890                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     891                return;
     892            }
     893            string cmd = LIBEXECDIR"outlook2text "+shell_protect(file);
     894            // unpack multiparts and attachments. so we have to chdir first
     895            string fulldir = cache_dir+"/.msg"+url;
     896            if (getcwd(olddir, 256) == NULL) { /* FIXME */ }
     897#ifdef HAVE_LSTAT
     898            lstat(fulldir.c_str(), &statcache);
     899#else
     900            stat(fulldir.c_str(), &statcache);
     901#endif
     902            if (!S_ISDIR(statcache.st_mode)) {
     903                mkdir_p(fulldir, 0755);
     904            }
    717905            try {
    718                 dump = stdout_to_string(cmd);
    719                 // FIXME: what should the default charset be?
    720                 p.parse_html(dump, "iso-8859-1", false);
    721             } catch (const string & newcharset) {
    722                 p.reset();
    723                 p.ignore_metarobots();
    724                 p.parse_html(dump, newcharset, true);
     906                if (chdir(fulldir.c_str()) < 0) { /* FIXME */ }
     907                size_t new_limit = depth_limit;
     908                if (new_limit) --new_limit;
     909                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     910                if (chdir(olddir) < 0) { /* FIXME */ }
    725911            } catch (ReadError) {
    726                 skip_cmd_failed(file, cmd);
    727                 return;
     912                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     913                if (chdir(olddir) < 0) { /* FIXME */ }
     914                root = oldroot;
     915            } catch (...) {
     916                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     917                if (chdir(olddir) < 0) { /* FIXME */ }
     918                root = oldroot;
     919                throw;
    728920            }
    729             dump = p.dump;
    730             title = p.title;
    731             keywords = p.keywords;
    732             sample = p.sample;
    733             author = p.author;
     921            return;
    734922        } else if (mimetype == "image/svg+xml") {
    735923            SvgParser svgparser;
    736924            svgparser.parse_html(d.file_to_string());
    index_mimetype(const string & file, const string & url, const string & ext,  
    759947            if (idx != string::npos) {
    760948                dump.assign(desc, idx + 1, string::npos);
    761949            }
     950        } else if (mimetype == "application/x-zip") {
     951            string oldroot = root;
     952            if (depth_limit == 1) {
     953                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     954                return;
     955            }
     956            // overwrite
     957            string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+url+"/");
     958            try {
     959                size_t new_limit = depth_limit;
     960                if (new_limit) --new_limit;
     961                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     962            } catch (ReadError) {
     963                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     964                root = oldroot;
     965            } catch (...) {
     966                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     967                root = oldroot;
     968                throw;
     969            }
     970            return;
     971        } else if (mimetype == "application/x-rar") {
     972            string oldroot = root;
     973            if (depth_limit == 1) {
     974                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     975                return;
     976            }
     977
     978            // overwrite
     979            string cmd = "unrar x -o+ " +shell_protect(file) + " "
     980                + shell_protect(cache_dir+"/.rar"+url+"/");
     981            try {
     982                size_t new_limit = depth_limit;
     983                if (new_limit) --new_limit;
     984                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     985            } catch (ReadError) {
     986                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     987                root = oldroot;
     988            } catch (...) {
     989                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     990                root = oldroot;
     991                throw;
     992            }
     993            return;
     994        } else if (mimetype == "application/vnd.ms-outlook-pst") {
     995            string oldroot = root;
     996            if (depth_limit == 1) {
     997                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     998                return;
     999            }
     1000            // unpack attachments also, together with mbox files
     1001            string cmd = "readpst -r -cv -w -o "
     1002                + shell_protect(cache_dir+"/.pst"+url+"/")+" "+shell_protect(file);
     1003            try {
     1004                size_t new_limit = depth_limit;
     1005                if (new_limit) --new_limit;
     1006                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     1007            } catch (ReadError) {
     1008                root = oldroot;
     1009                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     1010            } catch (...) {
     1011                root = oldroot;
     1012                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1013                throw;
     1014            }
     1015            return;
    7621016        } else {
    7631017            // Don't know how to index this type.
    7641018            skip_unknown_mimetype(file, mimetype);
    index_directory(const string &path, const string &url_, size_t depth_limit,  
    9751229                        break;
    9761230                    }
    9771231                    case DirectoryIterator::REGULAR_FILE:
    978                         index_file(file, url, d, mime_map);
     1232                        index_file(file, url, d, mime_map, depth_limit);
    9791233                        break;
    9801234                    default:
    9811235                        skip(file, "Not a regular file",
    main(int argc, char **argv)  
    9991253    bool overwrite = false;
    10001254    // If delete_removed_documents is true, delete any documents we don't see.
    10011255    bool delete_removed_documents = true;
    1002     string baseurl;
    10031256    size_t depth_limit = 0;
    10041257
    10051258    static const struct option longopts[] = {
    main(int argc, char **argv)  
    11241377    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
    11251378    mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email
    11261379
     1380    //  Outlook message folder:
     1381    mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst)
     1382
     1383    // Miscellaneous compound formats:
     1384    mime_map["mbox"] = "message/rfc822";                // => mbox2omega
     1385    mime_map["mbx"] = "message/rfc822";                // => mbox2omega
     1386#ifndef _MSC_VER
     1387    mime_map["zip"] = "application/x-zip"; // recursive scanning
     1388    mime_map["rar"] = "application/x-rar"; // recursive scanning
     1389#endif
     1390
    11271391    // Perl:
    11281392    mime_map["pl"] = "text/x-perl";
    11291393    mime_map["pm"] = "text/x-perl";
    main(int argc, char **argv)  
    11801444        argv[1] = const_cast<char *>("--version");
    11811445    }
    11821446
    1183     string dbpath;
    11841447    int getopt_ret;
    11851448    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:i",
    11861449                                         longopts, NULL)) != -1) {
    main(int argc, char **argv)  
    13301593        baseurl += '/';
    13311594    }
    13321595
     1596    string log_dir = "./"; // FIXME: need to set log_dir to something appropriate.
     1597    error_log = " 2>>"+log_dir+"omindex-error.log";
     1598
    13331599    if (optind >= argc || optind + 2 < argc) {
    13341600        cerr << PROG_NAME": you must specify a directory to index.\n"
    13351601"Do this either as a single directory (corresponding to the base URL)\n"
  • new file xapian-applications/omega/outlook2text.in

    diff --git a/xapian-applications/omega/outlook2text.in b/xapian-applications/omega/outlook2text.in
    new file mode 100644
    index 0000000..b7cf3e2
    - +  
     1#! /bin/sh
     2# converts msg to mbox and extract attachments
     3# either be in the cache dir, or accept it as 2nd arg
     4if [ -n $2 ]; then
     5  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
     6else
     7  # already is in the cache dir
     8  base=`basename "$1" .msg`
     9  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
     10fi
  • xapian-applications/omega/runfilter.cc

    diff --git a/xapian-applications/omega/runfilter.cc b/xapian-applications/omega/runfilter.cc
    index bb5e4fd..e77bc56 100644
    a b  
    5555
    5656using namespace std;
    5757
     58extern string error_log;
     59
    5860string
    5961stdout_to_string(const string &cmd)
    6062{
    6163    string out;
     64    string tmp = cmd;
     65    tmp += error_log;
    6266#if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT
    6367    // We want to be able to get the exit status of the child process.
    6468    signal(SIGCHLD, SIG_DFL);
    stdout_to_string(const string &cmd)  
    101105        }
    102106#endif
    103107
    104         execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);
     108        execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL);
    105109        _exit(-1);
    106110    }
    107111
    stdout_to_string(const string &cmd)  
    168172        throw ReadError();
    169173    }
    170174#else
    171     FILE * fh = popen(cmd.c_str(), "r");
     175    FILE * fh = popen(tmp.c_str(), "r");
    172176    if (fh == NULL) throw ReadError();
    173177    while (!feof(fh)) {
    174178        char buf[4096];
  • xapian-applications/omega/utils.cc

    diff --git a/xapian-applications/omega/utils.cc b/xapian-applications/omega/utils.cc
    index 797c47d..3e94268 100644
    a b  
    2626#include <cassert>
    2727#include <stdio.h> // for sprintf/snprintf
    2828#include <cstdlib>
     29#include <cstring>
     30#include "safesysstat.h"
    2931
    3032#include <string>
    3133
    3234using namespace std;
    3335
     36#ifdef __WIN32__
     37#include "safewindows.h"
     38#endif
     39
    3440// This ought to be enough for any of the conversions below.
    3541#define BUFSIZE 100
    3642
    using namespace std;  
    4046    int len = SNPRINTF(buf, BUFSIZE, (FMT), val);\
    4147    if (len == -1 || len > BUFSIZE) return string(buf, BUFSIZE);\
    4248    return string(buf, len);
     49/// Allow system to work directly on C++ strings.
     50inline int system(const string &command) { return system(command.c_str()); }
     51
     52// Duplicated from omindex.cc - FIXME
     53static string
     54shell_protect(const string & file)
     55{
     56    string safefile = file;
     57#ifdef __WIN32__
     58    bool need_to_quote = false;
     59    for (string::iterator i = safefile.begin(); i != safefile.end(); ++i) {
     60        unsigned char ch = *i;
     61        if (!isalnum(ch) && ch < 128) {
     62            if (ch == '/') {
     63                // Convert Unix path separators to backslashes.  C library
     64                // functions understand "/" in paths, but external commands
     65                // generally don't, and also may interpret a leading '/' as
     66                // introducing a command line option.
     67                *i = '\\';
     68            } else if (ch == ' ') {
     69                need_to_quote = true;
     70            } else if (ch < 32 || strchr("<>\"|*?", ch)) {
     71                // Check for invalid characters in the filename.
     72                string m("Invalid character '");
     73                m += ch;
     74                m += "' in filename \"";
     75                m += file;
     76                m += '"';
     77                throw m;
     78            }
     79        }
     80    }
     81    if (safefile[0] == '-') {
     82        // If the filename starts with a '-', protect it from being treated as
     83        // an option by prepending ".\".
     84        safefile.insert(0, ".\\");
     85    }
     86    if (need_to_quote) {
     87        safefile.insert(0, "\"");
     88        safefile += '"';
     89    }
     90#else
     91    string::size_type p = 0;
     92    if (!safefile.empty() && safefile[0] == '-') {
     93        // If the filename starts with a '-', protect it from being treated as
     94        // an option by prepending "./".
     95        safefile.insert(0, "./");
     96        p = 2;
     97    }
     98    while (p < safefile.size()) {
     99        // Don't escape some safe characters which are common in filenames.
     100        unsigned char ch = safefile[p];
     101        if (!isalnum(ch) && strchr("/._-", ch) == NULL) {
     102            safefile.insert(p, "\\");
     103            ++p;
     104        }
     105        ++p;
     106    }
     107#endif
     108    return safefile;
     109}
     110
     111/// Remove a directory and contents.
     112void
     113rm_rf(const string &filename)
     114{
     115    // Check filename exists and is actually a directory
     116    struct stat sb;
     117    if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return;
     118
     119    string safefile = shell_protect(filename);
     120#ifdef __WIN32__
     121# if 1
     122    static int win95 = -1;
     123    if (win95 == -1) {
     124        OSVERSIONINFO info;
     125        memset(&info, 0, sizeof(OSVERSIONINFO));
     126        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
     127        if (GetVersionEx(&info)) {
     128            win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
     129        }
     130    }
     131
     132    if (win95) {
     133        // for 95 like systems:
     134        system("deltree /y \"" + safefile + "\"");
     135    } else {
     136        // for NT like systems:
     137        system("rd /s /q \"" + safefile + "\"");
     138    }
     139# else
     140    safefile.append("\0", 2);
     141    SHFILEOPSTRUCT shfo;
     142    memset((void*)&shfo, 0, sizeof(shfo));
     143    shfo.hwnd = 0;
     144    shfo.wFunc = FO_DELETE;
     145    shfo.pFrom = safefile.data();
     146    shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT;
     147    (void)SHFileOperation(&shfo);
     148# endif
     149#else
     150    system("rm -rf " + safefile);
     151#endif
     152}
    43153#else
    44154#define CONVERT_TO_STRING(FMT) \
    45155    char buf[BUFSIZE];\
  • xapian-applications/omega/utils.h

    diff --git a/xapian-applications/omega/utils.h b/xapian-applications/omega/utils.h
    index a54a4f8..b2241b7 100644
    a b int string_to_int(const std::string & s);  
    3434/** Remove any leading and/or trailing whitespace from @a s. */
    3535void trim(std::string & s);
    3636
     37void rm_rf(const std::string &filename);
     38
    3739#endif