Ticket #282: xapian-omega-trunk-r16879-from-ticket-285-and-cleaned-up-updated-2012-11-13.patch

File xapian-omega-trunk-r16879-from-ticket-285-and-cleaned-up-updated-2012-11-13.patch, 46.8 KB (added by Olly Betts, 11 years ago)

updated patch against trunk

  • xapian-applications/omega/ChangeLog

    diff --git a/xapian-applications/omega/ChangeLog b/xapian-applications/omega/ChangeLog
    index a8e4b7e..33bc585 100644
    a b  
     12006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com>
     2
     3        omega-0.9.6c:
     4        * omindex.cc: Fix wrong timestamp comparison in cache logic
     5        * outlook2text.in: New script
     6
     72006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com>
     8
     9        omega-0.9.6a:
     10        * omindex.cc: Added cached virtual directories (zip,msg,pst,...).
     11        Consistently log stderr to /var/log/omega/omindex-error.log.
     12
    113Tue Sep 25 23:57:12 GMT 2012  Olly Betts <olly@survex.com>
    214
    315        * Makefile.am,omindex.cc: Replace shell_protect() with
  • xapian-applications/omega/Makefile.am

    diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
    index 3599376..9b29a02 100644
    a b pkglibbindir = $(pkglibdir)/bin  
    7878pkglibbin_PROGRAMS = omega
    7979dist_pkglibbin_SCRIPTS = outlookmsg2html
    8080bin_PROGRAMS = omindex scriptindex
     81dist_libexec_SCRIPTS = outlook2text mimeexplode msgconvert.pl
    8182dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
    8283
    8384check_PROGRAMS = atomparsetest htmlparsetest md5test urlenctest utf8converttest
    dist_man_MANS = omindex.1 scriptindex.1  
    173174MAINTAINERCLEANFILES = $(dist_man_MANS)
    174175endif
    175176
     177CLEANFILES = outlook2text
     178
     179outlook2text: $(srcdir)/outlook2text.in Makefile
     180        sed "s,@MSGCONVERT@,$(pkglibbindir)/msgconvert.pl,;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@
     181
    176182if DOCUMENTATION_RULES
    177183omindex.1: omindex$(EXEEXT) makemanpage
    178184        ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1
  • new file xapian-applications/omega/mimeexplode

    diff --git a/xapian-applications/omega/mimeexplode b/xapian-applications/omega/mimeexplode
    new file mode 100644
    index 0000000..70743ab
    - +  
     1#!/usr/bin/perl -w
     2
     3=head1 NAME
     4
     5mimeexplode - explode one or more MIME messages
     6
     7=head1 SYNOPSIS
     8
     9    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...
     10
     11    someprocess | mimeexplode -
     12
     13=head1 DESCRIPTION
     14
     15Takes one or more files from the command line that contain MIME
     16messages, and explodes their contents out into directories.  The
     17first message goes into the directory given with C<-d> (C<msg0> by
     18default); each later message goes into that name with C<1>, C<2>, etc.
     19appended.  Existing directories are reused.
     20
     21The message information is output to the stdout, like this:
     22
     23    Message: msg3 (inputfile1.msg)
     24        Part: msg3/filename-1.dat (text/plain)
     25        Part: msg3/filename-2.dat (text/plain)
     26    Message: msg5 (input-file2.msg)
     27        Part: msg5/dir.gif (image/gif)
     28        Part: msg5/face.jpg (image/jpeg)
     29    Message: msg6 (infile3)
     30        Part: msg6/filename-1.dat (text/plain)
     31
     32This was written as an example of the MIME:: modules in the
     33MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
     34way of splitting a MIME message if you need to decode something, and
     35you don't have a MIME mail reader on hand.
     36
     37=head1 COMMAND LINE OPTIONS
     38
     39-d outdir
     40
     41=head1 AUTHOR
     42
     43Eryq C<eryq@zeegee.com>, in a big hurry...
     44Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir
     45
     46=cut
     47
     48#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!
     49
     50require 5.001;
     51
     52use strict;
     53use vars;
     54
     55use MIME::Parser;
     56use Getopt::Std;
     57my %opts;
     58my $outbase = '';
     59my $postfix = '';
     60
     61#------------------------------------------------------------
     62# make_msg - make and return the name of a msgXXX directory
     63#------------------------------------------------------------
     64
     65#ignored
     66#sub make_msg {
     67#    while (-d "msg$Msgno") {
     68#       ++$Msgno;
     69#       die "self-imposed limit reached" if $Msgno == 256;
     70#    }
     71#   mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!";
     72#    "msg$Msgno";
     73#}
     74
     75#------------------------------------------------------------
     76# dump_entity - dump an entity's file info
     77#------------------------------------------------------------
     78sub dump_entity {
     79    my $ent = shift;
     80    my @parts = $ent->parts;
     81
     82    if (@parts) {        # multipart...
     83        map { dump_entity($_) } @parts;
     84    }
     85    else {               # single part...
     86        print "    Part: ", $ent->bodyhandle->path,
     87              " (", scalar($ent->head->mime_type), ")\n";
     88    }
     89}
     90
     91#------------------------------------------------------------
     92# main
     93#------------------------------------------------------------
     94sub main {
     95    my $file;
     96    my $entity;
     97
     98    # make sure the same message gets exploded into the same dir
     99    getopts('d:', \%opts);
     100    $outbase = $opts{d} ? $opts{d} : "msg0";
     101    my $outdir = $outbase;
     102
     103    # Go through messages:
     104    @ARGV or unshift @ARGV, "-";
     105    while (defined($file = shift @ARGV)) {
     106
     107      # Sanity:
     108      (-d $outdir) or mkdir "$outdir",0755;
     109      (-w "$outdir") or die "cwd $outdir not writable!";
     110      #my $msgdir = make_msg();
     111      #print "Message: $msgdir ($file)\n";
     112
     113      # Create a new parser object:
     114      my $parser = new MIME::Parser;
     115      ### $parser->parse_nested_messages('REPLACE');
     116
     117      # Optional: set up parameters that will affect how it extracts
     118      #   documents from the input stream:
     119      $parser->output_dir($outdir);
     120
     121      # Parse an input stream:
     122      open FILE, $file or die "couldn't open $file";
     123      $entity = $parser->read(\*FILE) or
     124        print STDERR "Couldn't parse MIME in $file; continuing...\n";
     125      close FILE;
     126
     127      # Congratulations: you now have a (possibly multipart) MIME entity!
     128      dump_entity($entity) if $entity;
     129      ### $entity->dump_skeleton if $entity;
     130
     131      $postfix++;
     132      $outdir = $outbase.$postfix;
     133    }
     134    1;
     135}
     136
     137exit (&main ? 0 : -1);
     138#------------------------------------------------------------
     1391;
     140
  • new file xapian-applications/omega/msgconvert.pl

    diff --git a/xapian-applications/omega/msgconvert.pl b/xapian-applications/omega/msgconvert.pl
    new file mode 100644
    index 0000000..cf32079
    - +  
     1#!/usr/bin/perl -w
     2#
     3# msgconvert.pl:
     4#
     5# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
     6#
     7# Copyright 2002, 2004, 2006 Matijs van Zuijlen
     8#
     9# This program is free software; you can redistribute it and/or modify it
     10# under the terms of the GNU General Public License as published by the
     11# Free Software Foundation; either version 2 of the License, or (at your
     12# option) any later version.
     13#
     14# This program is distributed in the hope that it will be useful, but
     15# WITHOUT ANY WARRANTY; without even the implied warranty of
     16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
     17# Public License for more details.
     18#
     19# CHANGES:
     20# 20020715  Recognize new items 'Cc', mime type of attachment, long
     21#           filename of attachment, and full headers. Attachments turn out
     22#           to be numbered, so a regexp is now used to recognize label of
     23#           items that are attachments.
     24# 20020831  long file name will definitely be used if present. Full headers
     25#           and mime type information are used when present. Created
     26#           generic system for specifying known items to be skipped.
     27#           Unexpected contents is never reason to bail out anymore. Added
     28#           support for usage message and option processing (--verbose).
     29# 20040104  Handle address data slightly better, make From line less fake,
     30#           make $verbose and $skippable_entries global vars, handle HTML
     31#           variant of body text if present (though not optimally).
     32# 20040214  Fix typos and incorrect comments.
     33# 20040307  - Complete rewrite: All functional parts are now in the package
     34#             MSGParser;
     35#           - Creation of MIME::Entity object is delayed until the output
     36#             routines, which means all data is known; This means I can
     37#             create a multipart/alternative body.
     38#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
     39#             the information).
     40# 20040514  Check if $self->{HEAD} actually exists before trying to add its
     41#           contents to the output Mime object's header data.
     42#           (Bug reported by Thomas Ng).
     43#           Don't produce multipart messages if not needed.
     44#           (Bug reported by Justin B. Scout).
     45# 20040529  Correctly format OLEDATE.
     46# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
     47#           - Use address data to make To: and Cc: lines complete
     48#           - Use the in-reply-to property
     49#           - More unknown properties named.
     50#           - Found another property containing an SMTP address.
     51#           - Put non-SMTP type addresses back in output.
     52# 20040825  Replace 'our' to declare globals with 'use vars'. This means
     53#           the globals are now properly scoped inside the package and not
     54#           the file.
     55#           This also fixes the bug that this program did not work on perl
     56#           versions below 5.6. (Bug reported by Tim Gustafson)
     57# 20060218  More sensible encoding warnings.
     58# 20060219  Move OLE parsing to main program.
     59#           Parse nested MSG files (Bug reported by Christof Lukas).
     60# 20060225  Simplify code.
     61#
     62
     63#
     64# Import modules.
     65#
     66package MSGParser;
     67use strict;
     68use OLE::Storage_Lite;
     69use MIME::Entity;
     70use MIME::Parser;
     71use Date::Format;
     72use POSIX qw(mktime);
     73use constant DIR_TYPE => 1;
     74use constant FILE_TYPE => 2;
     75
     76use vars qw($skipproperties $skipheaders);
     77#
     78# Descriptions partially based on mapitags.h
     79#
     80$skipproperties = {
     81  # Envelope properties
     82  '000B' => "Conversation key?",
     83  '001A' => "Type of message",
     84  '003B' => "Sender address variant",
     85  '003D' => "Contains 'Re: '",
     86  '003F' => "'recieved by' id",
     87  '0040' => "'recieved by' name",
     88  '0041' => "Sender variant address id",
     89  '0042' => "Sender variant name",
     90  '0043' => "'recieved representing' id",
     91  '0044' => "'recieved representing' name",
     92  '0046' => "Read receipt address id",
     93  '0051' => "'recieved by' search key",
     94  '0052' => "'recieved representing' search key",
     95  '0053' => "Read receipt search key",
     96  '0064' => "Sender variant address type",
     97  '0065' => "Sender variant address",
     98  '0070' => "Conversation topic",
     99  '0071' => "Conversation index",
     100  '0075' => "'recieved by' address type",
     101  '0076' => "'recieved by' email address",
     102  '0077' => "'recieved representing' address type",
     103  '0078' => "'recieved representing' email address",
     104  '007F' => "something like a message id",
     105  # Recipient properties
     106  '0C19' => "Reply address variant",
     107  '0C1D' => "Reply address variant",
     108  '0C1E' => "Reply address type",
     109  # Non-transmittable properties
     110  '0E02' => "?Should BCC be displayed",
     111  '0E0A' => "sent mail id",
     112  '0E1D' => "Subject w/o Re",
     113  '0E27' => "64 bytes: Unknown",
     114  '0FF6' => "Index",
     115  '0FF9' => "Index",
     116  '0FFF' => "Address variant",
     117  # Content properties
     118  '1008' => "Summary or something",
     119  '1009' => "RTF Compressed",
     120  # 'Common property'
     121  '3001' => "Display name",
     122  '3002' => "Address Type",
     123  '300B' => "'Search key'",
     124  # Attachment properties
     125  '3702' => "Attachment encoding",
     126  '3703' => "Attachment extension",
     127  '3709' => "'Attachment rendering'", # Maybe an icon or something?
     128  '3713' => "Icon URL?",
     129  # 'Mail user'
     130  '3A20' => "Address variant",
     131  # 3900 -- 39FF: 'Address book'
     132  '39FF' => "7 bit display name",
     133  # 'Display table properties'
     134  '3FF8' => "Routing data?",
     135  '3FF9' => "Routing data?",
     136  '3FFA' => "Routing data?",
     137  '3FFB' => "Routing data?",
     138  # 'Transport-defined envelope property'
     139  '4029' => "Sender variant address type",
     140  '402A' => "Sender variant address",
     141  '402B' => "Sender variant name",
     142  '5FF6' => "Recipient name",
     143  '5FF7' => "Recipient address variant",
     144  # 'Provider-defined internal non-transmittable property'
     145  '6740' => "Unknown, binary data",
     146  # User defined id's
     147  '8000' => "Content Class",
     148  '8002' => "Unknown, binary data",
     149};
     150
     151$skipheaders = {
     152  "MIME-Version" => 1,
     153  "Content-Type" => 1,
     154  "Content-Transfer-Encoding" => 1,
     155  "X-Mailer" => 1,
     156  "X-Msgconvert" => 1,
     157  "X-MS-Tnef-Correlator" => 1,
     158  "X-MS-Has-Attach" => 1,
     159};
     160
     161use constant ENCODING_UNICODE => '001F';
     162use constant KNOWN_ENCODINGS => {
     163    '000D' => 'Directory',
     164    '001F' => 'Unicode',
     165    '001E' => 'Ascii?',
     166    '0102' => 'Binary',
     167};
     168
     169use constant MAP_ATTACHMENT_FILE => {
     170  '3701' => ["DATA",        0], # Data
     171  '3704' => ["SHORTNAME",   1], # Short file name
     172  '3707' => ["LONGNAME",    1], # Long file name
     173  '370E' => ["MIMETYPE",    1], # mime type
     174  '3716' => ["DISPOSITION", 1], # disposition
     175};
     176
     177use constant MAP_SUBITEM_FILE => {
     178  '1000' => ["BODY_PLAIN",      0], # Body
     179  '1013' => ["BODY_HTML",       0], # HTML Version of body
     180  '0037' => ["SUBJECT",         1], # Subject
     181  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
     182  '007D' => ["HEAD",            1], # Full headers
     183  '0C1A' => ["FROM",            1], # Reply-To: Name
     184  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
     185  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
     186  '0E04' => ["TO",              1], # To: Names
     187  '0E03' => ["CC",              1], # Cc: Names
     188  '1035' => ["MESSAGEID",       1], # Message-Id
     189  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
     190};
     191
     192use constant MAP_ADDRESSITEM_FILE => {
     193  '3001' => ["NAME",            1], # Real name
     194  '3002' => ["TYPE",            1], # Address type
     195  '403D' => ["TYPE",            1], # Address type
     196  '3003' => ["ADDRESS",         1], # Address
     197  '403E' => ["ADDRESS",         1], # Address
     198  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
     199};
     200
     201#
     202# Main body of module
     203#
     204
     205sub new {
     206  my $that = shift;
     207  my $class = ref $that || $that;
     208
     209  my $self = {
     210    ATTACHMENTS => [],
     211    ADDRESSES => [],
     212    VERBOSE => 0,
     213    HAS_UNICODE => 0,
     214    FROM_ADDR_TYPE => "",
     215  };
     216  bless $self, $class;
     217}
     218
     219#
     220# Main sub: parse the PPS tree, and return
     221#
     222sub parse {
     223  my $self = shift;
     224  my $PPS = shift or die "Internal error: No PPS tree";
     225  $self->_RootDir($PPS);
     226}
     227
     228sub mime_object {
     229  my $self = shift;
     230
     231  my $bodymime;
     232  my $mime;
     233
     234  if ($self->_IsMultiPart) {
     235    # Construct a multipart message object
     236
     237    $mime = MIME::Entity->build(Type => "multipart/mixed");
     238
     239    # Set the entity that we'll save the body parts to. If there's more than
     240    # one part, it's a new entity, otherwise, it's the main $mime object.
     241    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
     242      $bodymime = MIME::Entity->build(
     243        Type => "multipart/alternative",
     244        Encoding => "8bit",
     245      );
     246      $mime->add_part($bodymime);
     247    } else {
     248      $bodymime = $mime;
     249    }
     250    if ($self->{BODY_PLAIN}) {
     251      $self->_SaveAttachment($bodymime, {
     252        MIMETYPE => 'text/plain; charset=ISO-8859-1',
     253        ENCODING => '8bit',
     254        DATA => $self->{BODY_PLAIN},
     255        DISPOSITION => 'inline',
     256      });
     257    }
     258    if ($self->{BODY_HTML}) {
     259      $self->_SaveAttachment($bodymime, {
     260        MIMETYPE => 'text/html',
     261        ENCODING => '8bit',
     262        DATA => $self->{BODY_HTML},
     263        DISPOSITION => 'inline',
     264      });
     265    }
     266    foreach my $att (@{$self->{ATTACHMENTS}}) {
     267      $self->_SaveAttachment($mime, $att);
     268    }
     269  } elsif ($self->{BODY_PLAIN}) {
     270    # Construct a single part message object with a plain text body
     271    $mime = MIME::Entity->build(
     272      Type => "text/plain",
     273      Data => $self->{BODY_PLAIN}
     274    );
     275  } elsif ($self->{BODY_HTML}) {
     276    # Construct a single part message object with an HTML body
     277    $mime = MIME::Entity->build(
     278      Type => "text/html",
     279      Data => $self->{BODY_HTML}
     280    );
     281  }
     282
     283  $self->_CopyHeaderData($mime);
     284
     285  $self->_SetHeaderFields($mime);
     286
     287  return $mime;
     288}
     289
     290# Actually output the message in mbox format
     291sub print {
     292  my $self = shift;
     293
     294  my $mime = $self->mime_object;
     295
     296  # Construct From line from whatever we know.
     297  my $string = "";
     298  $string = (
     299    $self->{FROM_ADDR_TYPE} eq "SMTP" ?
     300    $self->{FROM_ADDR} :
     301    'someone@somewhere'
     302  );
     303  $string =~ s/\n//g;
     304
     305  # The date used here is not really important.
     306  print "From ", $string, " ", scalar localtime, "\n";
     307  $mime->print(\*STDOUT);
     308  print "\n";
     309}
     310
     311sub set_verbosity {
     312  my ($self, $verbosity) = @_;
     313  defined $verbosity or die "Internal error: no verbosity level";
     314  $self->{VERBOSE} = $verbosity;
     315}
     316
     317#
     318# Below are functions that walk the PPS tree. The *Dir functions handle
     319# processing the directory nodes of the tree (mainly, iterating over the
     320# children), whereas the *Item functions handle processing the items in the
     321# directory (if such an item is itself a directory, it will in turn be
     322# processed by the relevant *Dir function).
     323#
     324
     325#
     326# _RootDir: Parse the sub-entries of the Root Entry.
     327# The OLE file consists of a single entry called Root Entry, which has
     328# several children. These children are parsed in the sub _SubItem.
     329#
     330sub _RootDir {
     331  my ($self, $PPS) = @_;
     332
     333  foreach my $child (@{$PPS->{Child}}) {
     334    $self->_SubItem($child);
     335  }
     336}
     337
     338sub _SubItem {
     339  my ($self, $PPS) = @_;
     340 
     341  if ($PPS->{Type} == DIR_TYPE) {
     342    $self->_SubItemDir($PPS);
     343  } elsif ($PPS->{Type} == FILE_TYPE) {
     344    $self->_SubItemFile($PPS);
     345  } else {
     346    warn "Unknown entry type: $PPS->{Type}";
     347  }
     348}
     349
     350sub _SubItemDir {
     351  my ($self, $PPS) = @_;
     352
     353  $self->_GetOLEDate($PPS);
     354
     355  my $name = $self->_GetName($PPS);
     356
     357  if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
     358    $self->_AddressDir($PPS);
     359  } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
     360    $self->_AttachmentDir($PPS);
     361  } else {
     362    $self->_UnknownDir($self->_GetName($PPS));
     363  }
     364}
     365
     366sub _SubItemFile {
     367  my ($self, $PPS) = @_;
     368
     369  my $name = $self->_GetName($PPS);
     370  my ($property, $encoding) = $self->_ParseItemName($name);
     371
     372  $self->_MapProperty($self, $PPS->{Data}, $property,
     373    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
     374}
     375
     376sub _AddressDir {
     377  my ($self, $PPS) = @_;
     378
     379  my $address = {
     380    NAME        => undef,
     381    ADDRESS     => undef,
     382    TYPE        => "",
     383  };
     384  foreach my $child (@{$PPS->{Child}}) {
     385    $self->_AddressItem($child, $address);
     386  }
     387  push @{$self->{ADDRESSES}}, $address;
     388}
     389
     390sub _AddressItem {
     391  my ($self, $PPS, $addr_info) = @_;
     392
     393  my $name = $self->_GetName($PPS);
     394
     395  # DIR Entries: There should be none.
     396  if ($PPS->{Type} == DIR_TYPE) {
     397    $self->_UnknownDir($name);
     398  } elsif ($PPS->{Type} == FILE_TYPE) {
     399    my ($property, $encoding) = $self->_ParseItemName($name);
     400    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
     401      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
     402  } else {
     403    warn "Unknown entry type: $PPS->{Type}";
     404  }
     405}
     406
     407sub _AttachmentDir {
     408  my ($self, $PPS) = @_;
     409
     410  my $attachment = {
     411    SHORTNAME   => undef,
     412    LONGNAME    => undef,
     413    MIMETYPE    => 'application/octet-stream',
     414    ENCODING    => 'base64',
     415    DISPOSITION => 'attachment',
     416    DATA        => undef
     417  };
     418  foreach my $child (@{$PPS->{Child}}) {
     419    $self->_AttachmentItem($child, $attachment);
     420  }
     421  push @{$self->{ATTACHMENTS}}, $attachment;
     422}
     423
     424sub _AttachmentItem {
     425  my ($self, $PPS, $att_info) = @_;
     426
     427  my $name = $self->_GetName($PPS);
     428
     429  my ($property, $encoding) = $self->_ParseItemName($name);
     430
     431  if ($PPS->{Type} == DIR_TYPE) {
     432
     433    if ($property eq '3701') {  # Nested MSG file
     434      my $msgp = new MSGParser();
     435      $msgp->parse($PPS);
     436      my $data = $msgp->mime_object->as_string;
     437      $att_info->{DATA} = $data;
     438      $att_info->{MIMETYPE} = 'message/rfc822';
     439      $att_info->{ENCODING} = '8bit';
     440    } else {
     441      $self->_UnknownDir($name);
     442    }
     443
     444  } elsif ($PPS->{Type} == FILE_TYPE) {
     445    $self->_MapProperty($att_info, $PPS->{Data}, $property,
     446      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
     447  } else {
     448    warn "Unknown entry type: $PPS->{Type}";
     449  }
     450}
     451
     452sub _MapProperty {
     453  my ($self, $hash, $data, $property, $map) = @_;
     454
     455  defined $property or return 0;
     456  my $arr = $map->{$property} or return 0;
     457
     458  $arr->[1] and $data =~ s/\000//g;
     459  $hash->{$arr->[0]} = $data;
     460
     461  return 1;
     462}
     463
     464sub _UnknownDir {
     465  my ($self, $name) = @_;
     466
     467  if ($name eq '__nameid_version1 0') {
     468    $self->{VERBOSE}
     469      and warn "Skipping DIR entry $name (Introductory stuff)\n";
     470    return;
     471  }
     472  warn "Unknown DIR entry $name\n";
     473}
     474
     475sub _UnknownFile {
     476  my ($self, $name) = @_;
     477
     478  if ($name eq '__properties_version1 0') {
     479    $self->{VERBOSE}
     480      and warn "Skipping FILE entry $name (Properties)\n";
     481    return;
     482  }
     483
     484  my ($property, $encoding) = $self->_ParseItemName($name);
     485  unless (defined $property) {
     486    warn "Unknown FILE entry $name\n";
     487    return;
     488  }
     489  if ($skipproperties->{$property}) {
     490    $self->{VERBOSE}
     491      and warn "Skipping property $property ($skipproperties->{$property})\n";
     492    return;
     493  } elsif ($property =~ /^80/) {
     494    $self->{VERBOSE}
     495      and warn "Skipping property $property (user-defined property)\n";
     496    return;
     497  } else {
     498    warn "Unknown property $property\n";
     499    return;
     500  }
     501}
     502
     503#
     504# Helper functions
     505#
     506
     507sub _GetName {
     508  my ($self, $PPS) = @_;
     509  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
     510}
     511
     512sub _NormalizeWhiteSpace {
     513  my ($self, $name) = @_;
     514  $name =~ s/\W/ /g;
     515  return $name;
     516}
     517
     518sub _GetOLEDate {
     519  my ($self, $PPS) = @_;
     520  unless (defined ($self->{OLEDATE})) {
     521    # Make Date
     522    my $datearr;
     523    $datearr = $PPS->{Time2nd};
     524    $datearr = $PPS->{Time1st} unless($datearr);
     525    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
     526  }
     527}
     528
     529sub _FormatDate {
     530  my ($self, $datearr) = @_;
     531
     532  # TODO: This is a little convoluted. Directly using strftime didn't seem
     533  # to work.
     534  my $datetime = mktime(@$datearr);
     535  return time2str("%a, %d %h %Y %X %z", $datetime);
     536}
     537
     538# If we didn't get the date from the original header data, we may be able
     539# to get it from the SUBMISSION_ID:
     540# It seems to have the format of a semicolon-separated list of key=value
     541# pairs. The key l has a value with the format:
     542# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
     543# the format YYMMDDHHMMSS.
     544sub _SubmissionIdDate {
     545  my $self = shift;
     546
     547  my $submission_id = $self->{SUBMISSION_ID} or return undef;
     548  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
     549    or return undef;
     550  my $year = $1;
     551  $year += 100 if $year < 20;
     552  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
     553}
     554
     555sub _ParseItemName {
     556  my ($self, $name) = @_;
     557
     558  if ($name =~ /^__substg1 0_(....)(....)$/) {
     559    my ($property, $encoding) = ($1, $2);
     560    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
     561      warn "This MSG file contains Unicode fields."
     562        . " This is currently unsupported.\n";
     563      $self->{HAS_UNICODE} = 1;
     564    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
     565      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
     566    }
     567    return ($property, $encoding);
     568  } else {
     569    return (undef, undef);
     570  }
     571}
     572
     573sub _SaveAttachment {
     574  my ($self, $mime, $att) = @_;
     575
     576  my $ent = $mime->attach(
     577    Type => $att->{MIMETYPE},
     578    Encoding => $att->{ENCODING},
     579    Data => [],
     580    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
     581    Disposition => $att->{DISPOSITION}
     582  );
     583
     584  my $handle;
     585  if ($handle = $ent->open("w")) {
     586    $handle->print($att->{DATA});
     587    $handle->close;
     588  } else {
     589    warn "Could not write data!";
     590  }
     591}
     592
     593sub _SetAddressPart {
     594  my ($self, $adrname, $partname, $data) = @_;
     595
     596  my $address = $self->{ADDRESSES}->{$adrname};
     597  $data =~ s/\000//g;
     598  #warn "Processing address data part $partname : $data\n";
     599  if (defined ($address->{$partname})) {
     600    if ($address->{$partname} eq $data) {
     601      warn "Skipping duplicate but identical address information for"
     602      . " $partname\n" if $self->{VERBOSE};
     603    } else {
     604      warn "Address information $partname inconsistent:\n";
     605      warn "    Original data: $address->{$partname}\n";
     606      warn "    New data: $data\n";
     607    }
     608  } else {
     609    $address->{$partname} = $data;
     610  }
     611}
     612
     613# Add a header field if it is not already set and a value is given
     614sub _AddHeaderField {
     615  my ($self, $mime, $fieldname, $value) = @_;
     616
     617  my $oldvalue = $mime->head->get($fieldname);
     618  return if $oldvalue;
     619  $mime->head->add($fieldname, $value) if $value;
     620}
     621
     622sub _Address {
     623  my ($self, $tag) = @_;
     624  my $name = $self->{$tag} || "";
     625  my $address = $self->{$tag . "_ADDR"} || "";
     626  return "$name <$address>";
     627}
     628
     629# Find SMTP addresses for the given list of names
     630sub _ExpandAddressList {
     631  my ($self, $names) = @_;
     632
     633  my $addresspool = $self->{ADDRESSES};
     634  my @namelist = split /; */, $names;
     635  my @result;
     636  name: foreach my $name (@namelist) {
     637    foreach my $address (@$addresspool) {
     638      if ($name eq $address->{NAME}) {
     639        my $addresstext = $address->{NAME} . " <";
     640        if (defined ($address->{SMTPADDRESS})) {
     641          $addresstext .= $address->{SMTPADDRESS};
     642        } elsif ($address->{TYPE} eq "SMTP") {
     643          $addresstext .= $address->{ADDRESS};
     644        }
     645        $addresstext .= ">";
     646        push @result, $addresstext;
     647        next name;
     648      }
     649    }
     650    push @result, $name;
     651  }
     652  return join ", ", @result;
     653}
     654
     655sub _ParseHead {
     656  my ($self, $data) = @_;
     657  defined $data or return undef;
     658  # Parse full header data if we got that.
     659  my $parser = new MIME::Parser();
     660  $parser->output_to_core(1);
     661  $parser->decode_headers(1);
     662  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
     663  my $entity = $parser->parse_data($data)
     664    or warn "Couldn't parse full headers!";
     665  my $head = $entity->head;
     666  $head->unfold;
     667  return $head;
     668}
     669
     670# Find out if we need to construct a multipart message
     671sub _IsMultiPart {
     672  my $self = shift;
     673
     674  return (
     675    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
     676      or @{$self->{ATTACHMENTS}}>0
     677  );
     678}
     679
     680# Copy original header data.
     681# Note: This should contain the Date: header.
     682sub _CopyHeaderData {
     683  my ($self, $mime) = @_;
     684
     685  my $head = $self->_ParseHead($self->{HEAD}) or return;
     686
     687  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
     688    foreach my $value ($head->get_all($tag)) {
     689      $mime->head->add($tag, $value);
     690    }
     691  }
     692}
     693
     694# Set header fields
     695sub _SetHeaderFields {
     696  my ($self, $mime) = @_;
     697
     698  # If we didn't get the date from the original header data, we may be able
     699  # to get it from the SUBMISSION_ID:
     700  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());
     701
     702  # Third and last chance to set the Date: header; this uses the date the
     703  # MSG file was saved.
     704  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
     705  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
     706  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
     707  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
     708  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
     709  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
     710  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
     711  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
     712}
     713
     714package main;
     715use Getopt::Long;
     716use Pod::Usage;
     717
     718# Setup command line processing.
     719my $verbose = '';
     720my $help = '';      # Print help message and exit.
     721GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
     722pod2usage(1) if $help;
     723
     724# Get file name
     725my $file = $ARGV[0];
     726defined $file or pod2usage(2);
     727warn "Will parse file: $file\n" if $verbose;
     728
     729# Load and parse MSG file (is OLE)
     730my $Msg = OLE::Storage_Lite->new($file);
     731my $PPS = $Msg->getPpsTree(1);
     732$PPS or die "$file must be an OLE file";
     733
     734# parse PPS tree
     735my $parser = new MSGParser();
     736$parser->set_verbosity(1) if $verbose;
     737$parser->parse($PPS);
     738$parser->print();
     739
     740#
     741# Usage info follows.
     742#
     743__END__
     744
     745=head1 NAME
     746
     747msgconvert.pl - Convert Outlook .msg files to mbox format
     748
     749=head1 SYNOPSIS
     750
     751msgconvert.pl [options] <file.msg>
     752
     753  Options:
     754    --verbose   be verbose
     755    --help      help message
     756
     757=head1 OPTIONS
     758
     759=over 8
     760
     761=item B<--verbose>
     762
     763    Print information about skipped parts of the .msg file.
     764
     =item B<--help>

         Print a brief help message.

     =back

     =head1 DESCRIPTION
     770
     771This program will output the message contained in file.msg in mbox format
     772on stdout. It will complain about unrecognized OLE parts on
     773stderr.
     774
     775=head1 BUGS
     776
     777Not all data that's in the .MSG file is converted. There simply are some
     778parts whose meaning escapes me. One of these must contain the date the
     779message was sent, for example. Formatting of text messages will also be
     780lost. YMMV.
     781
     782=cut
  • xapian-applications/omega/omindex.cc

    diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
    index 1ea8e77..bac6e54 100644
    a b  
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012 Olly Betts
    77 * Copyright 2009 Frank J Bruzzaniti
     8 * Copyright 2006,2007,2008 AVL List GesmbH
    89 * Copyright 2012 Mihai Bivol
    910 *
    1011 * This program is free software; you can redistribute it and/or
     
    7071
    7172#include "gnu_getopt.h"
    7273
     74#ifndef LIBEXECDIR
     75// must have ending slash
     76//# define LIBEXECDIR "/usr/lib/omega/bin/"
     77# define LIBEXECDIR ""
     78#endif
     79#ifndef PKGDATADIR
     80// must have ending slash
     81# define PKGDATADIR "/usr/share/omega/"
     82#endif
     83
    7384using namespace std;
    7485
    7586#define TITLE_SIZE 128
    static bool ignore_exclusions = false;  
    8495static bool spelling = false;
    8596static off_t  max_size = 0;
    8697static bool verbose = false;
     98string error_log; /* used in runfilter.cc */
     99static string baseurl;
     100static string dbpath;
     101static string cache_dir;
    87102static enum {
    88103    EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP
    89104} empty_body = EMPTY_BODY_WARN;
    static time_t last_mod_max;  
    104119// text are common, so we handle these with a std::map.
    105120static map<string, string> commands;
    106121
     122static void
     123index_directory(const string &path, const string &url_, size_t depth_limit,
     124                map<string, string>& mime_map, size_t sample_size);
     125
    107126inline static bool
    108127p_notalnum(unsigned int c)
    109128{
    skip_unknown_mimetype(const string & file, const string & mimetype)  
    258277
    259278void
    260279index_mimetype(const string & file, const string & url, const string & ext,
    261                const string &mimetype, DirectoryIterator &d, size_t sample_size);
     280               const string &mimetype, DirectoryIterator &d, size_t sample_size,
     281               map<string, string>& mime_map, size_t depth_limit);
     282
     283static
     284void mkdir_p(const string &path, mode_t mode) {
     285    (void)mode; // FIXME
     286#ifdef __WIN32__
     287    string cmd = "mkdir ";
     288    append_filename_argument(cmd, path);
     289    system(cmd.c_str());
     290#else
     291    string cmd = "mkdir -p ";
     292    append_filename_argument(cmd, path);
     293    if (system(cmd.c_str()) < 0) { /* FIXME */ }
     294#endif
     295}
     296
     297/*
     298 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there
     299 */
     300static void
     301index_cached_directory(size_t depth_limit,
     302                       const string &file,
     303                       const string &url,
     304                       const string &ext,
     305                       const string &cmd,
     306                       map<string, string>& mime_map,
     307                       size_t sample_size)
     308{
     309    string oldroot = root;
     310    root = cache_dir;
     311    string cache = root+"/."+ext;
     312    string cachedir = cache+url;
     313    struct stat statfile, statcache;
     314    bool extract_cache;
     315#ifdef HAVE_LSTAT
     316    lstat(file.c_str(), &statfile);
     317    lstat(cachedir.c_str(), &statcache);
     318#else
     319    stat(file.c_str(), &statfile);
     320    stat(cachedir.c_str(), &statcache);
     321#endif
     322    extract_cache = true;
     323    // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago,
     324    // then it was already extracted.
     325    if (S_ISDIR(statcache.st_mode) &&
     326        S_ISREG(statfile.st_mode) &&
     327        (statfile.st_mtime < statcache.st_mtime) &&
     328        (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call
     329    {
     330        // but is it in the database also? prevent from deleting skipped files
     331        if (verbose)
     332            cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction "
     333                 // << statfile.st_mtime << " < " << statcache.st_mtime
     334                 << endl;
     335        extract_cache = false;
     336    }
     337    if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) {
     338        // If last_mod > last_mod_max, we know for sure that the file is new
     339        // or updated.
     340        if (statfile.st_mtime <= last_mod_max) {
     341            // check database timestamp for cached container, esp. for cleaned up caches.
     342            // if already in db we need not to extract again
     343            string urlterm("U");
     344            urlterm += baseurl;
     345            urlterm += "/."+ext+url;
     346            if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
     347                urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
     348
     349            Xapian::PostingIterator p = db.postlist_begin(urlterm);
     350            if (p != db.postlist_end(urlterm)) {
     351                Xapian::docid docid = *p;
     352                Xapian::Document doc = db.get_document(docid);
     353                string value = doc.get_value(VALUE_LASTMOD);
     354                time_t old_last_mod = binary_string_to_int(value);
     355                if (statfile.st_mtime <= old_last_mod) {
     356                    if (verbose)
     357                        cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl;
     358                    // The docid should be in updated - the only valid
     359                    // exception is if the URL was long and hashed to the
     360                    // same URL as an existing document indexed in the same
     361                    // batch.
     362                    if (usual(docid < updated.size() && !updated[docid])) {
     363                        updated[docid] = true;
     364                        --old_docs_not_seen;
     365                    }
     366                    root = oldroot;
     367                    return;
     368                }
     369            }
     370        }
     371    }
     372
     373    if (extract_cache) {
     374        if (verbose)
     375            cout << "[EXTRACT into cache " << cachedir << "]" << endl;
     376        if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode))
     377            cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" "
     378                 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL)
     379                 << endl;
     380        if (!S_ISDIR(statcache.st_mode))
     381            mkdir_p(cachedir, 0755);
     382        stdout_to_string(cmd);
     383#ifndef __WIN32__
     384        string chmod_cmd = "chmod -R u+rwx ";
     385        append_filename_argument(chmod_cmd, cachedir);
     386        stdout_to_string(chmod_cmd);
     387#endif
     388#ifdef HAVE_LSTAT
     389        lstat(cachedir.c_str(), &statcache);
     390#else
     391        stat(cachedir.c_str(), &statcache);
     392#endif
     393    }
     394
     395    if (S_ISDIR(statcache.st_mode)) {
     396        if (depth_limit == 1) {
     397            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     398        } else {
     399            // max loop 5, magic start: /.ext+file
     400            index_directory(cachedir + "/", url, depth_limit + 5, mime_map, sample_size);
     401            if (verbose)
     402                cout << "[CLEANUP '" << cachedir << "']" << endl;
     403            rm_rf(cachedir);
     404        }
     405    } else { // no -p would be fatal here
     406        cout << "cachedir '" << cachedir << "' does not exist - skipped" << endl;
     407    }
     408    root = oldroot;
     409}
    262410
    263411static void
    264412index_file(const string &file, const string &url, DirectoryIterator & d,
    265            map<string, string>& mime_map, size_t sample_size)
     413           map<string, string>& mime_map, size_t sample_size,
     414           size_t depth_limit)
    266415{
    267416    string ext;
    268417    const char * dot_ptr = strrchr(d.leafname(), '.');
    index_file(const string &file, const string &url, DirectoryIterator & d,  
    290439
    291440    string mimetype;
    292441    if (mt == mime_map.end()) {
     442        if (strcasecmp(d.leafname(), "mbox") == 0) {
     443            // Special filename.
     444            mimetype = "message/rfc822";
     445            goto got_mimetype;
     446        }
     447
    293448        mimetype = d.get_magic_mimetype();
    294449        if (mimetype.empty()) {
    295450            skip(file, "Unknown extension and unrecognised format",
    index_file(const string &file, const string &url, DirectoryIterator & d,  
    302457        mimetype = mt->second;
    303458    }
    304459
     460got_mimetype:
     461
    305462    if (verbose)
    306463        cout << "Indexing \"" << file.substr(root.size()) << "\" as "
    307464             << mimetype << " ... ";
    index_file(const string &file, const string &url, DirectoryIterator & d,  
    319476        return;
    320477    }
    321478
    322     index_mimetype(file, url, ext, mimetype, d, sample_size);
     479    index_mimetype(file, url, ext, mimetype, d, sample_size, mime_map, depth_limit);
    323480}
    324481
    325482void
    326483index_mimetype(const string & file, const string & url, const string & ext,
    327                const string &mimetype, DirectoryIterator &d, size_t sample_size)
     484               const string &mimetype, DirectoryIterator &d, size_t sample_size,
     485               map<string, string>& mime_map, size_t depth_limit)
    328486{
    329487    string urlterm("U");
    330488    urlterm += url;
    index_mimetype(const string & file, const string & url, const string & ext,  
    373531            }
    374532        }
    375533    }
     534    // add the db basename to cache_dir
     535    {
     536        cache_dir = get_tmpdir();
     537        const char *p = strrchr(dbpath.c_str(), '/');
     538        // on windows only
     539        if (!p) p = strrchr(dbpath.c_str(), '\\');
     540        if (p) { p++; } else { p = dbpath.c_str(); }
     541        cache_dir += p;
     542    }
    376543
    377544    if (verbose) cout << flush;
    378545
    index_mimetype(const string & file, const string & url, const string & ext,  
    437604            } else {
    438605                // FIXME: What charset is the file?  Look at contents?
    439606            }
     607#if 0 // FIXME: this won't work as omindex will have the database locked...
     608    } else if (mimetype == "message/rfc822") { // // => mbox2script
     609        //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla)
     610        string cmd = LIBEXECDIR"mbox2omega";
     611        append_filename_argument(cmd, file);
     612        cmd += error_log+"| scriptindex";
     613        append_filename_argument(cmd, dbpath);
     614        cmd += " "PKGDATADIR"mbox2script.script";
     615        try {
     616            dump = stdout_to_string(cmd);
     617        } catch (ReadError) {
     618            cout << "\"" << cmd << "\" failed - skipping" << endl;
     619            return;
     620        }
     621#endif
    440622        } else if (mimetype == "application/pdf") {
    441623            string cmd = "pdftotext -enc UTF-8";
    442624            append_filename_argument(cmd, file);
    index_mimetype(const string & file, const string & url, const string & ext,  
    702884
    703885            generate_sample_from_csv(dump, sample, sample_size);
    704886        } else if (mimetype == "application/vnd.ms-outlook") {
    705             string cmd = get_pkglibbindir() + "/outlookmsg2html";
     887            string oldroot = root;
     888            struct stat statcache;
     889            char olddir[256];
     890
     891            if (depth_limit == 1) {
     892                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     893                return;
     894            }
     895            string cmd = LIBEXECDIR"outlook2text";
    706896            append_filename_argument(cmd, file);
    707             MyHtmlParser p;
    708             p.ignore_metarobots();
     897            // unpack multiparts and attachments. so we have to chdir first
     898            string fulldir = cache_dir+"/."+ext+url;
     899            if (getcwd(olddir, 256) == NULL) { /* FIXME */ }
     900#ifdef HAVE_LSTAT
     901            lstat(fulldir.c_str(), &statcache);
     902#else
     903            stat(fulldir.c_str(), &statcache);
     904#endif
     905            if (!S_ISDIR(statcache.st_mode)) {
     906                mkdir_p(fulldir, 0755);
     907            }
    709908            try {
    710                 dump = stdout_to_string(cmd);
    711                 // FIXME: what should the default charset be?
    712                 p.parse_html(dump, "iso-8859-1", false);
    713             } catch (const string & newcharset) {
    714                 p.reset();
    715                 p.ignore_metarobots();
    716                 p.parse_html(dump, newcharset, true);
     909                if (chdir(fulldir.c_str()) < 0) { /* FIXME */ }
     910                size_t new_limit = depth_limit;
     911                if (new_limit) --new_limit;
     912                index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size);
     913                if (chdir(olddir) < 0) { /* FIXME */ }
    717914            } catch (ReadError) {
    718                 skip_cmd_failed(file, cmd);
    719                 return;
     915                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     916                if (chdir(olddir) < 0) { /* FIXME */ }
     917                root = oldroot;
     918            } catch (...) {
     919                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     920                if (chdir(olddir) < 0) { /* FIXME */ }
     921                root = oldroot;
     922                throw;
    720923            }
    721             dump = p.dump;
    722             title = p.title;
    723             keywords = p.keywords;
    724             sample = p.sample;
    725             author = p.author;
     924            return;
    726925        } else if (mimetype == "image/svg+xml") {
    727926            SvgParser svgparser;
    728927            svgparser.parse_html(d.file_to_string());
    index_mimetype(const string & file, const string & url, const string & ext,  
    9371136                    cout << "added" << endl;
    9381137                }
    9391138            }
     1139        } else if (mimetype == "application/x-zip") {
     1140            string oldroot = root;
     1141            if (depth_limit == 1) {
     1142                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     1143                return;
     1144            }
     1145            // overwrite
     1146            string cmd = "unzip -u -P. -o";
     1147            append_filename_argument(cmd, file);
     1148            cmd += " -d";
     1149            append_filename_argument(cmd, cache_dir+"/."+ext+url+"/");
     1150            try {
     1151                size_t new_limit = depth_limit;
     1152                if (new_limit) --new_limit;
     1153                index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size);
     1154            } catch (ReadError) {
     1155                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     1156                root = oldroot;
     1157            } catch (...) {
     1158                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1159                root = oldroot;
     1160                throw;
     1161            }
     1162            return;
     1163        } else if (mimetype == "application/x-rar") {
     1164            string oldroot = root;
     1165            if (depth_limit == 1) {
     1166                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     1167                return;
     1168            }
     1169
     1170            // overwrite
     1171            string cmd = "unrar x -o+";
     1172            append_filename_argument(cmd, file);
     1173            append_filename_argument(cmd, cache_dir+"/."+ext+url+"/");
     1174            try {
     1175                size_t new_limit = depth_limit;
     1176                if (new_limit) --new_limit;
     1177                index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size);
     1178            } catch (ReadError) {
     1179                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     1180                root = oldroot;
     1181            } catch (...) {
     1182                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1183                root = oldroot;
     1184                throw;
     1185            }
     1186            return;
     1187        } else if (mimetype == "application/vnd.ms-outlook-pst") {
     1188            string oldroot = root;
     1189            if (depth_limit == 1) {
     1190                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     1191                return;
     1192            }
     1193            // unpack attachments also, together with mbox files
     1194            string cmd = "readpst -r -cv -w -o";
     1195            append_filename_argument(cmd, cache_dir+"/."+ext+url+"/");
     1196            append_filename_argument(cmd, file);
     1197            try {
     1198                size_t new_limit = depth_limit;
     1199                if (new_limit) --new_limit;
     1200                index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size);
     1201            } catch (ReadError) {
     1202                root = oldroot;
     1203                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     1204            } catch (...) {
     1205                root = oldroot;
     1206                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1207                throw;
     1208            }
     1209            return;
    9401210        } else {
    9411211            // If this were a duplicate, we'd have skipped it above.
    9421212            db.add_document(newdocument);
    index_directory(const string &path, const string &url_, size_t depth_limit,  
    9841254                        break;
    9851255                    }
    9861256                    case DirectoryIterator::REGULAR_FILE:
    987                         index_file(file, url, d, mime_map, sample_size);
     1257                        index_file(file, url, d, mime_map, sample_size, depth_limit);
    9881258                        break;
    9891259                    default:
    9901260                        skip(file, "Not a regular file",
    main(int argc, char **argv)  
    10371307    bool overwrite = false;
    10381308    // If delete_removed_documents is true, delete any documents we don't see.
    10391309    bool delete_removed_documents = true;
    1040     string baseurl;
    10411310    size_t depth_limit = 0;
    10421311    size_t sample_size = SAMPLE_SIZE;
    10431312
    main(int argc, char **argv)  
    11651434    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
    11661435    mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email
    11671436
     1437    //  Outlook message folder:
     1438    mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst)
     1439
     1440    // Miscellaneous compound formats:
     1441    mime_map["mbox"] = "message/rfc822";                // => mbox2omega
     1442    mime_map["mbx"] = "message/rfc822";                // => mbox2omega
     1443#ifndef _MSC_VER
     1444    mime_map["zip"] = "application/x-zip"; // recursive scanning
     1445    mime_map["rar"] = "application/x-rar"; // recursive scanning
     1446#endif
     1447
    11681448    // Perl:
    11691449    mime_map["pl"] = "text/x-perl";
    11701450    mime_map["pm"] = "text/x-perl";
    main(int argc, char **argv)  
    12381518        argv[1] = const_cast<char *>("--version");
    12391519    }
    12401520
    1241     string dbpath;
    12421521    int getopt_ret;
    12431522    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:im:E:",
    12441523                                         longopts, NULL)) != -1) {
    main(int argc, char **argv)  
    14131692        baseurl += '/';
    14141693    }
    14151694
     1695    string log_dir = "./"; // FIXME: need to set log_dir to something appropriate.
     1696    error_log = " 2>>"+log_dir+"omindex-error.log";
     1697
    14161698    if (optind >= argc || optind + 2 < argc) {
    14171699        cerr << PROG_NAME": you must specify a directory to index.\n"
    14181700"Do this either as a single directory (corresponding to the base URL)\n"
  • new file xapian-applications/omega/outlook2text.in

    diff --git a/xapian-applications/omega/outlook2text.in b/xapian-applications/omega/outlook2text.in
    new file mode 100644
    index 0000000..b7cf3e2
    - +  
     1#! /bin/sh
     2# converts msg to mbox and extract attachments
     3# either be in the cache dir, or accept it as 2nd arg
     4if [ -n $2 ]; then
     5  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
     6else
     7  # already is in the cache dir
     8  base=`basename "$1" .msg`
     9  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
     10fi
  • xapian-applications/omega/runfilter.cc

    diff --git a/xapian-applications/omega/runfilter.cc b/xapian-applications/omega/runfilter.cc
    index c2a24bd..ed4f8cd 100644
    a b  
    5555
    5656using namespace std;
    5757
     58extern string error_log;
     59
    5860string
    5961stdout_to_string(const string &cmd)
    6062{
    6163    string out;
     64    string tmp = cmd;
     65    tmp += error_log;
    6266#if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT
    6367    // We want to be able to get the exit status of the child process.
    6468    signal(SIGCHLD, SIG_DFL);
    stdout_to_string(const string &cmd)  
    104108        }
    105109#endif
    106110
    107         execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);
     111        execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL);
    108112        _exit(-1);
    109113    }
    110114
    stdout_to_string(const string &cmd)  
    171175        throw ReadError();
    172176    }
    173177#else
    174     FILE * fh = popen(cmd.c_str(), "r");
     178    FILE * fh = popen(tmp.c_str(), "r");
    175179    if (fh == NULL) throw ReadError();
    176180    while (!feof(fh)) {
    177181        char buf[4096];
  • xapian-applications/omega/utils.cc

    diff --git a/xapian-applications/omega/utils.cc b/xapian-applications/omega/utils.cc
    index 797c47d..92b5c76 100644
    a b  
    2323
    2424#include "utils.h"
    2525
     26#include "append_filename_arg.h"
     27
    2628#include <cassert>
    2729#include <stdio.h> // for sprintf/snprintf
    2830#include <cstdlib>
     31#include <cstring>
     32#include "safesysstat.h"
    2933
    3034#include <string>
    3135
    3236using namespace std;
    3337
     38#ifdef __WIN32__
     39#include "safewindows.h"
     40#endif
     41
     42/// Remove a directory and contents.
     43void
     44rm_rf(const string &filename)
     45{
     46    // Check filename exists and is actually a directory
     47    struct stat sb;
     48    if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return;
     49
     50#ifdef __WIN32__
     51    static int win95 = -1;
     52    if (win95 == -1) {
     53        OSVERSIONINFO info;
     54        memset(&info, 0, sizeof(OSVERSIONINFO));
     55        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
     56        if (GetVersionEx(&info)) {
     57            win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
     58        }
     59    }
     60
     61    string cmd = win95 ?
     62        "deltree /y" : // for 95-like systems.
     63        "rd /s /q"; // for NT-like systems.
     64#else
     65    string cmd = "rm -rf";
     66#endif
     67    append_filename_argument(cmd, filename);
     68    if (system(cmd.c_str())) { /* FIXME */ }
     69}
     70
    3471// This ought to be enough for any of the conversions below.
    3572#define BUFSIZE 100
    3673
  • xapian-applications/omega/utils.h

    diff --git a/xapian-applications/omega/utils.h b/xapian-applications/omega/utils.h
    index a54a4f8..b2241b7 100644
    a b int string_to_int(const std::string & s);  
    3434/** Remove any leading and/or trailing whitespace from @a s. */
    3535void trim(std::string & s);
    3636
     37void rm_rf(const std::string &filename);
     38
    3739#endif