Ticket #282: xapian-omega-trunk-r16879-from-ticket-285-and-cleaned-up-updated-2012-11-13.patch
File xapian-omega-trunk-r16879-from-ticket-285-and-cleaned-up-updated-2012-11-13.patch, 46.8 KB (added by , 11 years ago) |
---|
-
xapian-applications/omega/ChangeLog
diff --git a/xapian-applications/omega/ChangeLog b/xapian-applications/omega/ChangeLog index a8e4b7e..33bc585 100644
a b 1 2006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com> 2 3 omega-0.9.6c: 4 * omindex.cc: Fix wrong timestamp comparison in cache logic 5 * outlook2text.in: New script 6 7 2006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com> 8 9 omega-0.9.6a: 10 * omindex.cc: Added cached virtual directories zip,msg,pst,...). 11 Consistently log stderr to /var/log/omega/omindex-error.log. 12 1 13 Tue Sep 25 23:57:12 GMT 2012 Olly Betts <olly@survex.com> 2 14 3 15 * Makefile.am,omindex.cc: Replace shell_protect() with -
xapian-applications/omega/Makefile.am
diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am index 3599376..9b29a02 100644
a b pkglibbindir = $(pkglibdir)/bin 78 78 pkglibbin_PROGRAMS = omega 79 79 dist_pkglibbin_SCRIPTS = outlookmsg2html 80 80 bin_PROGRAMS = omindex scriptindex 81 dist_libexec_SCRIPTS = outlook2text mimeexplode msgconvert.pl 81 82 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 82 83 83 84 check_PROGRAMS = atomparsetest htmlparsetest md5test urlenctest utf8converttest … … dist_man_MANS = omindex.1 scriptindex.1 173 174 MAINTAINERCLEANFILES = $(dist_man_MANS) 174 175 endif 175 176 177 CLEANFILES = outlook2text 178 179 outlook2text: $(srcdir)/outlook2text.in Makefile 180 sed "s,@MSGCONVERT@,$(pkglibbindir)/msgconvert.pl,;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@ 181 176 182 if DOCUMENTATION_RULES 177 183 omindex.1: omindex$(EXEEXT) makemanpage 178 184 ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1 -
new file xapian-applications/omega/mimeexplode
diff --git a/xapian-applications/omega/mimeexplode b/xapian-applications/omega/mimeexplode new file mode 100644 index 0000000..70743ab
#!/usr/bin/perl -w

=head1 NAME

mimeexplode - explode one or more MIME messages

=head1 SYNOPSIS

    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...

    someprocess | mimeexplode -

=head1 DESCRIPTION

Takes one or more files from the command line that contain MIME
messages, and explodes their contents out into directories.  The
first message is written to the directory given with C<-d> (default
C<msg0>, relative to the current working directory); each following
message goes to that name with a counter appended (C<< <dir>1 >>,
C<< <dir>2 >>, ...).  Directories that already exist are reused, so the
same input is always exploded into the same place.

If no file is given, or the file name is C<->, the message is read
from standard input.

For every decoded part one line is written to stdout, like this:

        Part: msg0/filename-1.dat (text/plain)
        Part: msg0/filename-2.dat (text/plain)
        Part: msg01/dir.gif (image/gif)
        Part: msg01/face.jpg (image/jpeg)

This was written as an example of the MIME:: modules in the
MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
way of splitting a MIME message if you need to decode something, and
you don't have a MIME mail reader on hand.

=head1 COMMAND LINE OPTIONS

    -d outdir

=head1 AUTHOR

Eryq C<eryq@zeegee.com>, in a big hurry...
Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir

=cut

#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!

# Lexical file handles and three-argument open need perl 5.6.
require 5.006;

use strict;

use MIME::Parser;
use Getopt::Std;

my %opts;
my $outbase = '';
my $postfix = '';

#------------------------------------------------------------
# dump_entity - print one "Part:" line per leaf part of an entity
#
# Recurses through multipart entities.  A leaf without a body handle
# (nothing was decoded for it) is reported on stderr and skipped
# instead of aborting the whole run.
#------------------------------------------------------------
sub dump_entity {
    my $ent   = shift;
    my @parts = $ent->parts;

    if (@parts) {    # multipart...
        dump_entity($_) for @parts;
        return;
    }

    # single part...
    my $body = $ent->bodyhandle;
    unless ($body) {
        warn "Part of type ", scalar($ent->head->mime_type),
            " has no decoded body; skipped\n";
        return;
    }
    print "    Part: ", $body->path,
        " (", scalar($ent->head->mime_type), ")\n";
}

#------------------------------------------------------------
# open_input - return a read handle for a command line argument
#
# "-" means standard input.  Everything else is opened with the
# three-argument form of open so that characters such as ">" or "|"
# in a file name are never interpreted as an open mode.
#------------------------------------------------------------
sub open_input {
    my $file = shift;

    return \*STDIN if $file eq '-';

    open(my $fh, '<', $file) or die "couldn't open $file: $!";
    return $fh;
}

#------------------------------------------------------------
# main - explode every message named on the command line
#
# Always returns true; a message that cannot be parsed is reported
# on stderr and the remaining messages are still processed.
#------------------------------------------------------------
sub main {
    # make sure the same message gets exploded into the same dir
    getopts('d:', \%opts);
    $outbase = (defined $opts{d} && length $opts{d}) ? $opts{d} : "msg0";
    my $outdir = $outbase;

    # Go through messages:
    @ARGV or unshift @ARGV, "-";
    while (defined(my $file = shift @ARGV)) {

        # Sanity:
        unless (-d $outdir) {
            mkdir($outdir, 0755) or die "couldn't make $outdir: $!";
        }
        (-w $outdir) or die "cwd $outdir not writable!";

        # Create a new parser object:
        my $parser = MIME::Parser->new;
        ### $parser->parse_nested_messages('REPLACE');

        # Optional: set up parameters that will affect how it extracts
        # documents from the input stream:
        $parser->output_dir($outdir);

        # Parse an input stream.  The parser may die on malformed input;
        # trap that so the remaining messages are still handled.
        my $in     = open_input($file);
        my $entity = eval { $parser->parse($in) };
        unless ($entity) {
            print STDERR "Couldn't parse MIME in $file; continuing...\n";
            print STDERR $@ if $@;
        }
        close $in unless $file eq '-';

        # Congratulations: you now have a (possibly multipart) MIME entity!
        dump_entity($entity) if $entity;
        ### $entity->dump_skeleton if $entity;

        # '' -> 1 -> 2 ...: the next message goes to <outbase>1, <outbase>2, ...
        $postfix++;
        $outdir = $outbase . $postfix;
    }
    return 1;
}

exit(main() ? 0 : 1);
#------------------------------------------------------------
new file xapian-applications/omega/msgconvert.pl
diff --git a/xapian-applications/omega/msgconvert.pl b/xapian-applications/omega/msgconvert.pl new file mode 100644 index 0000000..cf32079
- + 1 #!/usr/bin/perl -w 2 # 3 # msgconvert.pl: 4 # 5 # Convert .MSG files (made by Outlook (Express)) to multipart MIME messages. 6 # 7 # Copyright 2002, 2004, 2006 Matijs van Zuijlen 8 # 9 # This program is free software; you can redistribute it and/or modify it 10 # under the terms of the GNU General Public License as published by the 11 # Free Software Foundation; either version 2 of the License, or (at your 12 # option) any later version. 13 # 14 # This program is distributed in the hope that it will be useful, but 15 # WITHOUT ANY WARRANTY; without even the implied warranty of 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 # Public License for more details. 18 # 19 # CHANGES: 20 # 20020715 Recognize new items 'Cc', mime type of attachment, long 21 # filename of attachment, and full headers. Attachments turn out 22 # to be numbered, so a regexp is now used to recognize label of 23 # items that are attachments. 24 # 20020831 long file name will definitely be used if present. Full headers 25 # and mime type information are used when present. Created 26 # generic system for specifying known items to be skipped. 27 # Unexpected contents is never reason to bail out anymore. Added 28 # support for usage message and option processing (--verbose). 29 # 20040104 Handle address data slightly better, make From line less fake, 30 # make $verbose and $skippable_entries global vars, handle HTML 31 # variant of body text if present (though not optimally). 32 # 20040214 Fix typos and incorrect comments. 33 # 20040307 - Complete rewrite: All functional parts are now in the package 34 # MSGParser; 35 # - Creation of MIME::Entity object is delayed until the output 36 # routines, which means all data is known; This means I can 37 # create a multipart/alternative body. 38 # - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for 39 # the information). 
40 # 20040514 Check if $self->{HEAD} actually exists before trying to add its 41 # contents to the output Mime object's header data. 42 # (Bug reported by Thomas Ng). 43 # Don't produce multipart messages if not needed. 44 # (Bug reported by Justin B. Scout). 45 # 20040529 Correctly format OLEDATE. 46 # 20040530 - Extract date from property 0047 (thanks, Marc Goodman). 47 # - Use address data to make To: and Cc: lines complete 48 # - Use the in-reply-to property 49 # - More unknown properties named. 50 # - Found another property containing an SMTP address. 51 # - Put non-SMTP type addresses back in output. 52 # 20040825 Replace 'our' to declare globals with 'use vars'. This means 53 # the globals are now properly scoped inside the package and not 54 # the file. 55 # This also fixes the bug that this program did not work on perl 56 # versions below 5.6. (Bug reported by Tim Gustafson) 57 # 20060218 More sensible encoding warnings. 58 # 20060219 Move OLE parsing to main program. 59 # Parse nested MSG files (Bug reported by Christof Lukas). 60 # 20060225 Simplify code. 61 # 62 63 # 64 # Import modules. 
#
package MSGParser;
use strict;
use OLE::Storage_Lite;
use MIME::Entity;
use MIME::Parser;
use Date::Format;
use POSIX qw(mktime);
use constant DIR_TYPE => 1;
use constant FILE_TYPE => 2;

# NOTE: 'use vars' (not 'our') is deliberate; see the 20040825 changelog
# entry about perl versions below 5.6.
use vars qw($skipproperties $skipheaders);

#
# Properties that are known but deliberately not converted.  The value is
# the description printed in verbose mode.
# Descriptions partially based on mapitags.h
#
$skipproperties = {
    # Envelope properties
    '000B' => "Conversation key?",
    '001A' => "Type of message",
    '003B' => "Sender address variant",
    '003D' => "Contains 'Re: '",
    '003F' => "'recieved by' id",
    '0040' => "'recieved by' name",
    '0041' => "Sender variant address id",
    '0042' => "Sender variant name",
    '0043' => "'recieved representing' id",
    '0044' => "'recieved representing' name",
    '0046' => "Read receipt address id",
    '0051' => "'recieved by' search key",
    '0052' => "'recieved representing' search key",
    '0053' => "Read receipt search key",
    '0064' => "Sender variant address type",
    '0065' => "Sender variant address",
    '0070' => "Conversation topic",
    '0071' => "Conversation index",
    '0075' => "'recieved by' address type",
    '0076' => "'recieved by' email address",
    '0077' => "'recieved representing' address type",
    '0078' => "'recieved representing' email address",
    '007F' => "something like a message id",
    # Recipient properties
    '0C19' => "Reply address variant",
    '0C1D' => "Reply address variant",
    '0C1E' => "Reply address type",
    # Non-transmittable properties
    '0E02' => "?Should BCC be displayed",
    '0E0A' => "sent mail id",
    '0E1D' => "Subject w/o Re",
    '0E27' => "64 bytes: Unknown",
    '0FF6' => "Index",
    '0FF9' => "Index",
    '0FFF' => "Address variant",
    # Content properties
    '1008' => "Summary or something",
    '1009' => "RTF Compressed",
    # 'Common property'
    '3001' => "Display name",
    '3002' => "Address Type",
    '300B' => "'Search key'",
    # Attachment properties
    '3702' => "Attachment encoding",
    '3703' => "Attachment extension",
    '3709' => "'Attachment rendering'", # Maybe an icon or something?
    '3713' => "Icon URL?",
    # 'Mail user'
    '3A20' => "Address variant",
    # 3900 -- 39FF: 'Address book'
    '39FF' => "7 bit display name",
    # 'Display table properties'
    '3FF8' => "Routing data?",
    '3FF9' => "Routing data?",
    '3FFA' => "Routing data?",
    '3FFB' => "Routing data?",
    # 'Transport-defined envelope property'
    '4029' => "Sender variant address type",
    '402A' => "Sender variant address",
    '402B' => "Sender variant name",
    '5FF6' => "Recipient name",
    '5FF7' => "Recipient address variant",
    # 'Provider-defined internal non-transmittable property'
    '6740' => "Unknown, binary data",
    # User defined id's
    '8000' => "Content Class",
    '8002' => "Unknown, binary data",
};

# Headers from the original message that are NOT copied to the output,
# because the generated MIME entity supplies its own.
$skipheaders = {
    "MIME-Version" => 1,
    "Content-Type" => 1,
    "Content-Transfer-Encoding" => 1,
    "X-Mailer" => 1,
    "X-Msgconvert" => 1,
    "X-MS-Tnef-Correlator" => 1,
    "X-MS-Has-Attach" => 1,
};

use constant ENCODING_UNICODE => '001F';
use constant KNOWN_ENCODINGS => {
    '000D' => 'Directory',
    '001F' => 'Unicode',
    '001E' => 'Ascii?',
    '0102' => 'Binary',
};

# The MAP_* tables map a property id to [ key to store the data under,
# flag: strip NUL bytes from the data first ].
use constant MAP_ATTACHMENT_FILE => {
    '3701' => ["DATA",        0], # Data
    '3704' => ["SHORTNAME",   1], # Short file name
    '3707' => ["LONGNAME",    1], # Long file name
    '370E' => ["MIMETYPE",    1], # mime type
    '3716' => ["DISPOSITION", 1], # disposition
};

use constant MAP_SUBITEM_FILE => {
    '1000' => ["BODY_PLAIN",     0], # Body
    '1013' => ["BODY_HTML",      0], # HTML Version of body
    '0037' => ["SUBJECT",        1], # Subject
    '0047' => ["SUBMISSION_ID",  1], # Seems to contain the date
    '007D' => ["HEAD",           1], # Full headers
    '0C1A' => ["FROM",           1], # Reply-To: Name
    '0C1E' => ["FROM_ADDR_TYPE", 1], # From: Address type
    '0C1F' => ["FROM_ADDR",      1], # Reply-To: Address
    '0E04' => ["TO",             1], # To: Names
    '0E03' => ["CC",             1], # Cc: Names
    '1035' => ["MESSAGEID",      1], # Message-Id
    '1042' => ["INREPLYTO",      1], # In reply to Message-Id
};

use constant MAP_ADDRESSITEM_FILE => {
    '3001' => ["NAME",        1], # Real name
    '3002' => ["TYPE",        1], # Address type
    '403D' => ["TYPE",        1], # Address type
    '3003' => ["ADDRESS",     1], # Address
    '403E' => ["ADDRESS",     1], # Address
    '39FE' => ["SMTPADDRESS", 1], # SMTP Address variant
};

#
# Main body of module
#

# Constructor.  May be called on the class or on an existing object.
sub new {
    my $that = shift;
    my $class = ref $that || $that;

    my $self = {
        ATTACHMENTS => [],
        ADDRESSES => [],
        VERBOSE => 0,
        HAS_UNICODE => 0,
        FROM_ADDR_TYPE => "",
    };
    return bless $self, $class;
}

#
# Main sub: parse the PPS tree, and return
#
sub parse {
    my $self = shift;
    my $PPS = shift or die "Internal error: No PPS tree";
    $self->_RootDir($PPS);
}

#
# Build and return a MIME::Entity from everything collected by parse().
# Always returns an entity, even for a message without body or attachments.
#
sub mime_object {
    my $self = shift;

    my $bodymime;
    my $mime;

    if ($self->_IsMultiPart) {
        # Construct a multipart message object

        $mime = MIME::Entity->build(Type => "multipart/mixed");

        # Set the entity that we'll save the body parts to. If there's more than
        # one part, it's a new entity, otherwise, it's the main $mime object.
        if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
            $bodymime = MIME::Entity->build(
                Type => "multipart/alternative",
                Encoding => "8bit",
            );
            $mime->add_part($bodymime);
        } else {
            $bodymime = $mime;
        }
        if ($self->{BODY_PLAIN}) {
            $self->_SaveAttachment($bodymime, {
                MIMETYPE => 'text/plain; charset=ISO-8859-1',
                ENCODING => '8bit',
                DATA => $self->{BODY_PLAIN},
                DISPOSITION => 'inline',
            });
        }
        if ($self->{BODY_HTML}) {
            $self->_SaveAttachment($bodymime, {
                MIMETYPE => 'text/html',
                ENCODING => '8bit',
                DATA => $self->{BODY_HTML},
                DISPOSITION => 'inline',
            });
        }
        foreach my $att (@{$self->{ATTACHMENTS}}) {
            $self->_SaveAttachment($mime, $att);
        }
    } elsif ($self->{BODY_PLAIN}) {
        # Construct a single part message object with a plain text body
        $mime = MIME::Entity->build(
            Type => "text/plain",
            Data => $self->{BODY_PLAIN}
        );
    } elsif ($self->{BODY_HTML}) {
        # Construct a single part message object with an HTML body
        $mime = MIME::Entity->build(
            Type => "text/html",
            Data => $self->{BODY_HTML}
        );
    } else {
        # Neither body nor attachments: build an empty text part so that the
        # header code below (and our callers) always get a valid entity
        # instead of calling methods on undef.
        $mime = MIME::Entity->build(
            Type => "text/plain",
            Data => []
        );
    }

    $self->_CopyHeaderData($mime);

    $self->_SetHeaderFields($mime);

    return $mime;
}

# Actually output the message in mbox format
sub print {
    my $self = shift;

    my $mime = $self->mime_object;

    # Construct From line from whatever we know.
    my $string = (
        ($self->{FROM_ADDR_TYPE} eq "SMTP" and defined $self->{FROM_ADDR}) ?
        $self->{FROM_ADDR} :
        'someone@somewhere'
    );
    $string =~ s/\n//g;

    # The date used here is not really important.
    print "From ", $string, " ", scalar localtime, "\n";
    $mime->print(\*STDOUT);
    print "\n";
}

# Set the verbosity level; a true value enables "Skipping ..." diagnostics.
sub set_verbosity {
    my ($self, $verbosity) = @_;
    defined $verbosity or die "Internal error: no verbosity level";
    $self->{VERBOSE} = $verbosity;
}

#
# Below are functions that walk the PPS tree. The *Dir functions handle
# processing the directory nodes of the tree (mainly, iterating over the
# children), whereas the *Item functions handle processing the items in the
# directory (if such an item is itself a directory, it will in turn be
# processed by the relevant *Dir function).
#

#
# RootItem: Check Root Entry, parse sub-entries.
# The OLE file consists of a single entry called Root Entry, which has
# several children. These children are parsed in the sub SubItem.
#
sub _RootDir {
    my ($self, $PPS) = @_;

    foreach my $child (@{$PPS->{Child}}) {
        $self->_SubItem($child);
    }
}

# Dispatch one child of the root entry on its node type.
sub _SubItem {
    my ($self, $PPS) = @_;

    if ($PPS->{Type} == DIR_TYPE) {
        $self->_SubItemDir($PPS);
    } elsif ($PPS->{Type} == FILE_TYPE) {
        $self->_SubItemFile($PPS);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# Directory below the root: a recipient, an attachment, or something unknown.
sub _SubItemDir {
    my ($self, $PPS) = @_;

    $self->_GetOLEDate($PPS);

    my $name = $self->_GetName($PPS);

    if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
        $self->_AddressDir($PPS);
    } elsif ($name =~ /__attach_version1 0_ /) { # Attachment
        $self->_AttachmentDir($PPS);
    } else {
        $self->_UnknownDir($name);
    }
}

# File below the root: a property of the message itself.
sub _SubItemFile {
    my ($self, $PPS) = @_;

    my $name = $self->_GetName($PPS);
    my ($property, $encoding) = $self->_ParseItemName($name);

    # The message properties are stored directly in the parser object,
    # hence $self is passed as the target hash.
    $self->_MapProperty($self, $PPS->{Data}, $property,
        MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
}

# Collect the properties of one recipient and append it to ADDRESSES.
sub _AddressDir {
    my ($self, $PPS) = @_;

    my $address = {
        NAME => undef,
        ADDRESS => undef,
        TYPE => "",
    };
    foreach my $child (@{$PPS->{Child}}) {
        $self->_AddressItem($child, $address);
    }
    push @{$self->{ADDRESSES}}, $address;
}

# Store one property of a recipient in $addr_info.
sub _AddressItem {
    my ($self, $PPS, $addr_info) = @_;

    my $name = $self->_GetName($PPS);

    # DIR Entries: There should be none.
    if ($PPS->{Type} == DIR_TYPE) {
        $self->_UnknownDir($name);
    } elsif ($PPS->{Type} == FILE_TYPE) {
        my ($property, $encoding) = $self->_ParseItemName($name);
        $self->_MapProperty($addr_info, $PPS->{Data}, $property,
            MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# Collect the properties of one attachment and append it to ATTACHMENTS.
sub _AttachmentDir {
    my ($self, $PPS) = @_;

    my $attachment = {
        SHORTNAME => undef,
        LONGNAME => undef,
        MIMETYPE => 'application/octet-stream',
        ENCODING => 'base64',
        DISPOSITION => 'attachment',
        DATA => undef
    };
    foreach my $child (@{$PPS->{Child}}) {
        $self->_AttachmentItem($child, $attachment);
    }
    push @{$self->{ATTACHMENTS}}, $attachment;
}

# Store one property of an attachment in $att_info.  A directory with
# property 3701 is a nested MSG file, which is converted recursively and
# attached as message/rfc822.
sub _AttachmentItem {
    my ($self, $PPS, $att_info) = @_;

    my $name = $self->_GetName($PPS);

    my ($property, $encoding) = $self->_ParseItemName($name);

    if ($PPS->{Type} == DIR_TYPE) {

        if (defined $property and $property eq '3701') { # Nested MSG file
            my $msgp = MSGParser->new();
            $msgp->parse($PPS);
            my $data = $msgp->mime_object->as_string;
            $att_info->{DATA} = $data;
            $att_info->{MIMETYPE} = 'message/rfc822';
            $att_info->{ENCODING} = '8bit';
        } else {
            $self->_UnknownDir($name);
        }

    } elsif ($PPS->{Type} == FILE_TYPE) {
        $self->_MapProperty($att_info, $PPS->{Data}, $property,
            MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# Store $data in $hash under the key that $map assigns to $property.
# Returns 1 if the property is listed in $map, 0 otherwise.
sub _MapProperty {
    my ($self, $hash, $data, $property, $map) = @_;

    defined $property or return 0;
    my $arr = $map->{$property} or return 0;

    # Second element of the map entry: strip NUL bytes from the data.
    $arr->[1] and defined $data and $data =~ s/\000//g;
    $hash->{$arr->[0]} = $data;

    return 1;
}

# Report a directory entry we do not convert.
sub _UnknownDir {
    my ($self, $name) = @_;

    if ($name eq '__nameid_version1 0') {
        $self->{VERBOSE}
            and warn "Skipping DIR entry $name (Introductory stuff)\n";
        return;
    }
    warn "Unknown DIR entry $name\n";
}

# Report a file entry we do not convert.  Known-but-skipped properties are
# only mentioned in verbose mode.
sub _UnknownFile {
    my ($self, $name) = @_;

    if ($name eq '__properties_version1 0') {
        $self->{VERBOSE}
            and warn "Skipping FILE entry $name (Properties)\n";
        return;
    }

    my ($property, $encoding) = $self->_ParseItemName($name);
    unless (defined $property) {
        warn "Unknown FILE entry $name\n";
        return;
    }
    if ($skipproperties->{$property}) {
        $self->{VERBOSE}
            and warn "Skipping property $property ($skipproperties->{$property})\n";
        return;
    } elsif ($property =~ /^80/) {
        $self->{VERBOSE}
            and warn "Skipping property $property (user-defined property)\n";
        return;
    } else {
        warn "Unknown property $property\n";
        return;
    }
}

#
# Helper functions
#

# Name of a PPS node, converted from UCS-2 and with every non-word
# character replaced by a space.
sub _GetName {
    my ($self, $PPS) = @_;
    return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
}

sub _NormalizeWhiteSpace {
    my ($self, $name) = @_;
    $name =~ s/\W/ /g;
    return $name;
}

# Remember the first usable OLE timestamp (Time2nd, else Time1st) as OLEDATE.
sub _GetOLEDate {
    my ($self, $PPS) = @_;
    unless (defined ($self->{OLEDATE})) {
        # Make Date
        my $datearr;
        $datearr = $PPS->{Time2nd};
        $datearr = $PPS->{Time1st} unless($datearr);
        $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
    }
}

# Format a (sec, min, hour, mday, mon, year-1900) array ref as a mail date.
# Returns undef if the values do not form a date mktime can represent.
sub _FormatDate {
    my ($self, $datearr) = @_;

    # TODO: This is a little convoluted. Directly using strftime didn't seem
    # to work.
    my $datetime = mktime(@$datearr);
    defined $datetime or return undef;
    return time2str("%a, %d %h %Y %X %z", $datetime);
}

# If we didn't get the date from the original header data, we may be able
# to get it from the SUBMISSION_ID:
# It seems to have the format of a semicolon-separated list of key=value
# pairs. The key l has a value with the format:
# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
# the format YYMMDDHHMMSS.
# NOTE(review): the trailing Z suggests UTC, but the value is fed to
# mktime, which interprets it as local time -- confirm before changing.
sub _SubmissionIdDate {
    my $self = shift;

    my $submission_id = $self->{SUBMISSION_ID} or return undef;
    $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
        or return undef;
    my ($yy, $mon, $mday, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);
    # mktime wants years since 1900.  Two-digit years below 70 are taken to
    # be 20xx (the old pivot of 20 put every message sent from 2020 on into
    # the 1920s).
    my $year = $yy;
    $year += 100 if $year < 70;
    return $self->_FormatDate([$sec, $min, $hour, $mday, $mon - 1, $year]);
}

# Split an item name of the form "__substg1 0_PPPPEEEE" into
# (property, encoding).  Returns (undef, undef) for any other name.
sub _ParseItemName {
    my ($self, $name) = @_;

    if ($name =~ /^__substg1 0_(....)(....)$/) {
        my ($property, $encoding) = ($1, $2);
        if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
            warn "This MSG file contains Unicode fields."
                . " This is currently unsupported.\n";
            $self->{HAS_UNICODE} = 1;
        } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
            warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
        }
        return ($property, $encoding);
    } else {
        return (undef, undef);
    }
}

# Attach $att (a hash with MIMETYPE, ENCODING, DATA, DISPOSITION and
# optionally LONGNAME/SHORTNAME) as a new part of $mime.
sub _SaveAttachment {
    my ($self, $mime, $att) = @_;

    my $ent = $mime->attach(
        Type => $att->{MIMETYPE},
        Encoding => $att->{ENCODING},
        Data => [],
        Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
        Disposition => $att->{DISPOSITION}
    );

    my $handle;
    if ($handle = $ent->open("w")) {
        # An attachment directory without a data stream leaves DATA undef.
        $handle->print(defined $att->{DATA} ? $att->{DATA} : "");
        $handle->close;
    } else {
        warn "Could not write data!";
    }
}

# Add a header field unless the entity already has a value for it or the
# new value is empty.
sub _AddHeaderField {
    my ($self, $mime, $fieldname, $value) = @_;

    my $oldvalue = $mime->head->get($fieldname);
    return if $oldvalue;
    $mime->head->add($fieldname, $value) if $value;
}

# "Name <address>" for the given tag (e.g. "FROM"), or the empty string if
# neither name nor address is known, so that no bogus " <>" header is added.
sub _Address {
    my ($self, $tag) = @_;
    my $name = $self->{$tag} || "";
    my $address = $self->{$tag . "_ADDR"} || "";
    return "" unless length $name or length $address;
    return "$name <$address>";
}

# Find SMTP addresses for the given list of names.
# $names is a "; "-separated string; returns a ", "-separated string, or
# the empty string if $names is undefined.
sub _ExpandAddressList {
    my ($self, $names) = @_;

    defined $names or return "";

    my $addresspool = $self->{ADDRESSES};
    my @namelist = split /; */, $names;
    my @result;
    NAME: foreach my $name (@namelist) {
        foreach my $address (@$addresspool) {
            defined $address->{NAME} or next;
            if ($name eq $address->{NAME}) {
                my $addresstext = $address->{NAME} . " <";
                if (defined ($address->{SMTPADDRESS})) {
                    $addresstext .= $address->{SMTPADDRESS};
                } elsif ($address->{TYPE} eq "SMTP"
                         and defined $address->{ADDRESS}) {
                    $addresstext .= $address->{ADDRESS};
                }
                $addresstext .= ">";
                push @result, $addresstext;
                next NAME;
            }
        }
        push @result, $name;
    }
    return join ", ", @result;
}

# Parse the full original header block into a MIME::Head.
# Returns nothing if there is no header data or it cannot be parsed.
sub _ParseHead {
    my ($self, $data) = @_;
    defined $data or return;
    # Parse full header date if we got that.
    my $parser = MIME::Parser->new();
    $parser->output_to_core(1);
    $parser->decode_headers(1);
    $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
    # The parser may die or return nothing on bad input; in both cases give
    # up on the original headers instead of calling ->head on a non-object.
    my $entity = eval { $parser->parse_data($data) };
    unless ($entity) {
        warn "Couldn't parse full headers!";
        return;
    }
    my $head = $entity->head;
    $head->unfold;
    return $head;
}

# Find out if we need to construct a multipart message
sub _IsMultiPart {
    my $self = shift;

    return (
        ($self->{BODY_HTML} and $self->{BODY_PLAIN})
            or @{$self->{ATTACHMENTS}}>0
    );
}

# Copy original header data.
# Note: This should contain the Date: header.
sub _CopyHeaderData {
    my ($self, $mime) = @_;

    my $head = $self->_ParseHead($self->{HEAD}) or return;

    foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
        foreach my $value ($head->get_all($tag)) {
            $mime->head->add($tag, $value);
        }
    }
}

# Set header fields
sub _SetHeaderFields {
    my ($self, $mime) = @_;

    # If we didn't get the date from the original header data, we may be able
    # to get it from the SUBMISSION_ID:
    $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());

    # Third and last chance to set the Date: header; this uses the date the
    # MSG file was saved.
    $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
    $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
    $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
    #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
    $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
    $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
    $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
    $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
}

package main;
use Getopt::Long;
use Pod::Usage;

# Setup command line processing.
my $verbose = '';
my $help = '';      # Print help message and exit.
GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
pod2usage(1) if $help;

# Get file name
my $file = $ARGV[0];
defined $file or pod2usage(2);
warn "Will parse file: $file\n" if $verbose;

# Load and parse MSG file (is OLE)
my $Msg = OLE::Storage_Lite->new($file);
my $PPS = $Msg->getPpsTree(1);
$PPS or die "$file must be an OLE file";

# parse PPS tree
my $parser = MSGParser->new();
$parser->set_verbosity(1) if $verbose;
$parser->parse($PPS);
$parser->print();

#
# Usage info follows.
#
__END__

=head1 NAME

msgconvert.pl - Convert Outlook .msg files to mbox format

=head1 SYNOPSIS

msgconvert.pl [options] <file.msg>

  Options:
    --verbose   be verbose
    --help      help message

=head1 OPTIONS

=over 8

=item B<--verbose>

Print information about skipped parts of the .msg file.

=item B<--help>

Print a brief help message.

=back

=head1 DESCRIPTION

This program will output the message contained in file.msg in mbox format
on stdout. It will complain about unrecognized OLE parts on
stderr.

=head1 BUGS

Not all data that's in the .MSG file is converted. There simply are some
parts whose meaning escapes me. One of these must contain the date the
message was sent, for example. Formatting of text messages will also be
lost. YMMV.

=cut
xapian-applications/omega/omindex.cc
diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 1ea8e77..bac6e54 100644
a b 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012 Olly Betts 7 7 * Copyright 2009 Frank J Bruzzaniti 8 * Copyright 2006,2007,2008 AVL List GesmbH 8 9 * Copyright 2012 Mihai Bivol 9 10 * 10 11 * This program is free software; you can redistribute it and/or … … 70 71 71 72 #include "gnu_getopt.h" 72 73 74 #ifndef LIBEXECDIR 75 // must have ending slash 76 //# define LIBEXECDIR "/usr/lib/omega/bin/" 77 # define LIBEXECDIR "" 78 #endif 79 #ifndef PKGDATADIR 80 // must have ending slash 81 # define PKGDATADIR "/usr/share/omega/" 82 #endif 83 73 84 using namespace std; 74 85 75 86 #define TITLE_SIZE 128 … … static bool ignore_exclusions = false; 84 95 static bool spelling = false; 85 96 static off_t max_size = 0; 86 97 static bool verbose = false; 98 string error_log; /* used in runfilter.cc */ 99 static string baseurl; 100 static string dbpath; 101 static string cache_dir; 87 102 static enum { 88 103 EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP 89 104 } empty_body = EMPTY_BODY_WARN; … … static time_t last_mod_max; 104 119 // text are common, so we handle these with a std::map. 
105 120 static map<string, string> commands; 106 121 122 static void 123 index_directory(const string &path, const string &url_, size_t depth_limit, 124 map<string, string>& mime_map, size_t sample_size); 125 107 126 inline static bool 108 127 p_notalnum(unsigned int c) 109 128 { … … skip_unknown_mimetype(const string & file, const string & mimetype) 258 277 259 278 void 260 279 index_mimetype(const string & file, const string & url, const string & ext, 261 const string &mimetype, DirectoryIterator &d, size_t sample_size); 280 const string &mimetype, DirectoryIterator &d, size_t sample_size, 281 map<string, string>& mime_map, size_t depth_limit); 282 283 static 284 void mkdir_p(const string &path, mode_t mode) { 285 (void)mode; // FIXME 286 #ifdef __WIN32__ 287 string cmd = "mkdir "; 288 append_filename_argument(cmd, path); 289 system(cmd.c_str()); 290 #else 291 string cmd = "mkdir -p "; 292 append_filename_argument(cmd, path); 293 if (system(cmd.c_str()) < 0) { /* FIXME */ } 294 #endif 295 } 296 297 /* 298 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there 299 */ 300 static void 301 index_cached_directory(size_t depth_limit, 302 const string &file, 303 const string &url, 304 const string &ext, 305 const string &cmd, 306 map<string, string>& mime_map, 307 size_t sample_size) 308 { 309 string oldroot = root; 310 root = cache_dir; 311 string cache = root+"/."+ext; 312 string cachedir = cache+url; 313 struct stat statfile, statcache; 314 bool extract_cache; 315 #ifdef HAVE_LSTAT 316 lstat(file.c_str(), &statfile); 317 lstat(cachedir.c_str(), &statcache); 318 #else 319 stat(file.c_str(), &statfile); 320 stat(cachedir.c_str(), &statcache); 321 #endif 322 extract_cache = true; 323 // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago, 324 // then it was already extracted. 
325 if (S_ISDIR(statcache.st_mode) && 326 S_ISREG(statfile.st_mode) && 327 (statfile.st_mtime < statcache.st_mtime) && 328 (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call 329 { 330 // but is it in the database also? prevent from deleting skipped files 331 if (verbose) 332 cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction " 333 // << statfile.st_mtime << " < " << statcache.st_mtime 334 << endl; 335 extract_cache = false; 336 } 337 if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) { 338 // If last_mod > last_mod_max, we know for sure that the file is new 339 // or updated. 340 if (statfile.st_mtime <= last_mod_max) { 341 // check database timestamp for cached container, esp. for cleaned up caches. 342 // if already in db we need not to extract again 343 string urlterm("U"); 344 urlterm += baseurl; 345 urlterm += "/."+ext+url; 346 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 347 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 348 349 Xapian::PostingIterator p = db.postlist_begin(urlterm); 350 if (p != db.postlist_end(urlterm)) { 351 Xapian::docid docid = *p; 352 Xapian::Document doc = db.get_document(docid); 353 string value = doc.get_value(VALUE_LASTMOD); 354 time_t old_last_mod = binary_string_to_int(value); 355 if (statfile.st_mtime <= old_last_mod) { 356 if (verbose) 357 cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl; 358 // The docid should be in updated - the only valid 359 // exception is if the URL was long and hashed to the 360 // same URL as an existing document indexed in the same 361 // batch. 
362 if (usual(docid < updated.size() && !updated[docid])) { 363 updated[docid] = true; 364 --old_docs_not_seen; 365 } 366 root = oldroot; 367 return; 368 } 369 } 370 } 371 } 372 373 if (extract_cache) { 374 if (verbose) 375 cout << "[EXTRACT into cache " << cachedir << "]" << endl; 376 if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode)) 377 cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" " 378 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL) 379 << endl; 380 if (!S_ISDIR(statcache.st_mode)) 381 mkdir_p(cachedir, 0755); 382 stdout_to_string(cmd); 383 #ifndef __WIN32__ 384 string chmod_cmd = "chmod -R u+rwx "; 385 append_filename_argument(chmod_cmd, cachedir); 386 stdout_to_string(chmod_cmd); 387 #endif 388 #ifdef HAVE_LSTAT 389 lstat(cachedir.c_str(), &statcache); 390 #else 391 stat(cachedir.c_str(), &statcache); 392 #endif 393 } 394 395 if (S_ISDIR(statcache.st_mode)) { 396 if (depth_limit == 1) { 397 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 398 } else { 399 // max loop 5, magic start: /.ext+file 400 index_directory(cachedir + "/", url, depth_limit + 5, mime_map, sample_size); 401 if (verbose) 402 cout << "[CLEANUP '" << cachedir << "']" << endl; 403 rm_rf(cachedir); 404 } 405 } else { // no -p would be fatal here 406 cout << "cachedir '" << cachedir << "' does not exist - skipped" << endl; 407 } 408 root = oldroot; 409 } 262 410 263 411 static void 264 412 index_file(const string &file, const string &url, DirectoryIterator & d, 265 map<string, string>& mime_map, size_t sample_size) 413 map<string, string>& mime_map, size_t sample_size, 414 size_t depth_limit) 266 415 { 267 416 string ext; 268 417 const char * dot_ptr = strrchr(d.leafname(), '.'); … … index_file(const string &file, const string &url, DirectoryIterator & d, 290 439 291 440 string mimetype; 292 441 if (mt == mime_map.end()) { 442 if (strcasecmp(d.leafname(), "mbox") == 0) { 443 // Special 
filename. 444 mimetype = "message/rfc822"; 445 goto got_mimetype; 446 } 447 293 448 mimetype = d.get_magic_mimetype(); 294 449 if (mimetype.empty()) { 295 450 skip(file, "Unknown extension and unrecognised format", … … index_file(const string &file, const string &url, DirectoryIterator & d, 302 457 mimetype = mt->second; 303 458 } 304 459 460 got_mimetype: 461 305 462 if (verbose) 306 463 cout << "Indexing \"" << file.substr(root.size()) << "\" as " 307 464 << mimetype << " ... "; … … index_file(const string &file, const string &url, DirectoryIterator & d, 319 476 return; 320 477 } 321 478 322 index_mimetype(file, url, ext, mimetype, d, sample_size );479 index_mimetype(file, url, ext, mimetype, d, sample_size, mime_map, depth_limit); 323 480 } 324 481 325 482 void 326 483 index_mimetype(const string & file, const string & url, const string & ext, 327 const string &mimetype, DirectoryIterator &d, size_t sample_size) 484 const string &mimetype, DirectoryIterator &d, size_t sample_size, 485 map<string, string>& mime_map, size_t depth_limit) 328 486 { 329 487 string urlterm("U"); 330 488 urlterm += url; … … index_mimetype(const string & file, const string & url, const string & ext, 373 531 } 374 532 } 375 533 } 534 // add the db basename to cache_dir 535 { 536 cache_dir = get_tmpdir(); 537 const char *p = strrchr(dbpath.c_str(), '/'); 538 // on windows only 539 if (!p) p = strrchr(dbpath.c_str(), '\\'); 540 if (p) { p++; } else { p = dbpath.c_str(); } 541 cache_dir += p; 542 } 376 543 377 544 if (verbose) cout << flush; 378 545 … … index_mimetype(const string & file, const string & url, const string & ext, 437 604 } else { 438 605 // FIXME: What charset is the file? Look at contents? 439 606 } 607 #if 0 // FIXME: this won't work as omindex will have the database locked... 
608 } else if (mimetype == "message/rfc822") { // // => mbox2script 609 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 610 string cmd = LIBEXECDIR"mbox2omega"; 611 append_filename_argument(cmd, file); 612 cmd += error_log+"| scriptindex"; 613 append_filename_argument(cmd, dbpath); 614 cmd += " "PKGDATADIR"mbox2script.script"; 615 try { 616 dump = stdout_to_string(cmd); 617 } catch (ReadError) { 618 cout << "\"" << cmd << "\" failed - skipping" << endl; 619 return; 620 } 621 #endif 440 622 } else if (mimetype == "application/pdf") { 441 623 string cmd = "pdftotext -enc UTF-8"; 442 624 append_filename_argument(cmd, file); … … index_mimetype(const string & file, const string & url, const string & ext, 702 884 703 885 generate_sample_from_csv(dump, sample, sample_size); 704 886 } else if (mimetype == "application/vnd.ms-outlook") { 705 string cmd = get_pkglibbindir() + "/outlookmsg2html"; 887 string oldroot = root; 888 struct stat statcache; 889 char olddir[256]; 890 891 if (depth_limit == 1) { 892 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 893 return; 894 } 895 string cmd = LIBEXECDIR"outlook2text"; 706 896 append_filename_argument(cmd, file); 707 MyHtmlParser p; 708 p.ignore_metarobots(); 897 // unpack multiparts and attachments. so we have to chdir first 898 string fulldir = cache_dir+"/."+ext+url; 899 if (getcwd(olddir, 256) == NULL) { /* FIXME */ } 900 #ifdef HAVE_LSTAT 901 lstat(fulldir.c_str(), &statcache); 902 #else 903 stat(fulldir.c_str(), &statcache); 904 #endif 905 if (!S_ISDIR(statcache.st_mode)) { 906 mkdir_p(fulldir, 0755); 907 } 709 908 try { 710 dump = stdout_to_string(cmd); 711 // FIXME: what should the default charset be? 
712 p.parse_html(dump, "iso-8859-1", false); 713 } catch (const string & newcharset) { 714 p.reset(); 715 p.ignore_metarobots(); 716 p.parse_html(dump, newcharset, true); 909 if (chdir(fulldir.c_str()) < 0) { /* FIXME */ } 910 size_t new_limit = depth_limit; 911 if (new_limit) --new_limit; 912 index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size); 913 if (chdir(olddir) < 0) { /* FIXME */ } 717 914 } catch (ReadError) { 718 skip_cmd_failed(file, cmd); 719 return; 915 cout << "failed " << cmd << " << in index_cached_directory" << endl; 916 if (chdir(olddir) < 0) { /* FIXME */ } 917 root = oldroot; 918 } catch (...) { 919 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 920 if (chdir(olddir) < 0) { /* FIXME */ } 921 root = oldroot; 922 throw; 720 923 } 721 dump = p.dump; 722 title = p.title; 723 keywords = p.keywords; 724 sample = p.sample; 725 author = p.author; 924 return; 726 925 } else if (mimetype == "image/svg+xml") { 727 926 SvgParser svgparser; 728 927 svgparser.parse_html(d.file_to_string()); … … index_mimetype(const string & file, const string & url, const string & ext, 937 1136 cout << "added" << endl; 938 1137 } 939 1138 } 1139 } else if (mimetype == "application/x-zip") { 1140 string oldroot = root; 1141 if (depth_limit == 1) { 1142 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 1143 return; 1144 } 1145 // overwrite 1146 string cmd = "unzip -u -P. -o"; 1147 append_filename_argument(cmd, file); 1148 cmd += " -d"; 1149 append_filename_argument(cmd, cache_dir+"/."+ext+url+"/"); 1150 try { 1151 size_t new_limit = depth_limit; 1152 if (new_limit) --new_limit; 1153 index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size); 1154 } catch (ReadError) { 1155 cout << "failed " << cmd << " << in index_cached_directory" << endl; 1156 root = oldroot; 1157 } catch (...) 
{ 1158 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1159 root = oldroot; 1160 throw; 1161 } 1162 return; 1163 } else if (mimetype == "application/x-rar") { 1164 string oldroot = root; 1165 if (depth_limit == 1) { 1166 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 1167 return; 1168 } 1169 1170 // overwrite 1171 string cmd = "unrar x -o+"; 1172 append_filename_argument(cmd, file); 1173 append_filename_argument(cmd, cache_dir+"/."+ext+url+"/"); 1174 try { 1175 size_t new_limit = depth_limit; 1176 if (new_limit) --new_limit; 1177 index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size); 1178 } catch (ReadError) { 1179 cout << "failed " << cmd << " << in index_cached_directory" << endl; 1180 root = oldroot; 1181 } catch (...) { 1182 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1183 root = oldroot; 1184 throw; 1185 } 1186 return; 1187 } else if (mimetype == "application/vnd.ms-outlook-pst") { 1188 string oldroot = root; 1189 if (depth_limit == 1) { 1190 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 1191 return; 1192 } 1193 // unpack attachments also, together with mbox files 1194 string cmd = "readpst -r -cv -w -o"; 1195 append_filename_argument(cmd, cache_dir+"/."+ext+url+"/"); 1196 append_filename_argument(cmd, file); 1197 try { 1198 size_t new_limit = depth_limit; 1199 if (new_limit) --new_limit; 1200 index_cached_directory(new_limit, file, url, ext, cmd, mime_map, sample_size); 1201 } catch (ReadError) { 1202 root = oldroot; 1203 cout << "failed " << cmd << " << in index_cached_directory" << endl; 1204 } catch (...) { 1205 root = oldroot; 1206 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1207 throw; 1208 } 1209 return; 940 1210 } else { 941 1211 // If this were a duplicate, we'd have skipped it above. 
942 1212 db.add_document(newdocument); … … index_directory(const string &path, const string &url_, size_t depth_limit, 984 1254 break; 985 1255 } 986 1256 case DirectoryIterator::REGULAR_FILE: 987 index_file(file, url, d, mime_map, sample_size );1257 index_file(file, url, d, mime_map, sample_size, depth_limit); 988 1258 break; 989 1259 default: 990 1260 skip(file, "Not a regular file", … … main(int argc, char **argv) 1037 1307 bool overwrite = false; 1038 1308 // If delete_removed_documents is true, delete any documents we don't see. 1039 1309 bool delete_removed_documents = true; 1040 string baseurl;1041 1310 size_t depth_limit = 0; 1042 1311 size_t sample_size = SAMPLE_SIZE; 1043 1312 … … main(int argc, char **argv) 1165 1434 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 1166 1435 mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email 1167 1436 1437 // Outlook message folder: 1438 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst) 1439 1440 // Miscellaneous compound formats: 1441 mime_map["mbox"] = "message/rfc822"; // => mbox2omega 1442 mime_map["mbx"] = "message/rfc822"; // => mbox2omega 1443 #ifndef _MSC_VER 1444 mime_map["zip"] = "application/x-zip"; // recursive scanning 1445 mime_map["rar"] = "application/x-rar"; // recursive scanning 1446 #endif 1447 1168 1448 // Perl: 1169 1449 mime_map["pl"] = "text/x-perl"; 1170 1450 mime_map["pm"] = "text/x-perl"; … … main(int argc, char **argv) 1238 1518 argv[1] = const_cast<char *>("--version"); 1239 1519 } 1240 1520 1241 string dbpath;1242 1521 int getopt_ret; 1243 1522 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:im:E:", 1244 1523 longopts, NULL)) != -1) { … … main(int argc, char **argv) 1413 1692 baseurl += '/'; 1414 1693 } 1415 1694 1695 string log_dir = "./"; // FIXME: need to set log_dir to something appropriate. 
1696 error_log = " 2>>"+log_dir+"omindex-error.log"; 1697 1416 1698 if (optind >= argc || optind + 2 < argc) { 1417 1699 cerr << PROG_NAME": you must specify a directory to index.\n" 1418 1700 "Do this either as a single directory (corresponding to the base URL)\n" -
new file xapian-applications/omega/outlook2text.in
diff --git a/xapian-applications/omega/outlook2text.in b/xapian-applications/omega/outlook2text.in new file mode 100644 index 0000000..b7cf3e2
#! /bin/sh
# Convert an Outlook .msg file to mbox format and extract its attachments.
#
# Usage: outlook2text MSGFILE [OUTDIR]
#
# @MSGCONVERT@ and @MIMEEXPLODE@ are placeholders which are substituted with
# the paths of the helper scripts when this file is generated from
# outlook2text.in.
#
# If OUTDIR is given, the message parts are exploded into it.  Otherwise the
# current directory is taken to be the cache dir already, and the parts are
# exploded into a subdirectory named after MSGFILE minus its ".msg" suffix.

# "$2" must be quoted: unquoted, a missing argument collapses the test to
# `[ -n ]`, which is always true, so the fallback branch could never run.
if [ -n "$2" ]; then
  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
else
  # already is in the cache dir
  base=`basename "$1" .msg`
  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
fi
xapian-applications/omega/runfilter.cc
diff --git a/xapian-applications/omega/runfilter.cc b/xapian-applications/omega/runfilter.cc index c2a24bd..ed4f8cd 100644
a b 55 55 56 56 using namespace std; 57 57 58 extern string error_log; 59 58 60 string 59 61 stdout_to_string(const string &cmd) 60 62 { 61 63 string out; 64 string tmp = cmd; 65 tmp += error_log; 62 66 #if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT 63 67 // We want to be able to get the exit status of the child process. 64 68 signal(SIGCHLD, SIG_DFL); … … stdout_to_string(const string &cmd) 104 108 } 105 109 #endif 106 110 107 execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);111 execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL); 108 112 _exit(-1); 109 113 } 110 114 … … stdout_to_string(const string &cmd) 171 175 throw ReadError(); 172 176 } 173 177 #else 174 FILE * fh = popen( cmd.c_str(), "r");178 FILE * fh = popen(tmp.c_str(), "r"); 175 179 if (fh == NULL) throw ReadError(); 176 180 while (!feof(fh)) { 177 181 char buf[4096]; -
xapian-applications/omega/utils.cc
diff --git a/xapian-applications/omega/utils.cc b/xapian-applications/omega/utils.cc index 797c47d..92b5c76 100644
a b 23 23 24 24 #include "utils.h" 25 25 26 #include "append_filename_arg.h" 27 26 28 #include <cassert> 27 29 #include <stdio.h> // for sprintf/snprintf 28 30 #include <cstdlib> 31 #include <cstring> 32 #include "safesysstat.h" 29 33 30 34 #include <string> 31 35 32 36 using namespace std; 33 37 38 #ifdef __WIN32__ 39 #include "safewindows.h" 40 #endif 41 42 /// Remove a directory and contents. 43 void 44 rm_rf(const string &filename) 45 { 46 // Check filename exists and is actually a directory 47 struct stat sb; 48 if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return; 49 50 #ifdef __WIN32__ 51 static int win95 = -1; 52 if (win95 == -1) { 53 OSVERSIONINFO info; 54 memset(&info, 0, sizeof(OSVERSIONINFO)); 55 info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); 56 if (GetVersionEx(&info)) { 57 win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS); 58 } 59 } 60 61 string cmd = win95 ? 62 "deltree /y" : // for 95-like systems. 63 "rd /s /q"; // for NT-like systems. 64 #else 65 string cmd = "rm -rf"; 66 #endif 67 append_filename_argument(cmd, filename); 68 if (system(cmd.c_str())) { /* FIXME */ } 69 } 70 34 71 // This ought to be enough for any of the conversions below. 35 72 #define BUFSIZE 100 36 73 -
xapian-applications/omega/utils.h
diff --git a/xapian-applications/omega/utils.h b/xapian-applications/omega/utils.h index a54a4f8..b2241b7 100644
a b int string_to_int(const std::string & s); 34 34 /** Remove any leading and/or trailing whitespace from @a s. */ 35 35 void trim(std::string & s); 36 36 37 /** Remove the directory @a filename and its contents (does nothing if @a filename isn't an existing directory). */ void rm_rf(const std::string &filename); 38 37 39 #endif