Ticket #282: xapian-omega-trunk-r16058-from-ticket-285-and-cleaned-up-updated-2011-12-06.patch
File xapian-omega-trunk-r16058-from-ticket-285-and-cleaned-up-updated-2011-12-06.patch, 47.9 KB (added by , 13 years ago) |
---|
-
xapian-applications/omega/ChangeLog
diff --git a/xapian-applications/omega/ChangeLog b/xapian-applications/omega/ChangeLog index 598304a..9c0abdc 100644
a b 1 2006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com> 2 3 omega-0.9.6c: 4 * omindex.cc: Fix wrong timestamp comparison in cache logic 5 * outlook2text.in: New script 6 7 2006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com> 8 9 omega-0.9.6a: 10 * omindex.cc: Added cached virtual directories (zip,msg,pst,...). 11 Consistently log stderr to /var/log/omega/omindex-error.log. 12 1 13 Sat Oct 29 14:49:40 GMT 2011 Olly Betts <olly@survex.com> 2 14 3 15 * docs/omegascript.rst: Add note to discourage use of percentage -
xapian-applications/omega/Makefile.am
diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am index 6ee0a07..2505a5a 100644
a b pkglibbindir = $(pkglibdir)/bin 78 78 pkglibbin_PROGRAMS = omega 79 79 dist_pkglibbin_SCRIPTS = outlookmsg2html 80 80 bin_PROGRAMS = omindex scriptindex 81 dist_libexec_SCRIPTS = outlook2text mimeexplode msgconvert.pl 81 82 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 82 83 83 84 check_PROGRAMS = htmlparsetest md5test utf8converttest … … dist_man_MANS = omindex.1 scriptindex.1 160 161 MAINTAINERCLEANFILES = $(dist_man_MANS) 161 162 endif 162 163 164 CLEANFILES = outlook2text 165 166 outlook2text: $(srcdir)/outlook2text.in Makefile 167 sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@ 168 163 169 if DOCUMENTATION_RULES 164 170 omindex.1: omindex$(EXEEXT) makemanpage 165 171 ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1 -
new file xapian-applications/omega/mimeexplode
diff --git a/xapian-applications/omega/mimeexplode b/xapian-applications/omega/mimeexplode new file mode 100644 index 0000000..70743ab
#!/usr/bin/perl -w

=head1 NAME

mimeexplode - explode one or more MIME messages

=head1 SYNOPSIS

    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...

    someprocess | mimeexplode -

=head1 DESCRIPTION

Takes one or more files from the command line that contain MIME
messages, and explodes their contents out into subdirectories.
A file name of C<-> (or no file name at all) reads the message from
standard input.

Without C<-d> the subdirectories are created in the current working
directory and are called C<msg0>, C<msg1>, C<msg2>, etc.  With
C<-d dir> the first message is exploded into C<dir>, the second into
C<dir1>, the third into C<dir2>, etc.  Missing directories are created;
existing directories are reused.

One line is written to stdout for every extracted part, like this:

        Part: msg0/filename-1.dat (text/plain)
        Part: msg0/filename-2.dat (text/plain)
        Part: msg1/dir.gif (image/gif)
        Part: msg1/face.jpg (image/jpeg)
        Part: msg2/filename-1.dat (text/plain)

A message that cannot be parsed is reported on stderr and skipped.

This was written as an example of the MIME:: modules in the
MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
way of splitting a MIME message if you need to decode something, and
you don't have a MIME mail reader on hand.

=head1 COMMAND LINE OPTIONS

    -d outdir

=head1 AUTHOR

Eryq C<eryq@zeegee.com>, in a big hurry...
Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir

=cut

#BEGIN { unshift @INC, ".." }     # to test MIME:: stuff before installing it!

require 5.006;    # lexical filehandles and three-argument open

use strict;

use MIME::Parser;
use Getopt::Std;

#------------------------------------------------------------
# dump_entity - dump an entity's file info
#
# Recurses through multipart entities and prints one "Part:" line
# (path on disk and MIME type) for every leaf part.
#------------------------------------------------------------
sub dump_entity {
    my ($ent) = @_;
    my @parts = $ent->parts;

    if (@parts) {        # multipart...
        dump_entity($_) for @parts;
        return;
    }

    # single part...
    # A leaf without a body handle, or with a body that is not stored in
    # a file, has no path to report; skip it instead of dying on undef.
    my $body = $ent->bodyhandle;
    my $path = $body ? $body->path : undef;
    return unless defined $path;

    # NOTE(review): the width of the indent before "Part:" was collapsed
    # in the copy this was reconstructed from; four spaces assumed.
    print "    Part: ", $path,
          " (", scalar($ent->head->mime_type), ")\n";
}

#------------------------------------------------------------
# outdir_for - name of the output directory for the Nth message
#
# $dir is the value of -d (undef or '' when not given) and $index the
# zero-based position of the message on the command line.
#------------------------------------------------------------
sub outdir_for {
    my ($dir, $index) = @_;

    # Default naming: msg0, msg1, msg2, ...
    return "msg$index" unless defined $dir && length $dir;

    # With -d: dir, dir1, dir2, ... so that a single message always ends
    # up in exactly the directory that was asked for.
    return $index ? $dir . $index : $dir;
}

#------------------------------------------------------------
# main
#------------------------------------------------------------
sub main {
    my %opts;
    getopts('d:', \%opts)
        or die "usage: mimeexplode [-d <dir>] [<mime-msg-file> ...]\n";

    # Go through messages:
    @ARGV or unshift @ARGV, "-";
    my $index = 0;
    for my $file (@ARGV) {
        my $outdir = outdir_for($opts{d}, $index++);

        # Sanity:
        (-d $outdir) or mkdir($outdir, 0755)
            or die "couldn't create $outdir: $!\n";
        (-w $outdir) or die "cwd $outdir not writable!\n";

        # Create a new parser object:
        my $parser = MIME::Parser->new;

        # Set up where the extracted documents are written:
        $parser->output_dir($outdir);

        # Open the input stream.  "-" is handled explicitly because
        # three-argument open (used so that odd file names cannot be
        # interpreted as a mode or a command) does not treat it as stdin.
        my $fh;
        if ($file eq '-') {
            $fh = \*STDIN;
        }
        else {
            open($fh, '<', $file) or die "couldn't open $file: $!\n";
        }

        # Parse it; the parser may signal failure by dying or by
        # returning false, so cover both.
        my $entity = eval { $parser->parse($fh) };
        unless ($entity) {
            print STDERR "Couldn't parse MIME in $file; continuing...\n";
            (my $why = $@) =~ s/\s+\z//;
            print STDERR "    ($why)\n" if length $why;
        }
        close $fh unless $file eq '-';

        # Congratulations: you now have a (possibly multipart) MIME entity!
        dump_entity($entity) if $entity;
    }
    return 1;
}

exit(main() ? 0 : 1);
new file xapian-applications/omega/msgconvert.pl
diff --git a/xapian-applications/omega/msgconvert.pl b/xapian-applications/omega/msgconvert.pl new file mode 100644 index 0000000..cf32079
- + 1 #!/usr/bin/perl -w 2 # 3 # msgconvert.pl: 4 # 5 # Convert .MSG files (made by Outlook (Express)) to multipart MIME messages. 6 # 7 # Copyright 2002, 2004, 2006 Matijs van Zuijlen 8 # 9 # This program is free software; you can redistribute it and/or modify it 10 # under the terms of the GNU General Public License as published by the 11 # Free Software Foundation; either version 2 of the License, or (at your 12 # option) any later version. 13 # 14 # This program is distributed in the hope that it will be useful, but 15 # WITHOUT ANY WARRANTY; without even the implied warranty of 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 # Public License for more details. 18 # 19 # CHANGES: 20 # 20020715 Recognize new items 'Cc', mime type of attachment, long 21 # filename of attachment, and full headers. Attachments turn out 22 # to be numbered, so a regexp is now used to recognize label of 23 # items that are attachments. 24 # 20020831 long file name will definitely be used if present. Full headers 25 # and mime type information are used when present. Created 26 # generic system for specifying known items to be skipped. 27 # Unexpected contents is never reason to bail out anymore. Added 28 # support for usage message and option processing (--verbose). 29 # 20040104 Handle address data slightly better, make From line less fake, 30 # make $verbose and $skippable_entries global vars, handle HTML 31 # variant of body text if present (though not optimally). 32 # 20040214 Fix typos and incorrect comments. 33 # 20040307 - Complete rewrite: All functional parts are now in the package 34 # MSGParser; 35 # - Creation of MIME::Entity object is delayed until the output 36 # routines, which means all data is known; This means I can 37 # create a multipart/alternative body. 38 # - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for 39 # the information). 
40 # 20040514 Check if $self->{HEAD} actually exists before trying to add its 41 # contents to the output Mime object's header data. 42 # (Bug reported by Thomas Ng). 43 # Don't produce multipart messages if not needed. 44 # (Bug reported by Justin B. Scout). 45 # 20040529 Correctly format OLEDATE. 46 # 20040530 - Extract date from property 0047 (thanks, Marc Goodman). 47 # - Use address data to make To: and Cc: lines complete 48 # - Use the in-reply-to property 49 # - More unknown properties named. 50 # - Found another property containing an SMTP address. 51 # - Put non-SMTP type addresses back in output. 52 # 20040825 Replace 'our' to declare globals with 'use vars'. This means 53 # the globals are now properly scoped inside the package and not 54 # the file. 55 # This also fixes the bug that this program did not work on perl 56 # versions below 5.6. (Bug reported by Tim Gustafson) 57 # 20060218 More sensible encoding warnings. 58 # 20060219 Move OLE parsing to main program. 59 # Parse nested MSG files (Bug reported by Christof Lukas). 60 # 20060225 Simplify code. 61 # 62 63 # 64 # Import modules. 
#
package MSGParser;
use strict;
use OLE::Storage_Lite;
use MIME::Entity;
use MIME::Parser;
use Date::Format;
use POSIX qw(mktime);
use constant DIR_TYPE => 1;
use constant FILE_TYPE => 2;

use vars qw($skipproperties $skipheaders);
#
# Descriptions partially based on mapitags.h
#
# Properties listed here are known but deliberately not converted; they
# are only mentioned (in verbose mode) by _UnknownFile.
#
$skipproperties = {
  # Envelope properties
  '000B' => "Conversation key?",
  '001A' => "Type of message",
  '003B' => "Sender address variant",
  '003D' => "Contains 'Re: '",
  '003F' => "'recieved by' id",
  '0040' => "'recieved by' name",
  '0041' => "Sender variant address id",
  '0042' => "Sender variant name",
  '0043' => "'recieved representing' id",
  '0044' => "'recieved representing' name",
  '0046' => "Read receipt address id",
  '0051' => "'recieved by' search key",
  '0052' => "'recieved representing' search key",
  '0053' => "Read receipt search key",
  '0064' => "Sender variant address type",
  '0065' => "Sender variant address",
  '0070' => "Conversation topic",
  '0071' => "Conversation index",
  '0075' => "'recieved by' address type",
  '0076' => "'recieved by' email address",
  '0077' => "'recieved representing' address type",
  '0078' => "'recieved representing' email address",
  '007F' => "something like a message id",
  # Recipient properties
  '0C19' => "Reply address variant",
  '0C1D' => "Reply address variant",
  '0C1E' => "Reply address type",
  # Non-transmittable properties
  '0E02' => "?Should BCC be displayed",
  '0E0A' => "sent mail id",
  '0E1D' => "Subject w/o Re",
  '0E27' => "64 bytes: Unknown",
  '0FF6' => "Index",
  '0FF9' => "Index",
  '0FFF' => "Address variant",
  # Content properties
  '1008' => "Summary or something",
  '1009' => "RTF Compressed",
  # 'Common property'
  '3001' => "Display name",
  '3002' => "Address Type",
  '300B' => "'Search key'",
  # Attachment properties
  '3702' => "Attachment encoding",
  '3703' => "Attachment extension",
  '3709' => "'Attachment rendering'", # Maybe an icon or something?
  '3713' => "Icon URL?",
  # 'Mail user'
  '3A20' => "Address variant",
  # 3900 -- 39FF: 'Address book'
  '39FF' => "7 bit display name",
  # 'Display table properties'
  '3FF8' => "Routing data?",
  '3FF9' => "Routing data?",
  '3FFA' => "Routing data?",
  '3FFB' => "Routing data?",
  # 'Transport-defined envelope property'
  '4029' => "Sender variant address type",
  '402A' => "Sender variant address",
  '402B' => "Sender variant name",
  '5FF6' => "Recipient name",
  '5FF7' => "Recipient address variant",
  # 'Provider-defined internal non-transmittable property'
  '6740' => "Unknown, binary data",
  # User defined id's
  '8000' => "Content Class",
  '8002' => "Unknown, binary data",
};

# Headers from the original message that are not copied to the output.
$skipheaders = {
  "MIME-Version" => 1,
  "Content-Type" => 1,
  "Content-Transfer-Encoding" => 1,
  "X-Mailer" => 1,
  "X-Msgconvert" => 1,
  "X-MS-Tnef-Correlator" => 1,
  "X-MS-Has-Attach" => 1,
};

use constant ENCODING_UNICODE => '001F';
use constant KNOWN_ENCODINGS => {
  '000D' => 'Directory',
  '001F' => 'Unicode',
  '001E' => 'Ascii?',
  '0102' => 'Binary',
};

# The MAP_* tables map a property id to [ key to store under, flag ];
# a true flag makes _MapProperty strip NUL bytes from the data.
use constant MAP_ATTACHMENT_FILE => {
  '3701' => ["DATA",        0], # Data
  '3704' => ["SHORTNAME",   1], # Short file name
  '3707' => ["LONGNAME",    1], # Long file name
  '370E' => ["MIMETYPE",    1], # mime type
  '3716' => ["DISPOSITION", 1], # disposition
};

use constant MAP_SUBITEM_FILE => {
  '1000' => ["BODY_PLAIN",      0], # Body
  '1013' => ["BODY_HTML",       0], # HTML Version of body
  '0037' => ["SUBJECT",         1], # Subject
  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
  '007D' => ["HEAD",            1], # Full headers
  '0C1A' => ["FROM",            1], # Reply-To: Name
  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
  '0E04' => ["TO",              1], # To: Names
  '0E03' => ["CC",              1], # Cc: Names
  '1035' => ["MESSAGEID",       1], # Message-Id
  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
};

use constant MAP_ADDRESSITEM_FILE => {
  '3001' => ["NAME",            1], # Real name
  '3002' => ["TYPE",            1], # Address type
  '403D' => ["TYPE",            1], # Address type
  '3003' => ["ADDRESS",         1], # Address
  '403E' => ["ADDRESS",         1], # Address
  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
};

#
# Main body of module
#

# Constructor. May be called on the class or on an existing object, in
# which case a fresh object of the same class is returned.
sub new {
  my $that = shift;
  my $class = ref $that || $that;

  my $self = {
    ATTACHMENTS => [],
    ADDRESSES => [],
    VERBOSE => 0,
    HAS_UNICODE => 0,
    FROM_ADDR_TYPE => "",
  };
  return bless $self, $class;
}

#
# Main sub: parse the PPS tree, and return
#
sub parse {
  my $self = shift;
  my $PPS = shift or die "Internal error: No PPS tree";
  $self->_RootDir($PPS);
}

# Build and return a MIME::Entity for the parsed message: multipart when
# there are attachments or both a plain and an HTML body, single part
# otherwise. Header data is added afterwards in all cases.
sub mime_object {
  my $self = shift;

  my $bodymime;
  my $mime;

  if ($self->_IsMultiPart) {
    # Construct a multipart message object

    $mime = MIME::Entity->build(Type => "multipart/mixed");

    # Set the entity that we'll save the body parts to. If there's more than
    # one part, it's a new entity, otherwise, it's the main $mime object.
    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
      $bodymime = MIME::Entity->build(
	Type => "multipart/alternative",
	Encoding => "8bit",
      );
      $mime->add_part($bodymime);
    } else {
      $bodymime = $mime;
    }
    if ($self->{BODY_PLAIN}) {
      $self->_SaveAttachment($bodymime, {
	MIMETYPE => 'text/plain; charset=ISO-8859-1',
	ENCODING => '8bit',
	DATA => $self->{BODY_PLAIN},
	DISPOSITION => 'inline',
      });
    }
    if ($self->{BODY_HTML}) {
      $self->_SaveAttachment($bodymime, {
	MIMETYPE => 'text/html',
	ENCODING => '8bit',
	DATA => $self->{BODY_HTML},
	DISPOSITION => 'inline',
      });
    }
    foreach my $att (@{$self->{ATTACHMENTS}}) {
      $self->_SaveAttachment($mime, $att);
    }
  } elsif ($self->{BODY_PLAIN}) {
    # Construct a single part message object with a plain text body
    $mime = MIME::Entity->build(
      Type => "text/plain",
      Data => $self->{BODY_PLAIN}
    );
  } elsif ($self->{BODY_HTML}) {
    # Construct a single part message object with an HTML body
    $mime = MIME::Entity->build(
      Type => "text/html",
      Data => $self->{BODY_HTML}
    );
  } else {
    # No body and no attachments at all: still produce a (empty) message
    # so the header code below and our callers have an object to work on.
    $mime = MIME::Entity->build(
      Type => "text/plain",
      Data => ""
    );
  }

  $self->_CopyHeaderData($mime);

  $self->_SetHeaderFields($mime);

  return $mime;
}

# Actually output the message in mbox format
sub print {
  my $self = shift;

  my $mime = $self->mime_object;

  # Construct From line from whatever we know; fall back to a placeholder
  # when there is no usable SMTP sender address.
  my $sender = 'someone@somewhere';
  my $type = defined $self->{FROM_ADDR_TYPE} ? $self->{FROM_ADDR_TYPE} : "";
  if ($type eq "SMTP" and defined $self->{FROM_ADDR}) {
    (my $addr = $self->{FROM_ADDR}) =~ s/\n//g;
    $sender = $addr if length $addr;
  }

  # The date used here is not really important.
  print "From ", $sender, " ", scalar localtime, "\n";
  $mime->print(\*STDOUT);
  print "\n";
}

# Set the verbosity level; a true value enables "Skipping ..." messages.
sub set_verbosity {
  my ($self, $verbosity) = @_;
  defined $verbosity or die "Internal error: no verbosity level";
  $self->{VERBOSE} = $verbosity;
}

#
# Below are functions that walk the PPS tree. The *Dir functions handle
# processing the directory nodes of the tree (mainly, iterating over the
# children), whereas the *Item functions handle processing the items in the
# directory (if such an item is itself a directory, it will in turn be
# processed by the relevant *Dir function).
#

#
# RootDir: Check Root Entry, parse sub-entries.
# The OLE file consists of a single entry called Root Entry, which has
# several children. These children are parsed in the sub SubItem.
#
sub _RootDir {
  my ($self, $PPS) = @_;

  foreach my $child (@{$PPS->{Child}}) {
    $self->_SubItem($child);
  }
}

# Dispatch one child of the root entry on its type (directory or file).
sub _SubItem {
  my ($self, $PPS) = @_;

  if ($PPS->{Type} == DIR_TYPE) {
    $self->_SubItemDir($PPS);
  } elsif ($PPS->{Type} == FILE_TYPE) {
    $self->_SubItemFile($PPS);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Handle a directory below the root: a recipient, an attachment, or
# something we do not know.
sub _SubItemDir {
  my ($self, $PPS) = @_;

  $self->_GetOLEDate($PPS);

  # Non-word characters in the name have been replaced by spaces here,
  # which is why the patterns contain blanks.
  my $name = $self->_GetName($PPS);

  if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
    $self->_AddressDir($PPS);
  } elsif ($name =~ /__attach_version1 0_ /) { # Attachment
    $self->_AttachmentDir($PPS);
  } else {
    $self->_UnknownDir($name);
  }
}

# Handle a file below the root: store it on the parser object itself if
# it is one of the properties in MAP_SUBITEM_FILE.
sub _SubItemFile {
  my ($self, $PPS) = @_;

  my $name = $self->_GetName($PPS);
  my ($property, $encoding) = $self->_ParseItemName($name);

  $self->_MapProperty($self, $PPS->{Data}, $property,
    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
}

# Collect the properties of one recipient and append them to ADDRESSES.
sub _AddressDir {
  my ($self, $PPS) = @_;

  my $address = {
    NAME	=> undef,
    ADDRESS	=> undef,
    TYPE	=> "",
  };
  foreach my $child (@{$PPS->{Child}}) {
    $self->_AddressItem($child, $address);
  }
  push @{$self->{ADDRESSES}}, $address;
}

# Store one entry of a recipient directory into $addr_info.
sub _AddressItem {
  my ($self, $PPS, $addr_info) = @_;

  my $name = $self->_GetName($PPS);

  # DIR Entries: There should be none.
  if ($PPS->{Type} == DIR_TYPE) {
    $self->_UnknownDir($name);
  } elsif ($PPS->{Type} == FILE_TYPE) {
    my ($property, $encoding) = $self->_ParseItemName($name);
    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Collect the properties of one attachment and append them to ATTACHMENTS.
sub _AttachmentDir {
  my ($self, $PPS) = @_;

  my $attachment = {
    SHORTNAME	=> undef,
    LONGNAME	=> undef,
    MIMETYPE	=> 'application/octet-stream',
    ENCODING	=> 'base64',
    DISPOSITION	=> 'attachment',
    DATA	=> undef
  };
  foreach my $child (@{$PPS->{Child}}) {
    $self->_AttachmentItem($child, $attachment);
  }
  push @{$self->{ATTACHMENTS}}, $attachment;
}

# Store one entry of an attachment directory into $att_info. A directory
# with property 3701 is a nested MSG file, which is converted recursively
# and attached as message/rfc822.
sub _AttachmentItem {
  my ($self, $PPS, $att_info) = @_;

  my $name = $self->_GetName($PPS);

  my ($property, $encoding) = $self->_ParseItemName($name);

  if ($PPS->{Type} == DIR_TYPE) {

    if (defined $property and $property eq '3701') {	# Nested MSG file
      my $msgp = $self->new();
      # Let the nested parser be as talkative as we are.
      $msgp->set_verbosity($self->{VERBOSE});
      $msgp->parse($PPS);
      my $data = $msgp->mime_object->as_string;
      $att_info->{DATA} = $data;
      $att_info->{MIMETYPE} = 'message/rfc822';
      $att_info->{ENCODING} = '8bit';
    } else {
      $self->_UnknownDir($name);
    }

  } elsif ($PPS->{Type} == FILE_TYPE) {
    $self->_MapProperty($att_info, $PPS->{Data}, $property,
      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Store $data in $hash under the key that $map assigns to $property.
# Returns 1 if the property is in the map, 0 otherwise (including when
# $property is undefined).
sub _MapProperty {
  my ($self, $hash, $data, $property, $map) = @_;

  defined $property or return 0;
  my $arr = $map->{$property} or return 0;

  # Second element of the map entry: strip NUL bytes from the data.
  $data =~ s/\000//g if $arr->[1] and defined $data;
  $hash->{$arr->[0]} = $data;

  return 1;
}

# Report a directory entry that is not converted.
sub _UnknownDir {
  my ($self, $name) = @_;

  if ($name eq '__nameid_version1 0') {
    $self->{VERBOSE}
      and warn "Skipping DIR entry $name (Introductory stuff)\n";
    return;
  }
  warn "Unknown DIR entry $name\n";
}

# Report a file entry that is not converted. Known-but-skipped properties
# are only mentioned in verbose mode; anything else always warns.
sub _UnknownFile {
  my ($self, $name) = @_;

  if ($name eq '__properties_version1 0') {
    $self->{VERBOSE}
      and warn "Skipping FILE entry $name (Properties)\n";
    return;
  }

  my ($property, $encoding) = $self->_ParseItemName($name);
  unless (defined $property) {
    warn "Unknown FILE entry $name\n";
    return;
  }
  if ($skipproperties->{$property}) {
    $self->{VERBOSE}
      and warn "Skipping property $property ($skipproperties->{$property})\n";
    return;
  } elsif ($property =~ /^80/) {
    $self->{VERBOSE}
      and warn "Skipping property $property (user-defined property)\n";
    return;
  } else {
    warn "Unknown property $property\n";
    return;
  }
}

#
# Helper functions
#

# Name of a PPS entry, converted from UCS2 and with every non-word
# character replaced by a space.
sub _GetName {
  my ($self, $PPS) = @_;
  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
}

sub _NormalizeWhiteSpace {
  my ($self, $name) = @_;
  $name =~ s/\W/ /g;
  return $name;
}

# Remember the date of the first directory entry that carries one
# (Time2nd preferred over Time1st); later entries do not overwrite it.
sub _GetOLEDate {
  my ($self, $PPS) = @_;
  unless (defined ($self->{OLEDATE})) {
    # Make Date
    my $datearr;
    $datearr = $PPS->{Time2nd};
    $datearr = $PPS->{Time1st} unless($datearr);
    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
  }
}

# Format a [sec, min, hour, mday, mon, year] array (mktime order) as a
# mail Date: value. Returns undef when mktime cannot represent the time.
# Always returns exactly one value, because callers use the result inside
# argument lists.
sub _FormatDate {
  my ($self, $datearr) = @_;

  # TODO: This is a little convoluted. Directly using strftime didn't seem
  # to work.
  my $datetime = mktime(@$datearr);
  return undef unless defined $datetime;
  return time2str("%a, %d %h %Y %X %z", $datetime);
}

# If we didn't get the date from the original header data, we may be able
# to get it from the SUBMISSION_ID:
# It seems to have the format of a semicolon-separated list of key=value
# pairs. The key l has a value with the format:
# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
# the format YYMMDDHHMMSS.
# Returns the formatted submission date, or undef when SUBMISSION_ID is
# missing or does not have the expected shape. Always returns exactly one
# value, because the result is used inside an argument list.
sub _SubmissionIdDate {
  my $self = shift;

  my $submission_id = $self->{SUBMISSION_ID} or return undef;
  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
    or return undef;
  my ($yy, $mon, $day, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);

  # mktime wants years since 1900. Two-digit years below 70 are taken to
  # be 20xx, the rest 19xx (the previous pivot of 20 turned 2020 and later
  # into 1920 and later).
  my $year = $yy < 70 ? $yy + 100 : $yy + 0;

  # NOTE(review): the trailing "Z" suggests the stamp is UTC, but
  # _FormatDate goes through mktime, i.e. local time - confirm.
  return $self->_FormatDate([$sec, $min, $hour, $day, $mon - 1, $year]);
}

# Split an item name of the form "__substg1 0_PPPPEEEE" into its property
# id and encoding id. Warns (once per parser) about Unicode fields and
# about unknown encodings. Returns (undef, undef) for any other name.
sub _ParseItemName {
  my ($self, $name) = @_;

  if ($name =~ /^__substg1 0_(....)(....)$/) {
    my ($property, $encoding) = ($1, $2);
    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
      warn "This MSG file contains Unicode fields." 
	. " This is currently unsupported.\n";
      $self->{HAS_UNICODE} = 1;
    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
    }
    return ($property, $encoding);
  } else {
    return (undef, undef);
  }
}

# Attach one part (body variant or attachment) described by the hash $att
# to the entity $mime.
sub _SaveAttachment {
  my ($self, $mime, $att) = @_;

  my $ent = $mime->attach(
    Type => $att->{MIMETYPE},
    Encoding => $att->{ENCODING},
    Data => [],
    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
    Disposition => $att->{DISPOSITION}
  );

  my $handle;
  if ($handle = $ent->open("w")) {
    # An attachment directory without a data item leaves DATA undefined;
    # write an empty part rather than printing undef.
    $handle->print(defined $att->{DATA} ? $att->{DATA} : "");
    $handle->close;
  } else {
    warn "Could not write data!";
  }
}

# FIXME(review): nothing in this file calls this sub, and it indexes
# $self->{ADDRESSES} as a hash although new() and _AddressDir treat it as
# an array - calling it as-is would die under "use strict". Left
# unchanged because the intended lookup cannot be established from here.
sub _SetAddressPart {
  my ($self, $adrname, $partname, $data) = @_;

  my $address = $self->{ADDRESSES}->{$adrname};
  $data =~ s/\000//g;
  #warn "Processing address data part $partname : $data\n";
  if (defined ($address->{$partname})) {
    if ($address->{$partname} eq $data) {
      warn "Skipping duplicate but identical address information for"
      . " $partname\n" if $self->{VERBOSE};
    } else {
      warn "Address information $partname inconsistent:\n";
      warn "    Original data: $address->{$partname}\n";
      warn "    New data: $data\n";
    }
  } else {
    $address->{$partname} = $data;
  }
}

# Set a header field, unless it already has a value or $value is false.
sub _AddHeaderField {
  my ($self, $mime, $fieldname, $value) = @_;

  my $oldvalue = $mime->head->get($fieldname);
  return if $oldvalue;
  $mime->head->add($fieldname, $value) if $value;
}

# "Name <address>" built from $self->{$tag} and $self->{$tag . "_ADDR"}.
# Returns the empty string when neither is known, so that _AddHeaderField
# skips the header instead of emitting a bare " <>".
sub _Address {
  my ($self, $tag) = @_;
  my $name = $self->{$tag} || "";
  my $address = $self->{$tag . "_ADDR"} || "";
  return "" unless length($name) or length($address);
  return "$name <$address>";
}

# Find SMTP addresses for the given list of names.
# $names is a "; "-separated string; the result is a ", "-separated string
# in which every name known from ADDRESSES is followed by its address in
# angle brackets. Returns the empty string when $names is undefined.
sub _ExpandAddressList {
  my ($self, $names) = @_;

  # The message may simply have no To: or Cc: names.
  defined $names or return "";

  my $addresspool = $self->{ADDRESSES};
  my @namelist = split /; */, $names;
  my @result;
  NAME: foreach my $name (@namelist) {
    foreach my $address (@$addresspool) {
      # A recipient directory need not contain a display name.
      next unless defined $address->{NAME};
      if ($name eq $address->{NAME}) {
	my $addresstext = $address->{NAME} . " <";
	if (defined ($address->{SMTPADDRESS})) {
	  $addresstext .= $address->{SMTPADDRESS};
	} elsif (defined $address->{TYPE} and $address->{TYPE} eq "SMTP"
	    and defined $address->{ADDRESS}) {
	  $addresstext .= $address->{ADDRESS};
	}
	$addresstext .= ">";
	push @result, $addresstext;
	next NAME;
      }
    }
    push @result, $name;
  }
  return join ", ", @result;
}

# Parse the full header text of the original message, if we got that.
# Returns the unfolded header object, or nothing when there is no header
# data or it cannot be parsed.
sub _ParseHead {
  my ($self, $data) = @_;
  defined $data or return;
  my $parser = MIME::Parser->new();
  $parser->output_to_core(1);
  $parser->decode_headers(1);
  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;

  # The parser may signal failure by dying or by returning false; in
  # either case there is no entity to take a head from.
  my $entity = eval { $parser->parse_data($data) };
  unless ($entity) {
    warn "Couldn't parse full headers!";
    return;
  }
  my $head = $entity->head;
  $head->unfold;
  return $head;
}

# Find out if we need to construct a multipart message
sub _IsMultiPart {
  my $self = shift;

  return (
    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
      or @{$self->{ATTACHMENTS}}>0
  );
}

# Copy original header data, except for the headers in $skipheaders.
# Note: This should contain the Date: header.
sub _CopyHeaderData {
  my ($self, $mime) = @_;

  my $head = $self->_ParseHead($self->{HEAD}) or return;

  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
    foreach my $value ($head->get_all($tag)) {
      $mime->head->add($tag, $value);
    }
  }
}

# Set header fields that were not already copied from the original
# header data.
sub _SetHeaderFields {
  my ($self, $mime) = @_;

  # If we didn't get the date from the original header data, we may be able
  # to get it from the SUBMISSION_ID:
  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());

  # Third and last chance to set the Date: header; this uses the date the
  # MSG file was saved.
  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
}

package main;
use Getopt::Long;
use Pod::Usage;

# Setup command line processing.
my $verbose = '';
my $help = '';	    # Print help message and exit.
GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
pod2usage(1) if $help;

# Get file name
my $file = $ARGV[0];
defined $file or pod2usage(2);
warn "Will parse file: $file\n" if $verbose;

# Report a missing or unreadable file as such, instead of letting it
# surface below as "must be an OLE file".
-r $file or die "Cannot read $file: $!\n";

# Load and parse MSG file (is OLE)
my $Msg = OLE::Storage_Lite->new($file);
my $PPS = $Msg->getPpsTree(1);
$PPS or die "$file must be an OLE file";

# parse PPS tree
my $parser = MSGParser->new();
$parser->set_verbosity(1) if $verbose;
$parser->parse($PPS);
$parser->print();

#
# Usage info follows.
#
__END__

=head1 NAME

msgconvert.pl - Convert Outlook .msg files to mbox format

=head1 SYNOPSIS

msgconvert.pl [options] <file.msg>

  Options:
    --verbose	be verbose
    --help	help message

=head1 OPTIONS

=over 8

=item B<--verbose>

    Print information about skipped parts of the .msg file.

=item B<--help>

    Print a brief help message.

=back

=head1 DESCRIPTION

This program will output the message contained in file.msg in mbox format
on stdout. It will complain about unrecognized OLE parts on
stderr.

=head1 BUGS

Not all data that's in the .MSG file is converted. There simply are some
parts whose meaning escapes me. One of these must contain the date the
message was sent, for example. Formatting of text messages will also be
lost. YMMV.

=cut
xapian-applications/omega/omindex.cc
diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 5d91036..90224fb 100644
a b 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts 7 7 * Copyright 2009 Frank J Bruzzaniti 8 * Copyright 2006,2007,2008 AVL List GesmbH 8 9 * 9 10 * This program is free software; you can redistribute it and/or 10 11 * modify it under the terms of the GNU General Public License as … … 65 66 66 67 #include "gnu_getopt.h" 67 68 69 #ifndef LIBEXECDIR 70 // must have ending slash 71 //# define LIBEXECDIR "/usr/lib/omega/bin/" 72 # define LIBEXECDIR "" 73 #endif 74 #ifndef PKGDATADIR 75 // must have ending slash 76 # define PKGDATADIR "/usr/share/omega/" 77 #endif 78 68 79 using namespace std; 69 80 70 81 #define TITLE_SIZE 128 … … static bool follow_symlinks = false; 78 89 static bool ignore_exclusions = false; 79 90 static bool spelling = false; 80 91 static bool verbose = false; 92 string error_log; /* used in runfilter.cc */ 93 static string baseurl; 94 static string dbpath; 95 static string cache_dir; 81 96 static enum { 82 97 EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP 83 98 } empty_body = EMPTY_BODY_WARN; … … static time_t last_mod_max; 98 113 // text are common, so we handle these with a std::map. 
99 114 static map<string, string> commands; 100 115 116 static void 117 index_directory(const string &path, const string &url_, size_t depth_limit, 118 map<string, string>& mime_map); 119 101 120 inline static bool 102 121 p_notalnum(unsigned int c) 103 122 { … … void 310 329 index_mimetype(const string & file, const string & url, const string & ext, 311 330 const string &mimetype, DirectoryIterator &d); 312 331 332 static 333 void mkdir_p(const string &path, mode_t mode) { 334 (void)mode; // FIXME: mode is currently ignored; the directory is created by shelling out to mkdir 335 #ifdef __WIN32__ 336 system(("mkdir \"" + shell_protect(path) + "\"").c_str()); 337 #else 338 if (system(("mkdir -p " + shell_protect(path)).c_str()) < 0) { /* FIXME: a failing mkdir command is silently ignored */ } 339 #endif 340 } 341 342 /* 343 * Unpack a container file (.msg/.pst/.rar/.zip) into the local cache dir by running cmd, then recurse into the unpacked tree with index_directory(). The global root is set to cache_dir for the duration and restored on every normal return path; callers restore it themselves when an exception propagates. 344 */ 345 static void 346 index_cached_directory(size_t depth_limit, 347 const string &file, 348 const string &url, 349 const string &ext, 350 const string &cmd, 351 map<string, string>& mime_map) 352 { 353 string oldroot = root; 354 root = cache_dir; 355 string cache = root+"/."+ext; 356 string cachedir = cache+url; 357 struct stat statfile, statcache; 358 bool extract_cache; 359 #ifdef HAVE_LSTAT 360 lstat(file.c_str(), &statfile); 361 lstat(cachedir.c_str(), &statcache); 362 #else 363 stat(file.c_str(), &statfile); 364 stat(cachedir.c_str(), &statcache); 365 #endif 366 extract_cache = true; 367 // NOTE(review): the lstat()/stat() return values above are not checked, so statcache is uninitialised when cachedir does not exist yet - confirm. If cachedir exists, the file is older than cachedir, and cachedir was last modified more than 500 seconds ago (the code tests time(NULL)-500, not 5 seconds), 368 // then it was already extracted. 369 if (S_ISDIR(statcache.st_mode) 370 && S_ISREG(statfile.st_mode) 371 && (statfile.st_mtime < statcache.st_mtime) 372 && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call 373 { 374 // but is it in the database also? 
prevent from deleting skipped files 375 if (verbose) 376 cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction " 377 // << statfile.st_mtime << " < " << statcache.st_mtime 378 << endl; 379 extract_cache = false; 380 } 381 if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) { 382 // If last_mod > last_mod_max, we know for sure that the file is new 383 // or updated. 384 if (statfile.st_mtime <= last_mod_max) { 385 // Check the database timestamp for the cached container, especially for caches that have been cleaned up. 386 // If it is already in the db and not newer, we need not extract it again. 387 string urlterm("U"); 388 urlterm += baseurl; 389 urlterm += "/."+ext+url; 390 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 391 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 392 393 Xapian::PostingIterator p = db.postlist_begin(urlterm); 394 if (p != db.postlist_end(urlterm)) { 395 Xapian::docid docid = *p; 396 Xapian::Document doc = db.get_document(docid); 397 string value = doc.get_value(VALUE_LASTMOD); 398 time_t old_last_mod = binary_string_to_int(value); 399 if (statfile.st_mtime <= old_last_mod) { 400 if (verbose) 401 cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl; 402 // The docid should be in updated - the only valid 403 // exception is if the URL was long and hashed to the 404 // same URL as an existing document indexed in the same 405 // batch. 
406 if (usual(docid < updated.size() && !updated[docid])) { 407 updated[docid] = true; 408 --old_docs_not_seen; 409 } 410 root = oldroot; 411 return; 412 } 413 } 414 } 415 } 416 417 if (extract_cache) { 418 if (verbose) 419 cout << "[EXTRACT into cache " << cachedir << "]" << endl; 420 if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode)) 421 cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" " 422 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL) 423 << endl; 424 if (!S_ISDIR(statcache.st_mode)) 425 mkdir_p(cachedir, 0755); 426 stdout_to_string(cmd); 427 #ifndef __WIN32__ 428 stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir)); 429 #endif 430 #ifdef HAVE_LSTAT 431 lstat(cachedir.c_str(), &statcache); 432 #else 433 stat(cachedir.c_str(), &statcache); 434 #endif 435 } 436 437 if (S_ISDIR(statcache.st_mode)) { 438 if (depth_limit == 1) { 439 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 440 } else { 441 // max loop 5, magic start: /.ext+file. NOTE(review): this passes depth_limit + 5, i.e. a larger limit than the one received, and builds the path from file rather than cachedir - confirm both are intended. 442 index_directory(file + "/."+ext+url, url, depth_limit + 5, mime_map); 443 if (verbose) 444 cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl; 445 rm_rf(cachedir); 446 } 447 } 448 else { // no -p would be fatal here 449 cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl; 450 } 451 root = oldroot; 452 } 453 313 454 static void 314 455 index_file(const string &file, const string &url, DirectoryIterator & d, 315 map<string, string>& mime_map )456 map<string, string>& mime_map, size_t depth_limit) 316 457 { 317 458 string ext; 318 459 const char * dot_ptr = strrchr(d.leafname(), '.'); … … index_file(const string &file, const string &url, DirectoryIterator & d, 340 481 341 482 string mimetype; 342 483 if (mt == mime_map.end()) { 484 if (strcasecmp(d.leafname(), "mbox") == 0) { 485 // Special filename. 
486 mimetype = "message/rfc822"; 487 goto got_mimetype; 488 } 489 343 490 mimetype = d.get_magic_mimetype(); 344 491 if (mimetype.empty()) { 345 492 skip(file, "Unknown extension and unrecognised format", … … index_file(const string &file, const string &url, DirectoryIterator & d, 352 499 mimetype = mt->second; 353 500 } 354 501 502 got_mimetype: 503 355 504 if (verbose) 356 505 cout << "Indexing \"" << file.substr(root.size()) << "\" as " 357 506 << mimetype << " ... "; … … index_mimetype(const string & file, const string & url, const string & ext, 418 567 } 419 568 } 420 569 } 570 // add the db basename to cache_dir 571 { 572 ensure_tmpdir(); // FIXME: be lazy! 573 cache_dir = tmpdir; 574 const char *p = strrchr(dbpath.c_str(), '/'); 575 // on windows only 576 if (!p) p = strrchr(dbpath.c_str(), '\\'); 577 if (p) { p++; } else { p = dbpath.c_str(); } 578 cache_dir += p; 579 } 421 580 422 581 if (verbose) cout << flush; 423 582 … … index_mimetype(const string & file, const string & url, const string & ext, 482 641 } else { 483 642 // FIXME: What charset is the file? Look at contents? 484 643 } 644 #if 0 // FIXME: this won't work as omindex will have the database locked... 
645 } else if (mimetype == "message/rfc822") { // // => mbox2script 646 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 647 string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| " 648 "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script"; 649 try { 650 dump = stdout_to_string(cmd); 651 } catch (ReadError) { 652 cout << "\"" << cmd << "\" failed - skipping" << endl; 653 return; 654 } 655 #endif 485 656 } else if (mimetype == "application/pdf") { 486 657 string safefile = shell_protect(file); 487 658 string cmd = "pdftotext -enc UTF-8 " + safefile + " -"; … … index_mimetype(const string & file, const string & url, const string & ext, 711 882 712 883 generate_sample_from_csv(dump, sample); 713 884 } else if (mimetype == "application/vnd.ms-outlook") { 714 string cmd = get_pkglibbindir() + "/outlookmsg2html " + shell_protect(file); 715 MyHtmlParser p; 716 p.ignore_metarobots(); 885 string oldroot = root; 886 struct stat statcache; 887 char olddir[256]; 888 889 if (depth_limit == 1) { 890 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 891 return; 892 } 893 string cmd = LIBEXECDIR"outlook2text "+shell_protect(file); 894 // unpack multiparts and attachments. so we have to chdir first 895 string fulldir = cache_dir+"/.msg"+url; 896 if (getcwd(olddir, 256) == NULL) { /* FIXME */ } 897 #ifdef HAVE_LSTAT 898 lstat(fulldir.c_str(), &statcache); 899 #else 900 stat(fulldir.c_str(), &statcache); 901 #endif 902 if (!S_ISDIR(statcache.st_mode)) { 903 mkdir_p(fulldir, 0755); 904 } 717 905 try { 718 dump = stdout_to_string(cmd); 719 // FIXME: what should the default charset be? 
720 p.parse_html(dump, "iso-8859-1", false); 721 } catch (const string & newcharset) { 722 p.reset(); 723 p.ignore_metarobots(); 724 p.parse_html(dump, newcharset, true); 906 if (chdir(fulldir.c_str()) < 0) { /* FIXME */ } 907 size_t new_limit = depth_limit; 908 if (new_limit) --new_limit; 909 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 910 if (chdir(olddir) < 0) { /* FIXME */ } 725 911 } catch (ReadError) { 726 skip_cmd_failed(file, cmd); 727 return; 912 cout << "failed " << cmd << " << in index_cached_directory" << endl; 913 if (chdir(olddir) < 0) { /* FIXME */ } 914 root = oldroot; 915 } catch (...) { 916 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 917 if (chdir(olddir) < 0) { /* FIXME */ } 918 root = oldroot; 919 throw; 728 920 } 729 dump = p.dump; 730 title = p.title; 731 keywords = p.keywords; 732 sample = p.sample; 733 author = p.author; 921 return; 734 922 } else if (mimetype == "image/svg+xml") { 735 923 SvgParser svgparser; 736 924 svgparser.parse_html(d.file_to_string()); … … index_mimetype(const string & file, const string & url, const string & ext, 759 947 if (idx != string::npos) { 760 948 dump.assign(desc, idx + 1, string::npos); 761 949 } 950 } else if (mimetype == "application/x-zip") { 951 string oldroot = root; 952 if (depth_limit == 1) { 953 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 954 return; 955 } 956 // overwrite 957 string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+url+"/"); 958 try { 959 size_t new_limit = depth_limit; 960 if (new_limit) --new_limit; 961 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 962 } catch (ReadError) { 963 cout << "failed " << cmd << " << in index_cached_directory" << endl; 964 root = oldroot; 965 } catch (...) 
{ 966 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 967 root = oldroot; 968 throw; 969 } 970 return; 971 } else if (mimetype == "application/x-rar") { 972 string oldroot = root; 973 if (depth_limit == 1) { 974 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 975 return; 976 } 977 978 // overwrite 979 string cmd = "unrar x -o+ " +shell_protect(file) + " " 980 + shell_protect(cache_dir+"/.rar"+url+"/"); 981 try { 982 size_t new_limit = depth_limit; 983 if (new_limit) --new_limit; 984 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 985 } catch (ReadError) { 986 cout << "failed " << cmd << " << in index_cached_directory" << endl; 987 root = oldroot; 988 } catch (...) { 989 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 990 root = oldroot; 991 throw; 992 } 993 return; 994 } else if (mimetype == "application/vnd.ms-outlook-pst") { 995 string oldroot = root; 996 if (depth_limit == 1) { 997 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 998 return; 999 } 1000 // unpack attachments also, together with mbox files 1001 string cmd = "readpst -r -cv -w -o " 1002 + shell_protect(cache_dir+"/.pst"+url+"/")+" "+shell_protect(file); 1003 try { 1004 size_t new_limit = depth_limit; 1005 if (new_limit) --new_limit; 1006 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 1007 } catch (ReadError) { 1008 root = oldroot; 1009 cout << "failed " << cmd << " << in index_cached_directory" << endl; 1010 } catch (...) { 1011 root = oldroot; 1012 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1013 throw; 1014 } 1015 return; 762 1016 } else { 763 1017 // Don't know how to index this type. 
764 1018 skip_unknown_mimetype(file, mimetype); … … index_directory(const string &path, const string &url_, size_t depth_limit, 975 1229 break; 976 1230 } 977 1231 case DirectoryIterator::REGULAR_FILE: 978 index_file(file, url, d, mime_map );1232 index_file(file, url, d, mime_map, depth_limit); 979 1233 break; 980 1234 default: 981 1235 skip(file, "Not a regular file", … … main(int argc, char **argv) 999 1253 bool overwrite = false; 1000 1254 // If delete_removed_documents is true, delete any documents we don't see. 1001 1255 bool delete_removed_documents = true; 1002 string baseurl;1003 1256 size_t depth_limit = 0; 1004 1257 1005 1258 static const struct option longopts[] = { … … main(int argc, char **argv) 1124 1377 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 1125 1378 mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email 1126 1379 1380 // Outlook message folder: 1381 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst) 1382 1383 // Miscellaneous compound formats: 1384 mime_map["mbox"] = "message/rfc822"; // => mbox2omega 1385 mime_map["mbx"] = "message/rfc822"; // => mbox2omega 1386 #ifndef _MSC_VER 1387 mime_map["zip"] = "application/x-zip"; // recursive scanning 1388 mime_map["rar"] = "application/x-rar"; // recursive scanning 1389 #endif 1390 1127 1391 // Perl: 1128 1392 mime_map["pl"] = "text/x-perl"; 1129 1393 mime_map["pm"] = "text/x-perl"; … … main(int argc, char **argv) 1180 1444 argv[1] = const_cast<char *>("--version"); 1181 1445 } 1182 1446 1183 string dbpath;1184 1447 int getopt_ret; 1185 1448 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:i", 1186 1449 longopts, NULL)) != -1) { … … main(int argc, char **argv) 1330 1593 baseurl += '/'; 1331 1594 } 1332 1595 1596 string log_dir = "./"; // FIXME: need to set log_dir to something appropriate. 
1597 error_log = " 2>>"+log_dir+"omindex-error.log"; 1598 1333 1599 if (optind >= argc || optind + 2 < argc) { 1334 1600 cerr << PROG_NAME": you must specify a directory to index.\n" 1335 1601 "Do this either as a single directory (corresponding to the base URL)\n" -
new file xapian-applications/omega/outlook2text.in
diff --git a/xapian-applications/omega/outlook2text.in b/xapian-applications/omega/outlook2text.in new file mode 100644 index 0000000..b7cf3e2
#! /bin/sh
# Converts an Outlook .msg file to mbox and extracts its attachments.
#
# Usage: outlook2text FILE.msg [OUTPUT_DIR]
#
# The converted message is piped through the MIME exploder, which writes the
# parts into OUTPUT_DIR.  When OUTPUT_DIR is omitted the caller is expected to
# have already changed into the cache directory, and the parts are written to
# a subdirectory named after FILE.msg without its ".msg" suffix.
#
# @MSGCONVERT@ and @MIMEEXPLODE@ are substituted at build time.

# "$2" must be quoted: an unquoted empty $2 collapses the test to `[ -n ]`,
# which is always true, so the fallback branch below could never run.
if [ -n "$2" ]; then
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
else
    # already is in the cache dir
    base=`basename "$1" .msg`
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
fi
xapian-applications/omega/runfilter.cc
diff --git a/xapian-applications/omega/runfilter.cc b/xapian-applications/omega/runfilter.cc index bb5e4fd..e77bc56 100644
a b 55 55 56 56 using namespace std; 57 57 58 extern string error_log; 59 58 60 string 59 61 stdout_to_string(const string &cmd) 60 62 { 61 63 string out; 64 string tmp = cmd; 65 tmp += error_log; 62 66 #if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT 63 67 // We want to be able to get the exit status of the child process. 64 68 signal(SIGCHLD, SIG_DFL); … … stdout_to_string(const string &cmd) 101 105 } 102 106 #endif 103 107 104 execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);108 execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL); 105 109 _exit(-1); 106 110 } 107 111 … … stdout_to_string(const string &cmd) 168 172 throw ReadError(); 169 173 } 170 174 #else 171 FILE * fh = popen( cmd.c_str(), "r");175 FILE * fh = popen(tmp.c_str(), "r"); 172 176 if (fh == NULL) throw ReadError(); 173 177 while (!feof(fh)) { 174 178 char buf[4096]; -
xapian-applications/omega/utils.cc
diff --git a/xapian-applications/omega/utils.cc b/xapian-applications/omega/utils.cc index 797c47d..3e94268 100644
a b 26 26 #include <cassert> 27 27 #include <stdio.h> // for sprintf/snprintf 28 28 #include <cstdlib> 29 #include <cstring> 30 #include "safesysstat.h" 29 31 30 32 #include <string> 31 33 32 34 using namespace std; 33 35 36 #ifdef __WIN32__ 37 #include "safewindows.h" 38 #endif 39 34 40 // This ought to be enough for any of the conversions below. 35 41 #define BUFSIZE 100 36 42 … … using namespace std; 40 46 int len = SNPRINTF(buf, BUFSIZE, (FMT), val);\ 41 47 if (len == -1 || len > BUFSIZE) return string(buf, BUFSIZE);\ 42 48 return string(buf, len); 49 /// Allow system to work directly on C++ strings. 50 inline int system(const string &command) { return system(command.c_str()); } 51 52 // Duplicated from omindex.cc - FIXME. Returns a copy of file escaped (or, on Windows, quoted) so it can be embedded in a shell command line; the Windows branch throws a std::string message for characters it treats as invalid. 53 static string 54 shell_protect(const string & file) 55 { 56 string safefile = file; 57 #ifdef __WIN32__ 58 bool need_to_quote = false; 59 for (string::iterator i = safefile.begin(); i != safefile.end(); ++i) { 60 unsigned char ch = *i; 61 if (!isalnum(ch) && ch < 128) { 62 if (ch == '/') { 63 // Convert Unix path separators to backslashes. C library 64 // functions understand "/" in paths, but external commands 65 // generally don't, and also may interpret a leading '/' as 66 // introducing a command line option. 67 *i = '\\'; 68 } else if (ch == ' ') { 69 need_to_quote = true; 70 } else if (ch < 32 || strchr("<>\"|*?", ch)) { 71 // Check for invalid characters in the filename. 72 string m("Invalid character '"); 73 m += ch; 74 m += "' in filename \""; 75 m += file; 76 m += '"'; 77 throw m; 78 } 79 } 80 } 81 if (safefile[0] == '-') { 82 // If the filename starts with a '-', protect it from being treated as 83 // an option by prepending ".\". 
84 safefile.insert(0, ".\\"); 85 } 86 if (need_to_quote) { 87 safefile.insert(0, "\""); 88 safefile += '"'; 89 } 90 #else 91 string::size_type p = 0; 92 if (!safefile.empty() && safefile[0] == '-') { 93 // If the filename starts with a '-', protect it from being treated as 94 // an option by prepending "./". 95 safefile.insert(0, "./"); 96 p = 2; 97 } 98 while (p < safefile.size()) { 99 // Backslash-escape every character except alphanumerics and the safe characters "/._-", which are common in filenames. 100 unsigned char ch = safefile[p]; 101 if (!isalnum(ch) && strchr("/._-", ch) == NULL) { 102 safefile.insert(p, "\\"); 103 ++p; 104 } 105 ++p; 106 } 107 #endif 108 return safefile; 109 } 110 111 /// Remove a directory and its contents by shelling out to the platform's recursive delete command. Does nothing if filename cannot be stat()ed or is not a directory; the command's exit status is ignored. 112 void 113 rm_rf(const string &filename) 114 { 115 // Check filename exists and is actually a directory 116 struct stat sb; 117 if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return; 118 119 string safefile = shell_protect(filename); 120 #ifdef __WIN32__ 121 # if 1 122 static int win95 = -1; 123 if (win95 == -1) { 124 OSVERSIONINFO info; 125 memset(&info, 0, sizeof(OSVERSIONINFO)); 126 info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); 127 if (GetVersionEx(&info)) { 128 win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS); 129 } 130 } 131 132 if (win95) { 133 // for 95 like systems. NOTE(review): win95 stays -1 (truthy) if GetVersionEx() fails, so this branch is also taken then - confirm. 134 system("deltree /y \"" + safefile + "\""); 135 } else { 136 // for NT like systems. NOTE(review): shell_protect() already wraps names containing spaces in quotes, so the quotes added here look doubled in that case - confirm. 137 system("rd /s /q \"" + safefile + "\""); 138 } 139 # else 140 safefile.append("\0", 2); 141 SHFILEOPSTRUCT shfo; 142 memset((void*)&shfo, 0, sizeof(shfo)); 143 shfo.hwnd = 0; 144 shfo.wFunc = FO_DELETE; 145 shfo.pFrom = safefile.data(); 146 shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT; 147 (void)SHFileOperation(&shfo); 148 # endif 149 #else 150 system("rm -rf " + safefile); 151 #endif 152 } /* NOTE(review): the hunk context places the functions above before the #else of a preprocessor conditional, so they appear to be compiled in only one of its branches - confirm. */ 43 153 #else 44 154 #define CONVERT_TO_STRING(FMT) \ 45 155 char buf[BUFSIZE];\ -
xapian-applications/omega/utils.h
diff --git a/xapian-applications/omega/utils.h b/xapian-applications/omega/utils.h index a54a4f8..b2241b7 100644
a b int string_to_int(const std::string & s); 34 34 /** Remove any leading and/or trailing whitespace from @a s. */ 35 35 void trim(std::string & s); 36 36 37 void rm_rf(const std::string &filename); 38 37 39 #endif