Ticket #282: xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-29.patch
File xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-29.patch, 48.2 KB (added by , 14 years ago) |
---|
-
xapian-omega-1.0.7a/ChangeLog
diff -u xapian-omega-1.0.7a/ChangeLog.orig
old new 1526 2006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com> 1527 1528 omega-0.9.6c: 1529 * omindex.cc: Fix wrong timestamp comparison in cache logic 1530 * scriptindex.cc: Add lastmod and size records and values. 1531 * excel2text, outlook2text.in: New scripts 1532 1533 2006-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com> 1534 1535 omega-0.9.6b: 1536 * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks. 1537 Add option --silent. 1538 1539 2006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com> 1540 1541 omega-0.9.6a: 1542 * omindex.cc: Added cached virtual directories (zip, msg, pst, ...). 1543 Consistently log stderr to /var/log/omega/omindex-error.log. 1544 * configure.ac: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST. 1545 1526 1546 Sun Jul 09 01:40:09 BST 2006 Olly Betts <olly@survex.com> 1527 1547 1528 1548 * docs/omegascript.txt: Note that (by design) an omegascript template -
xapian-omega-1.0.7a/Makefile.am
diff -u xapian-omega-1.0.7a/Makefile.am.orig
old new 61 61 pkglibbindir = $(pkglibdir)/bin 62 62 pkglibbin_PROGRAMS = omega 63 63 bin_PROGRAMS = omindex scriptindex 64 dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl 64 65 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 65 66 66 67 check_PROGRAMS = htmlparsetest md5test utf8converttest … … 92 93 common/safewindows.h\ 93 94 common/stringutils.h 94 95 95 AM_LDFLAGS = $(ICONV_LDFLAGS)96 AM_LDFLAGS = -no-undefined $(ICONV_LDFLAGS) 96 97 97 98 omega_SOURCES = omega.cc query.cc cgiparam.cc utils.cc configfile.cc date.cc\ 98 99 cdb_init.cc cdb_find.cc cdb_hash.cc cdb_unpack.cc loadfile.cc\ … … 127 128 MAINTAINERCLEANFILES = $(dist_man_MANS) 128 129 endif 129 130 131 CLEANFILES = $(dist_libexec_SCRIPTS) $(dist_bin_SCRIPTS) 132 133 omega.conf: $(srcdir)/omega.conf.in Makefile 134 sed "s,@localstatedir@,$(localstatedir)," $(srcdir)/omega.conf.in > $@ 135 outlook2text: $(srcdir)/outlook2text.in mimeexplode Makefile 136 sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@ 137 130 138 if DOCUMENTATION_RULES -
xapian-omega-1.0.7a/excel2text
diff -u xapian-omega-1.0.7a/excel2text.orig
#! /bin/sh
# excel2text - convert an Excel workbook to text on stdout with the numeric
# cells stripped.
#
# usage: excel2text FILE.xls
#
# The workbook is converted with xls2csv; every run of digits and dots that
# is immediately followed by a comma is then removed.
# NOTE(review): a numeric value in the last column of a row has no trailing
# comma and is therefore left in place -- confirm whether that is intended.

if [ $# -lt 1 ] || [ -z "$1" ]; then
    echo "usage: $0 file.xls" >&2
    exit 2
fi

# Plain POSIX basic regular expression ("xx*" instead of the extended "x+")
# so that no GNU-only "sed -r" is required.
xls2csv -q0 "$1" | sed -e 's/[0123456789.][0123456789.]*,//g'
xapian-omega-1.0.7a/mimeexplode
diff -u xapian-omega-1.0.7a/mimeexplode.orig
#!/usr/bin/perl -w

=head1 NAME

mimeexplode - explode one or more MIME messages

=head1 SYNOPSIS

    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...

    someprocess | mimeexplode -

=head1 DESCRIPTION

Takes one or more files from the command line that contain MIME
messages, and explodes their contents out into subdirectories
of the current working directory.  With no file arguments, or with the
file name C<->, the message is read from standard input.

The first message is exploded into the directory given with C<-d>
(default C<msg0>).  Every further message goes into a directory whose
name is the first one with a running number appended (C<msg01>,
C<msg02>, ... for the default).  Directories are created when they do
not exist and are reused when they do, so the same message always gets
exploded into the same directory.

For every single (non-multipart) part one line is written to stdout,
like this:

 Part: msg0/filename-1.dat (text/plain)
 Part: msg0/filename-2.dat (text/plain)
 Part: msg01/dir.gif (image/gif)
 Part: msg01/face.jpg (image/jpeg)

This was written as an example of the MIME:: modules in the
MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
way of splitting a MIME message if you need to decode something, and
you don't have a MIME mail reader on hand.

=head1 COMMAND LINE OPTIONS

    -d outdir

=head1 AUTHOR

Eryq C<eryq@zeegee.com>, in a big hurry...
Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir

=cut

#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!

require 5.006;                   # lexical file handles

use strict;

use MIME::Parser;
use Getopt::Std;

#------------------------------------------------------------
# dump_entity - dump an entity's file info
#
# Recurses into multipart entities; for a single part prints the path
# of the decoded body file and the part's MIME type.
#------------------------------------------------------------
sub dump_entity {
    my $ent   = shift;
    my @parts = $ent->parts;

    if (@parts) {    # multipart...
        dump_entity($_) foreach @parts;
        return;
    }

    # single part...  A part without a decoded body has no file to report.
    my $body = $ent->bodyhandle;
    return unless $body;
    print " Part: ", $body->path,
        " (", scalar($ent->head->mime_type), ")\n";
    return;
}

#------------------------------------------------------------
# main - explode every message named on the command line
#
# Returns true; dies when an output directory cannot be created or
# written to, or when an input file cannot be opened.
#------------------------------------------------------------
sub main {
    my %opts;

    # make sure the same message gets exploded into the same dir
    getopts('d:', \%opts);
    my $outbase = $opts{d} ? $opts{d} : "msg0";
    my $outdir  = $outbase;
    my $postfix = 0;

    # Go through messages:
    @ARGV or unshift @ARGV, "-";
    while (defined(my $file = shift @ARGV)) {

        # Sanity:
        unless (-d $outdir) {
            mkdir $outdir, 0755 or die "couldn't make $outdir: $!";
        }
        (-w $outdir) or die "cwd $outdir not writable!";

        # Create a new parser object:
        my $parser = MIME::Parser->new;
        ### $parser->parse_nested_messages('REPLACE');

        # Optional: set up parameters that will affect how it extracts
        # documents from the input stream:
        $parser->output_dir($outdir);

        # Open the input.  "-" means standard input; everything else is
        # opened with the three-argument form so that a file name can never
        # be taken for an open mode or a pipe.
        my $from_stdin = ($file eq '-');
        my $fh;
        if ($from_stdin) {
            $fh = \*STDIN;
        }
        else {
            open $fh, '<', $file or die "couldn't open $file: $!";
        }

        # Parse the input stream.  The parser may throw on malformed input;
        # trap that so the remaining messages are still processed.
        my $entity = eval { $parser->parse($fh) };
        $entity
            or print STDERR "Couldn't parse MIME in $file; continuing...\n";
        close $fh unless $from_stdin;

        # Congratulations: you now have a (possibly multipart) MIME entity!
        dump_entity($entity) if $entity;
        ### $entity->dump_skeleton if $entity;

        # Next message goes to <outbase>1, <outbase>2, ...
        $postfix++;
        $outdir = $outbase . $postfix;
    }
    return 1;
}

exit(main() ? 0 : -1);
#------------------------------------------------------------
xapian-omega-1.0.7a/msgconvert.pl
diff -u xapian-omega-1.0.7a/msgconvert.pl.orig
#!/usr/bin/perl -w
#
# msgconvert.pl:
#
# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
#
# Copyright 2002, 2004, 2006 Matijs van Zuijlen
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# CHANGES:
# 20020715  Recognize new items 'Cc', mime type of attachment, long
#           filename of attachment, and full headers. Attachments turn out
#           to be numbered, so a regexp is now used to recognize label of
#           items that are attachments.
# 20020831  long file name will definitely be used if present. Full headers
#           and mime type information are used when present. Created
#           generic system for specifying known items to be skipped.
#           Unexpected contents is never reason to bail out anymore. Added
#           support for usage message and option processing (--verbose).
# 20040104  Handle address data slightly better, make From line less fake,
#           make $verbose and $skippable_entries global vars, handle HTML
#           variant of body text if present (though not optimally).
# 20040214  Fix typos and incorrect comments.
# 20040307  - Complete rewrite: All functional parts are now in the package
#             MSGParser;
#           - Creation of MIME::Entity object is delayed until the output
#             routines, which means all data is known; This means I can
#             create a multipart/alternative body.
#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
#             the information).
# 20040514  Check if $self->{HEAD} actually exists before trying to add its
#           contents to the output Mime object's header data.
#           (Bug reported by Thomas Ng).
#           Don't produce multipart messages if not needed.
#           (Bug reported by Justin B. Scout).
# 20040529  Correctly format OLEDATE.
# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
#           - Use address data to make To: and Cc: lines complete
#           - Use the in-reply-to property
#           - More unknown properties named.
#           - Found another property containing an SMTP address.
#           - Put non-SMTP type addresses back in output.
# 20040825  Replace 'our' to declare globals with 'use vars'. This means
#           the globals are now properly scoped inside the package and not
#           the file.
#           This also fixes the bug that this program did not work on perl
#           versions below 5.6. (Bug reported by Tim Gustafson)
# 20060218  More sensible encoding warnings.
# 20060219  Move OLE parsing to main program.
#           Parse nested MSG files (Bug reported by Christof Lukas).
# 20060225  Simplify code.
#

#
# Import modules.
#
package MSGParser;
use strict;
use OLE::Storage_Lite;
use MIME::Entity;
use MIME::Parser;
use Date::Format;
use POSIX qw(mktime);
use constant DIR_TYPE  => 1;
use constant FILE_TYPE => 2;

use vars qw($skipproperties $skipheaders);
#
# Properties that are known but deliberately not converted.  The value is
# the description printed in verbose mode.
# Descriptions partially based on mapitags.h
#
$skipproperties = {
    # Envelope properties
    '000B' => "Conversation key?",
    '001A' => "Type of message",
    '003B' => "Sender address variant",
    '003D' => "Contains 'Re: '",
    '003F' => "'recieved by' id",
    '0040' => "'recieved by' name",
    '0041' => "Sender variant address id",
    '0042' => "Sender variant name",
    '0043' => "'recieved representing' id",
    '0044' => "'recieved representing' name",
    '0046' => "Read receipt address id",
    '0051' => "'recieved by' search key",
    '0052' => "'recieved representing' search key",
    '0053' => "Read receipt search key",
    '0064' => "Sender variant address type",
    '0065' => "Sender variant address",
    '0070' => "Conversation topic",
    '0071' => "Conversation index",
    '0075' => "'recieved by' address type",
    '0076' => "'recieved by' email address",
    '0077' => "'recieved representing' address type",
    '0078' => "'recieved representing' email address",
    '007F' => "something like a message id",
    # Recipient properties
    '0C19' => "Reply address variant",
    '0C1D' => "Reply address variant",
    '0C1E' => "Reply address type",
    # Non-transmittable properties
    '0E02' => "?Should BCC be displayed",
    '0E0A' => "sent mail id",
    '0E1D' => "Subject w/o Re",
    '0E27' => "64 bytes: Unknown",
    '0FF6' => "Index",
    '0FF9' => "Index",
    '0FFF' => "Address variant",
    # Content properties
    '1008' => "Summary or something",
    '1009' => "RTF Compressed",
    # 'Common property'
    '3001' => "Display name",
    '3002' => "Address Type",
    '300B' => "'Search key'",
    # Attachment properties
    '3702' => "Attachment encoding",
    '3703' => "Attachment extension",
    '3709' => "'Attachment rendering'",    # Maybe an icon or something?
    '3713' => "Icon URL?",
    # 'Mail user'
    '3A20' => "Address variant",
    # 3900 -- 39FF: 'Address book'
    '39FF' => "7 bit display name",
    # 'Display table properties'
    '3FF8' => "Routing data?",
    '3FF9' => "Routing data?",
    '3FFA' => "Routing data?",
    '3FFB' => "Routing data?",
    # 'Transport-defined envelope property'
    '4029' => "Sender variant address type",
    '402A' => "Sender variant address",
    '402B' => "Sender variant name",
    '5FF6' => "Recipient name",
    '5FF7' => "Recipient address variant",
    # 'Provider-defined internal non-transmittable property'
    '6740' => "Unknown, binary data",
    # User defined id's
    '8000' => "Content Class",
    '8002' => "Unknown, binary data",
};

# Headers from the original message that are NOT copied to the output,
# because the generated MIME entity supplies its own.
$skipheaders = {
    "MIME-Version"              => 1,
    "Content-Type"              => 1,
    "Content-Transfer-Encoding" => 1,
    "X-Mailer"                  => 1,
    "X-Msgconvert"              => 1,
    "X-MS-Tnef-Correlator"      => 1,
    "X-MS-Has-Attach"           => 1,
};

use constant ENCODING_UNICODE => '001F';
use constant KNOWN_ENCODINGS  => {
    '000D' => 'Directory',
    '001F' => 'Unicode',
    '001E' => 'Ascii?',
    '0102' => 'Binary',
};

# The MAP_* tables map a property id to [ key to store the data under,
# flag: strip NUL bytes from the data first ].
use constant MAP_ATTACHMENT_FILE => {
    '3701' => ["DATA",        0],    # Data
    '3704' => ["SHORTNAME",   1],    # Short file name
    '3707' => ["LONGNAME",    1],    # Long file name
    '370E' => ["MIMETYPE",    1],    # mime type
    '3716' => ["DISPOSITION", 1],    # disposition
};

use constant MAP_SUBITEM_FILE => {
    '1000' => ["BODY_PLAIN",     0],    # Body
    '1013' => ["BODY_HTML",      0],    # HTML Version of body
    '0037' => ["SUBJECT",        1],    # Subject
    '0047' => ["SUBMISSION_ID",  1],    # Seems to contain the date
    '007D' => ["HEAD",           1],    # Full headers
    '0C1A' => ["FROM",           1],    # Reply-To: Name
    '0C1E' => ["FROM_ADDR_TYPE", 1],    # From: Address type
    '0C1F' => ["FROM_ADDR",      1],    # Reply-To: Address
    '0E04' => ["TO",             1],    # To: Names
    '0E03' => ["CC",             1],    # Cc: Names
    '1035' => ["MESSAGEID",      1],    # Message-Id
    '1042' => ["INREPLYTO",      1],    # In reply to Message-Id
};

use constant MAP_ADDRESSITEM_FILE => {
    '3001' => ["NAME",        1],    # Real name
    '3002' => ["TYPE",        1],    # Address type
    '403D' => ["TYPE",        1],    # Address type
    '3003' => ["ADDRESS",     1],    # Address
    '403E' => ["ADDRESS",     1],    # Address
    '39FE' => ["SMTPADDRESS", 1],    # SMTP Address variant
};

#
# Main body of module
#

# Constructor: returns an empty parser object.
sub new {
    my $that  = shift;
    my $class = ref $that || $that;

    my $self = {
        ATTACHMENTS    => [],
        ADDRESSES      => [],
        VERBOSE        => 0,
        HAS_UNICODE    => 0,
        FROM_ADDR_TYPE => "",
    };
    return bless $self, $class;
}

#
# Main sub: parse the PPS tree, and return
#
sub parse {
    my $self = shift;
    my $PPS  = shift or die "Internal error: No PPS tree";
    $self->_RootDir($PPS);
}

# Build and return a MIME::Entity from everything collected by parse().
# Always returns an entity, even for a message without body or attachments.
sub mime_object {
    my $self = shift;

    my $bodymime;
    my $mime;

    if ($self->_IsMultiPart) {
        # Construct a multipart message object

        $mime = MIME::Entity->build(Type => "multipart/mixed");

        # Set the entity that we'll save the body parts to. If there's more
        # than one part, it's a new entity, otherwise, it's the main $mime
        # object.
        if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
            $bodymime = MIME::Entity->build(
                Type     => "multipart/alternative",
                Encoding => "8bit",
            );
            $mime->add_part($bodymime);
        } else {
            $bodymime = $mime;
        }
        if ($self->{BODY_PLAIN}) {
            $self->_SaveAttachment($bodymime, {
                MIMETYPE    => 'text/plain; charset=ISO-8859-1',
                ENCODING    => '8bit',
                DATA        => $self->{BODY_PLAIN},
                DISPOSITION => 'inline',
            });
        }
        if ($self->{BODY_HTML}) {
            $self->_SaveAttachment($bodymime, {
                MIMETYPE    => 'text/html',
                ENCODING    => '8bit',
                DATA        => $self->{BODY_HTML},
                DISPOSITION => 'inline',
            });
        }
        foreach my $att (@{$self->{ATTACHMENTS}}) {
            $self->_SaveAttachment($mime, $att);
        }
    } elsif ($self->{BODY_PLAIN}) {
        # Construct a single part message object with a plain text body
        $mime = MIME::Entity->build(
            Type => "text/plain",
            Data => $self->{BODY_PLAIN}
        );
    } elsif ($self->{BODY_HTML}) {
        # Construct a single part message object with an HTML body
        $mime = MIME::Entity->build(
            Type => "text/html",
            Data => $self->{BODY_HTML}
        );
    } else {
        # No body and no attachments at all: still produce a (header-only)
        # message instead of calling methods on an undefined entity below.
        $mime = MIME::Entity->build(
            Type => "text/plain",
            Data => ""
        );
    }

    $self->_CopyHeaderData($mime);

    $self->_SetHeaderFields($mime);

    return $mime;
}

# Actually output the message in mbox format
sub print {
    my $self = shift;

    my $mime = $self->mime_object;

    # Construct From line from whatever we know.
    my $string = 'someone@somewhere';
    if (defined $self->{FROM_ADDR_TYPE}
        and $self->{FROM_ADDR_TYPE} eq "SMTP"
        and defined $self->{FROM_ADDR})
    {
        $string = $self->{FROM_ADDR};
    }
    $string =~ s/\n//g;

    # The date used here is not really important.
    print "From ", $string, " ", scalar localtime, "\n";
    $mime->print(\*STDOUT);
    print "\n";
}

# Set the verbosity level (true: also report skipped, known entries).
sub set_verbosity {
    my ($self, $verbosity) = @_;
    defined $verbosity or die "Internal error: no verbosity level";
    $self->{VERBOSE} = $verbosity;
}

#
# Below are functions that walk the PPS tree. The *Dir functions handle
# processing the directory nodes of the tree (mainly, iterating over the
# children), whereas the *Item functions handle processing the items in the
# directory (if such an item is itself a directory, it will in turn be
# processed by the relevant *Dir function).
#

#
# RootItem: Check Root Entry, parse sub-entries.
# The OLE file consists of a single entry called Root Entry, which has
# several children. These children are parsed in the sub SubItem.
#
sub _RootDir {
    my ($self, $PPS) = @_;

    foreach my $child (@{$PPS->{Child}}) {
        $self->_SubItem($child);
    }
}

# Dispatch one child of the root entry on its type.
sub _SubItem {
    my ($self, $PPS) = @_;

    if ($PPS->{Type} == DIR_TYPE) {
        $self->_SubItemDir($PPS);
    } elsif ($PPS->{Type} == FILE_TYPE) {
        $self->_SubItemFile($PPS);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# A directory below the root: a recipient, an attachment, or unknown.
sub _SubItemDir {
    my ($self, $PPS) = @_;

    $self->_GetOLEDate($PPS);

    my $name = $self->_GetName($PPS);

    if ($name =~ /__recip_version1 0_ /) {          # Address of one recipient
        $self->_AddressDir($PPS);
    } elsif ($name =~ /__attach_version1 0_ /) {    # Attachment
        $self->_AttachmentDir($PPS);
    } else {
        $self->_UnknownDir($name);
    }
}

# A file below the root: a property of the message itself.
sub _SubItemFile {
    my ($self, $PPS) = @_;

    my $name = $self->_GetName($PPS);
    my ($property, $encoding) = $self->_ParseItemName($name);

    $self->_MapProperty($self, $PPS->{Data}, $property,
        MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
}

# Collect the data of one recipient and add it to the address pool.
sub _AddressDir {
    my ($self, $PPS) = @_;

    my $address = {
        NAME    => undef,
        ADDRESS => undef,
        TYPE    => "",
    };
    foreach my $child (@{$PPS->{Child}}) {
        $self->_AddressItem($child, $address);
    }
    push @{$self->{ADDRESSES}}, $address;
}

sub _AddressItem {
    my ($self, $PPS, $addr_info) = @_;

    my $name = $self->_GetName($PPS);

    # DIR Entries: There should be none.
    if ($PPS->{Type} == DIR_TYPE) {
        $self->_UnknownDir($name);
    } elsif ($PPS->{Type} == FILE_TYPE) {
        my ($property, $encoding) = $self->_ParseItemName($name);
        $self->_MapProperty($addr_info, $PPS->{Data}, $property,
            MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# Collect the data of one attachment and add it to the attachment list.
sub _AttachmentDir {
    my ($self, $PPS) = @_;

    my $attachment = {
        SHORTNAME   => undef,
        LONGNAME    => undef,
        MIMETYPE    => 'application/octet-stream',
        ENCODING    => 'base64',
        DISPOSITION => 'attachment',
        DATA        => undef
    };
    foreach my $child (@{$PPS->{Child}}) {
        $self->_AttachmentItem($child, $attachment);
    }
    push @{$self->{ATTACHMENTS}}, $attachment;
}

sub _AttachmentItem {
    my ($self, $PPS, $att_info) = @_;

    my $name = $self->_GetName($PPS);

    my ($property, $encoding) = $self->_ParseItemName($name);

    if ($PPS->{Type} == DIR_TYPE) {

        if (defined $property and $property eq '3701') {    # Nested MSG file
            my $msgp = MSGParser->new();
            $msgp->parse($PPS);
            my $data = $msgp->mime_object->as_string;
            $att_info->{DATA}     = $data;
            $att_info->{MIMETYPE} = 'message/rfc822';
            $att_info->{ENCODING} = '8bit';
        } else {
            $self->_UnknownDir($name);
        }

    } elsif ($PPS->{Type} == FILE_TYPE) {
        $self->_MapProperty($att_info, $PPS->{Data}, $property,
            MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
    } else {
        warn "Unknown entry type: $PPS->{Type}";
    }
}

# Store $data in $hash under the key that $map assigns to $property.
# Returns 1 when the property is in the map, 0 otherwise.
sub _MapProperty {
    my ($self, $hash, $data, $property, $map) = @_;

    defined $property or return 0;
    my $arr = $map->{$property} or return 0;

    # Second element of the map entry: strip NUL bytes from the data.
    $arr->[1] and defined $data and $data =~ s/\000//g;
    $hash->{$arr->[0]} = $data;

    return 1;
}

sub _UnknownDir {
    my ($self, $name) = @_;

    if ($name eq '__nameid_version1 0') {
        $self->{VERBOSE}
            and warn "Skipping DIR entry $name (Introductory stuff)\n";
        return;
    }
    warn "Unknown DIR entry $name\n";
}

sub _UnknownFile {
    my ($self, $name) = @_;

    if ($name eq '__properties_version1 0') {
        $self->{VERBOSE}
            and warn "Skipping FILE entry $name (Properties)\n";
        return;
    }

    my ($property, $encoding) = $self->_ParseItemName($name);
    unless (defined $property) {
        warn "Unknown FILE entry $name\n";
        return;
    }
    if ($skipproperties->{$property}) {
        $self->{VERBOSE}
            and warn "Skipping property $property ($skipproperties->{$property})\n";
        return;
    } elsif ($property =~ /^80/) {
        $self->{VERBOSE}
            and warn "Skipping property $property (user-defined property)\n";
        return;
    } else {
        warn "Unknown property $property\n";
        return;
    }
}

#
# Helper functions
#

# Name of a PPS node, converted from UCS-2 and with every non-word
# character replaced by a space.
sub _GetName {
    my ($self, $PPS) = @_;
    return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
}

sub _NormalizeWhiteSpace {
    my ($self, $name) = @_;
    $name =~ s/\W/ /g;
    return $name;
}

# Remember the first OLE time stamp found (Time2nd preferred over Time1st).
sub _GetOLEDate {
    my ($self, $PPS) = @_;
    unless (defined($self->{OLEDATE})) {
        # Make Date
        my $datearr;
        $datearr = $PPS->{Time2nd};
        $datearr = $PPS->{Time1st} unless ($datearr);
        $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
    }
}

sub _FormatDate {
    my ($self, $datearr) = @_;

    # TODO: This is a little convoluted. Directly using strftime didn't seem
    # to work.
    my $datetime = mktime(@$datearr);
    return time2str("%a, %d %h %Y %X %z", $datetime);
}

# If we didn't get the date from the original header data, we may be able
# to get it from the SUBMISSION_ID:
# It seems to have the format of a semicolon-separated list of key=value
# pairs. The key l has a value with the format:
# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
# the format YYMMDDHHMMSS.
# Returns the formatted date, or nothing when no date can be extracted.
sub _SubmissionIdDate {
    my $self = shift;

    my $submission_id = $self->{SUBMISSION_ID} or return;
    $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
        or return;
    my ($year, $month, $day, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);

    # mktime() wants years since 1900.  Two-digit years below 70 are taken
    # to be 20xx; a pivot of 20 would turn 2020 and later into 1920...
    $year += 100 if $year < 70;
    return $self->_FormatDate([$sec, $min, $hour, $day, $month - 1, $year]);
}

# Split an item name of the form "__substg1 0_PPPPEEEE" into property id
# and encoding id.  Returns (undef, undef) for any other name.
sub _ParseItemName {
    my ($self, $name) = @_;

    if ($name =~ /^__substg1 0_(....)(....)$/) {
        my ($property, $encoding) = ($1, $2);
        if ($encoding eq ENCODING_UNICODE and not($self->{HAS_UNICODE})) {
            warn "This MSG file contains Unicode fields."
                . " This is currently unsupported.\n";
            $self->{HAS_UNICODE} = 1;
        } elsif (not(KNOWN_ENCODINGS()->{$encoding})) {
            warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
        }
        return ($property, $encoding);
    } else {
        return (undef, undef);
    }
}

# Attach one part (body text or attachment) to the entity $mime.
sub _SaveAttachment {
    my ($self, $mime, $att) = @_;

    my $ent = $mime->attach(
        Type        => $att->{MIMETYPE},
        Encoding    => $att->{ENCODING},
        Data        => [],
        Filename    => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
        Disposition => $att->{DISPOSITION}
    );

    my $handle;
    if ($handle = $ent->open("w")) {
        # An attachment directory without a data stream leaves DATA undefined.
        $handle->print(defined $att->{DATA} ? $att->{DATA} : "");
        $handle->close;
    } else {
        warn "Could not write data!";
    }
}

# Add a header field unless the header already has a value or there is
# nothing to add.
sub _AddHeaderField {
    my ($self, $mime, $fieldname, $value) = @_;

    my $oldvalue = $mime->head->get($fieldname);
    return if $oldvalue;
    $mime->head->add($fieldname, $value) if $value;
}

# "Name <address>" for the given tag (e.g. FROM), or nothing when neither
# a name nor an address is known.
sub _Address {
    my ($self, $tag) = @_;
    my $name    = $self->{$tag}           || "";
    my $address = $self->{$tag . "_ADDR"} || "";
    return if $name eq "" and $address eq "";
    return "$name <$address>";
}

# Find SMTP addresses for the given list of names
# Returns nothing when there is no name list at all.
sub _ExpandAddressList {
    my ($self, $names) = @_;
    defined $names or return;

    my $addresspool = $self->{ADDRESSES};
    my @namelist    = split /; */, $names;
    my @result;
    NAME: foreach my $name (@namelist) {
        foreach my $address (@$addresspool) {
            next unless defined $address->{NAME} and $name eq $address->{NAME};

            my $smtp = "";
            if (defined($address->{SMTPADDRESS})) {
                $smtp = $address->{SMTPADDRESS};
            } elsif (defined $address->{TYPE}
                and $address->{TYPE} eq "SMTP"
                and defined $address->{ADDRESS})
            {
                $smtp = $address->{ADDRESS};
            }
            push @result, $address->{NAME} . " <" . $smtp . ">";
            next NAME;
        }
        # No address data for this name: keep the bare name.
        push @result, $name;
    }
    return join ", ", @result;
}

# Parse the full header data, if we got that.
# Returns the (unfolded) head object, or nothing when there is no header
# data or it cannot be parsed.
sub _ParseHead {
    my ($self, $data) = @_;
    defined $data or return;
    my $parser = MIME::Parser->new();
    $parser->output_to_core(1);
    $parser->decode_headers(1);
    $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
    # The parser may throw on bad input; treat that like "no entity".
    my $entity = eval { $parser->parse_data($data) };
    unless ($entity) {
        warn "Couldn't parse full headers!";
        return;
    }
    my $head = $entity->head;
    $head->unfold;
    return $head;
}

# Find out if we need to construct a multipart message
sub _IsMultiPart {
    my $self = shift;

    return (
        ($self->{BODY_HTML} and $self->{BODY_PLAIN})
            or @{$self->{ATTACHMENTS}} > 0
    );
}

# Copy original header data.
# Note: This should contain the Date: header.
sub _CopyHeaderData {
    my ($self, $mime) = @_;

    my $head = $self->_ParseHead($self->{HEAD}) or return;

    foreach my $tag (grep { !$skipheaders->{$_} } $head->tags) {
        foreach my $value ($head->get_all($tag)) {
            $mime->head->add($tag, $value);
        }
    }
}

# Set header fields
# (the helpers are called with scalar() so that an empty result cannot
# shift the argument list)
sub _SetHeaderFields {
    my ($self, $mime) = @_;

    # If we didn't get the date from the original header data, we may be able
    # to get it from the SUBMISSION_ID:
    $self->_AddHeaderField($mime, 'Date', scalar $self->_SubmissionIdDate());

    # Third and last chance to set the Date: header; this uses the date the
    # MSG file was saved.
    $self->_AddHeaderField($mime, 'Date',    $self->{OLEDATE});
    $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
    $self->_AddHeaderField($mime, 'From', scalar $self->_Address("FROM"));
    #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
    $self->_AddHeaderField($mime, 'To',
        scalar $self->_ExpandAddressList($self->{TO}));
    $self->_AddHeaderField($mime, 'Cc',
        scalar $self->_ExpandAddressList($self->{CC}));
    $self->_AddHeaderField($mime, 'Message-Id',  $self->{MESSAGEID});
    $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
}

package main;
use Getopt::Long;
use Pod::Usage;

# Setup command line processing.
my $verbose = '';
my $help    = '';    # Print help message and exit.
GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
pod2usage(1) if $help;

# Get file name
my $file = $ARGV[0];
defined $file or pod2usage(2);
warn "Will parse file: $file\n" if $verbose;

# Load and parse MSG file (is OLE)
my $Msg = OLE::Storage_Lite->new($file);
my $PPS = $Msg->getPpsTree(1);
$PPS or die "$file must be an OLE file";

# parse PPS tree
my $parser = MSGParser->new();
$parser->set_verbosity(1) if $verbose;
$parser->parse($PPS);
$parser->print();

#
# Usage info follows.
#
__END__

=head1 NAME

msgconvert.pl - Convert Outlook .msg files to mbox format

=head1 SYNOPSIS

msgconvert.pl [options] <file.msg>

  Options:
    --verbose   be verbose
    --help      help message

=head1 OPTIONS

=over 8

=item B<--verbose>

Print information about skipped parts of the .msg file.

=item B<--help>

Print a brief help message.

=back

=head1 DESCRIPTION

This program will output the message contained in file.msg in mbox format
on stdout. It will complain about unrecognized OLE parts on
stderr.

=head1 BUGS

Not all data that's in the .MSG file is converted. There simply are some
parts whose meaning escapes me. One of these must contain the date the
message was sent, for example. Formatting of text messages will also be
lost. YMMV.

=cut
xapian-omega-1.0.7a/omindex.cc
diff -u xapian-omega-1.0.7a/omindex.cc.orig
old new 4 4 * Copyright 2001,2005 James Aylett 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts 7 * Copyright 2006,2007,2008 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 62 63 extern char * mkdtemp(char *); 63 64 #endif 64 65 66 #ifndef LIBEXECDIR 67 // must have ending slash 68 //# define LIBEXECDIR "/usr/lib/omega/bin/" 69 # define LIBEXECDIR "" 70 #endif 71 #ifndef PKGDATADIR 72 // must have ending slash 73 # define PKGDATADIR "/usr/share/omega/" 74 #endif 75 65 76 using namespace std; 66 77 67 78 #define TITLE_SIZE 128 … … 69 80 70 81 #define PROG_NAME "omindex" 71 82 #define PROG_DESC "Index static website data via the filesystem" 83 84 /* used in runfilter.cc */ 85 bool verbose = false; 86 string error_log; 72 87 73 88 static bool skip_duplicates = false; 74 89 static bool follow_symlinks = false; 90 static bool silent = false; 91 static string cache_dir; 75 92 static string dbpath; 76 93 static string root; 77 94 static string indexroot; … … 136 153 154 static void 155 index_cached_directory(size_t depth_limit, 156 const string &file, 157 const string &url, 158 const string &ext, 159 const string &cmd, 160 map<string, string>& mime_map); 161 static 162 int mkdir_p(const string &path, mode_t mode); 163 137 164 inline static bool 138 165 p_notalnum(unsigned int c) 139 166 { … … 217 244 // indexing is disallowed 218 245 } 219 246 if (!p.indexing_allowed) { 220 cout << "indexing disallowed by meta tag - skipping" << endl; 247 if (!silent) 248 cout << "indexing disallowed by meta tag - skipping" << endl; 221 249 return; 222 250 } 223 251 dump = p.dump; … … 245 273 return; 246 274 } 247 275 md5_string(dump, md5); 276 #if 0 // FIXME: this won't work as omindex will have the database locked... 
277 } else if (mimetype == "message/rfc822") { // // => mbox2script 278 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 279 string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| " 280 "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script"; 281 try { 282 dump = stdout_to_string(cmd); 283 } catch (ReadError) { 284 cout << "\"" << cmd << "\" failed - skipping" << endl; 285 return; 286 } 287 #endif 248 288 } else if (mimetype == "application/pdf") { 249 289 string safefile = shell_protect(file); … … 383 423 } else if (mimetype == "text/rtf") { 384 424 // The --text option unhelpfully converts all non-ASCII characters to 385 425 // "?" so we use --html instead, which produces HTML entities. 386 string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) ;426 string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) + error_log; 387 427 MyHtmlParser p; 388 428 try { 389 429 p.parse_html(stdout_to_string(cmd)); … … 566 606 continue; 567 607 } 568 608 case DirectoryIterator::REGULAR_FILE: { 609 if (strcasecmp(d.leafname(), "mbox") == 0) { 610 // Special filename. 611 index_file(url, "message/rfc822", d); 612 continue; 613 } 569 614 570 615 string ext; 571 616 string::size_type dot = url.find_last_of('.'); … … 610 655 continue; 611 656 } 612 657 658 string oldroot = root; 659 #ifndef _MSC_VER 660 // NOTE: unpacking does not work on MSWin32 this way! 661 if (ext == "zip") { 662 if (depth_limit == 1) { 663 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 664 continue; 665 } 666 // overwrite 667 string cmd = "unzip -u -P. 
-o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+indexroot+url+"/"); 668 try { 669 size_t new_limit = depth_limit; 670 if (new_limit) --new_limit; 671 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 672 } catch (ReadError) { 673 cout << "failed " << cmd << " << in index_cached_directory" << endl; 674 root = oldroot; 675 } catch (...) { 676 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 677 root = oldroot; 678 throw; 679 } 680 continue; 681 } 682 else if (ext == "rar") { 683 if (depth_limit == 1) { 684 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 685 continue; 686 } 687 688 // overwrite 689 string cmd = "unrar x -o+ " +shell_protect(file) + " " 690 + shell_protect(cache_dir+"/.rar"+indexroot+url+"/"); 691 try { 692 size_t new_limit = depth_limit; 693 if (new_limit) --new_limit; 694 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 695 } catch (ReadError) { 696 cout << "failed " << cmd << " << in index_cached_directory" << endl; 697 root = oldroot; 698 } catch (...) { 699 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 700 root = oldroot; 701 throw; 702 } 703 continue; 704 } 705 #ifdef HAVE_MSGCONVERT 706 else if (ext == "msg") { 707 struct stat statcache; 708 char olddir[256]; 709 710 if (depth_limit == 1) { 711 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 712 continue; 713 } 714 string cmd = LIBEXECDIR"outlook2text "+shell_protect(file); 715 // unpack multiparts and attachments. 
so we have to chdir first 716 string fulldir = cache_dir+"/.msg"+indexroot+url; 717 getcwd(olddir,256); 718 #ifdef HAVE_LSTAT 719 lstat(fulldir.c_str(), &statcache); 720 #else 721 stat(fulldir.c_str(), &statcache); 722 #endif 723 if (!S_ISDIR(statcache.st_mode)) { 724 mkdir_p(fulldir, 0755); 725 } 726 try { 727 chdir (fulldir.c_str()); 728 size_t new_limit = depth_limit; 729 if (new_limit) --new_limit; 730 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 731 chdir (olddir); 732 } catch (ReadError) { 733 cout << "failed " << cmd << " << in index_cached_directory" << endl; 734 chdir (olddir); 735 root = oldroot; 736 } catch (...) { 737 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 738 chdir (olddir); 739 root = oldroot; 740 throw; 741 } 742 continue; 743 } 744 #endif 745 #ifdef HAVE_READPST 746 else if (ext == "pst") { 747 if (depth_limit == 1) { 748 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 749 continue; 750 } 751 // unpack attachments also, together with mbox files 752 string cmd = "readpst -r -cv -w -o " 753 + shell_protect(cache_dir+"/.pst"+indexroot+url+"/")+" "+shell_protect(file); 754 try { 755 size_t new_limit = depth_limit; 756 if (new_limit) --new_limit; 757 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 758 } catch (ReadError) { 759 root = oldroot; 760 cout << "failed " << cmd << " << in index_cached_directory" << endl; 761 } catch (...) { 762 root = oldroot; 763 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 764 throw; 765 } 766 continue; 767 } 768 #endif 769 #endif 613 770 // It's in our MIME map so we know how to index it. 
614 771 const string & mimetype = mt->second; 615 772 try { … … 640 797 } 798 799 static 800 int mkdir_p(const string &path, mode_t mode) { 801 #ifdef __WIN32__ 802 system(("mkdir \"" + shell_protect(path) + "\"").c_str()); 803 #else 804 system(("mkdir -p " + shell_protect(path)).c_str()); 805 #endif 806 return 0; 807 } 808 809 /* 810 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there 811 */ 812 static void 813 index_cached_directory(size_t depth_limit, 814 const string &file, 815 const string &url, 816 const string &ext, 817 const string &cmd, 818 map<string, string>& mime_map) 819 { 820 string oldroot = root; 821 root = cache_dir; 822 string cache = root+"/."+ext+indexroot; 823 string cachedir = cache+url; 824 struct stat statfile, statcache; 825 bool extract_cache; 826 #ifdef HAVE_LSTAT 827 lstat(file.c_str(), &statfile); 828 lstat(cachedir.c_str(), &statcache); 829 #else 830 stat(file.c_str(), &statfile); 831 stat(cachedir.c_str(), &statcache); 832 #endif 833 extract_cache = true; 834 // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago, 835 // then it was already extracted. 836 if (S_ISDIR(statcache.st_mode) 837 && S_ISREG(statfile.st_mode) 838 && (statfile.st_mtime < statcache.st_mtime) 839 && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call 840 { 841 // but is it in the database also? prevent from deleting skipped files 842 if (!silent) 843 cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction " 844 // << statfile.st_mtime << " < " << statcache.st_mtime 845 << endl; 846 extract_cache = false; 847 } 848 if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) 849 { 850 // check database timestamp for cached container, esp. for cleaned up caches. 
851 // if already in db we need not to extract again 852 string urlterm("U"); 853 urlterm += baseurl; 854 urlterm += "/."+ext+indexroot+url; 855 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 856 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 857 858 { 859 // at first find the docid with the beginning urlterm and check its timestamp 860 Xapian::docid docid = 0; 861 Xapian::PostingIterator p = db.postlist_begin(urlterm); 862 if (p != db.postlist_end(urlterm)) { 863 docid = *p; 864 } 865 if (docid && !ignore_time) { 866 // new: first search value (1) 867 Xapian::Document doc = db.get_document(docid); 868 string lastmod; 869 if (doc.values_count()) 870 lastmod = doc.get_value(VALUE_LASTMOD); 871 if (!lastmod.empty()) { 872 if (string_to_int(lastmod) >= statfile.st_mtime) { 873 if (!silent) 874 cout << "Cache "<< "."+ext+indexroot+url << " not newer. Ignored." << endl; 875 if (docid < updated.size()) { 876 updated[docid] = true; 877 root = oldroot; 878 return; 879 } 880 } 881 } 882 } 883 } 884 } 885 886 if (extract_cache) { 887 if (!silent) 888 cout << "[EXTRACT into cache " << cachedir << "]" << endl; 889 if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode)) 890 cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" " 891 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL) 892 << endl; 893 if (!S_ISDIR(statcache.st_mode)) 894 mkdir_p(cachedir, 0755); 895 stdout_to_string(cmd); 896 #ifndef __WIN32__ 897 stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir)); 898 #endif 899 #ifdef HAVE_LSTAT 900 lstat(cachedir.c_str(), &statcache); 901 #else 902 stat(cachedir.c_str(), &statcache); 903 #endif 904 } 905 906 if (S_ISDIR(statcache.st_mode)) { 907 if (depth_limit == 1) { 908 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 909 } else { 910 // max loop 5, magic start: /.ext+file 911 index_directory(depth_limit+5, "/."+ext+url, mime_map); 912 if (!silent) 913 cout << 
"[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl; 914 rm_rf(cachedir); 915 } 916 } 917 else { // no -p would be fatal here 918 cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl; 919 } 920 root = oldroot; 921 } 641 922 642 923 int … … 653 934 { "version", no_argument, NULL, 'v' }, 935 { "silent", no_argument, NULL, 'S' }, 654 936 { "overwrite", no_argument, NULL, 'o' }, … … 717 999 mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template 718 1000 mime_map["ppt"] = "application/vnd.ms-powerpoint"; 719 1001 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 1002 #ifdef HAVE_READPST 1003 // Outlook messager folder 1004 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst) 1005 #endif 1006 #ifdef HAVE_MSGCONVERT 1007 mime_map["msg"] = "application/vnd.ms-outlook"; // outlook2text via msgconvert.pl 1008 #endif 1009 mime_map["mbox"] = "message/rfc822"; // => mbox2omega 1010 mime_map["mbx"] = "message/rfc822"; // => mbox2omega 720 1011 // Perl: 721 1012 mime_map["pl"] = "text/x-perl"; 722 1013 mime_map["pm"] = "text/x-perl"; … … 727 1018 // DjVu: 728 1019 mime_map["djv"] = "image/vnd.djvu"; 729 1020 mime_map["djvu"] = "image/vnd.djvu"; 1021 #ifndef _MSC_VER 1022 mime_map["zip"] = "application/x-zip"; // recursive scanning 1023 # ifdef HAVE_UNRAR 1024 mime_map["rar"] = "application/x-rar"; // recursive scanning 1025 # endif 1026 #endif 730 1027 … … 753 1050 " -f, --follow follow symbolic links\n" 1051 " --silent Print only errors\n" 754 1052 " --overwrite create the database anew (the default is to update\n" … … 844 1142 if (baseurl.empty()) { 845 1143 cerr << PROG_NAME": --url not specified, assuming `/'.\n"; 846 1144 } 1145 // FIXME: need to set log_dir! 1146 error_log = " 2>>"+log_dir+"omindex-error.log"; … … 869 1169 } 1170 // add the db basename to cache_dir 1171 { 1172 ensure_tmpdir(); // FIXME: be lazy! 
1173 cache_dir = tmpdir; 1174 const char *p = strrchr(dbpath.c_str(), '/'); 1175 // on windows only 1176 if (!p) p = strrchr(dbpath.c_str(), '\\'); 1177 if (p) { p++; } else { p = dbpath.c_str(); } 1178 cache_dir += p; 1179 } 870 1180 871 1181 int exitcode = 1; 872 1182 try { -
xapian-omega-1.0.7a/outlook2text.in
diff -u xapian-omega-1.0.7a/outlook2text.in.orig
#! /bin/sh
# Converts an Outlook .msg file to mbox format and extracts its attachments.
#
# Usage: outlook2text FILE.msg [OUTPUT_DIR]
#
#   $1  the .msg file to convert
#   $2  optional directory handed to mimeexplode's -d option; when omitted,
#       the caller is expected to already be in the cache directory and a
#       directory named after the message (without ".msg") is used instead.
#
# @MSGCONVERT@ and @MIMEEXPLODE@ are placeholders substituted at build time.

# "$2" must be quoted: unquoted and empty, the test collapses to `[ -n ]`,
# which is always true, so the fallback branch below could never run.
if [ -n "$2" ]; then
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
else
    # already is in the cache dir
    base=`basename "$1" .msg`
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
fi
xapian-omega-1.0.7a/query.cc
diff -u xapian-omega-1.0.7a/query.cc.orig
old new 141 141 switch (t[0]) { 142 142 case 'a': 143 143 return (t == "a" || t == "about" || t == "an" || t == "and" || 144 t == "are" || t == "as" || t == "at"); 144 t == "are" || t == "as" || t == "at" || t == "according" || 145 t == "again" || t == "against" || t == "ah" || t == "all" || 146 t == "although" || t == "always" || t == "anyone" || t == "after" || 147 t == "also" || t == "any"); 145 148 case 'b': 146 149 return (t == "be" || t == "by"); 147 150 case 'e': -
xapian-omega-1.0.7a/runfilter.cc
diff -u xapian-omega-1.0.7a/runfilter.cc.orig
old new 60 60 61 61 using namespace std; 62 62 63 extern string error_log; 64 extern bool verbose; 65 63 66 string 64 67 stdout_to_string(const string &cmd) 65 68 { … … 97 100 setrlimit(RLIMIT_AS, &ram_limit); 98 101 } 99 102 100 execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL); 103 string tmp; 104 tmp = cmd + error_log; 105 if (verbose) { 106 cout << " Executing '" << tmp << "'..." << endl; 107 } 108 109 execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL); 101 110 _exit(-1); 102 111 } 103 112 … … 134 143 throw ReadError(); 135 144 } 136 145 #else 137 FILE * fh = popen(cmd.c_str(), "r"); 146 string tmp = cmd; 147 tmp += error_log; 148 if (verbose) { 149 cout << " Executing '" << tmp << "'..." << endl; 150 } 151 FILE * fh = popen(tmp.c_str(), "r"); 138 152 if (fh == NULL) throw ReadError(); 139 153 while (!feof(fh)) { 140 154 char buf[4096]; -
xapian-omega-1.0.7a/scriptindex.cc
diff -u xapian-omega-1.0.7a/scriptindex.cc.orig
old new 4 4 * Copyright 2001 Sam Liddicott 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 7 * Copyright 2006,2007 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 38 39 #include <stdio.h> 39 40 #include <time.h> 40 41 #include "safeunistd.h" 42 #include <sys/stat.h> 41 43 … … 58 60 #include "utf8truncate.h" 59 61 #include "utils.h" 62 #include "values.h" 60 63 61 64 #include "gnu_getopt.h" … … 422 425 { 423 426 string line; 424 427 size_t line_no = 0; 428 time_t last_mod = 0; 429 long file_size = 0; 430 431 if (strcmp(fname,"<stdin>") != 0) { 432 struct stat statbuf; 433 stat(fname, &statbuf); 434 if (! statbuf.st_size) { 435 cout << "Empty \"" << fname << "\" - skipping\n"; 436 return false; 437 } 438 file_size = statbuf.st_size; 439 last_mod = statbuf.st_mtime; 440 } 425 441 while (!stream.eof() && getline(stream, line)) { 426 442 ++line_no; 427 443 Xapian::Document doc; … … 638 654 for (i = fields.begin(); i != fields.end(); ++i) { 639 655 list<string>::const_iterator j; 640 656 for (j = i->second.begin(); j != i->second.end(); j++) { 657 if (i->first == "lastmod") last_mod = 0; 658 if (i->first == "size") file_size = 0; 641 659 data += i->first; 642 660 data += '='; 643 661 data += *j; 644 662 data += '\n'; 645 663 } 646 664 } 665 // provide some extra fields if not already provided by the script 666 if (last_mod) { // if indexed per filename 667 data += "lastmod="+int_to_string(last_mod)+'\n'; 668 doc.add_value(VALUE_LASTMOD, int_to_string(last_mod)); 669 } 670 if (file_size) { // if indexed per filename 671 data += "size="+int_to_string(file_size)+'\n'; 672 doc.add_value(VALUE_FILESIZE, int_to_string(file_size)); 673 } 647 674 648 675 // Put the data in the document 649 676 doc.set_data(data); -
xapian-omega-1.0.7a/utils.cc
diff -u xapian-omega-1.0.7a/utils.cc.orig
old new 30 30 31 31 using namespace std; 32 32 33 #ifdef __WIN32__ 34 #include "safewindows.h" 35 #endif 36 33 37 // This ought to be enough for any of the conversions below. … … 40 44 #define BUFSIZE 100 41 45 46 /// Allow system to work directly on C++ strings. 47 inline int system(const string &command) { return system(command.c_str()); } 48 49 /// Remove a directory and contents. 50 void 51 rm_rf(const string &filename) 52 { 53 // Check filename exists and is actually a directory 54 struct stat sb; 55 if (stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode)) return; 56 57 string safefile = shell_protect(filename); 58 #ifdef __WIN32__ 59 # if 1 60 static int win95 = -1; 61 if (win95 == -1) { 62 OSVERSIONINFO info; 63 memset(&info, 0, sizeof(OSVERSIONINFO)); 64 info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); 65 if (GetVersionEx(&info)) { 66 win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS); 67 } 68 } 69 70 if (win95) { 71 // for 95 like systems: 72 system("deltree /y \"" + safefile + "\""); 73 } else { 74 // for NT like systems: 75 system("rd /s /q \"" + safefile + "\""); 76 } 77 # else 78 safefile.append("\0", 2); 79 SHFILEOPSTRUCT shfo; 80 memset((void*)&shfo, 0, sizeof(shfo)); 81 shfo.hwnd = 0; 82 shfo.wFunc = FO_DELETE; 83 shfo.pFrom = safefile.data(); 84 shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT; 85 (void)SHFileOperation(&shfo); 86 # endif 87 #else 88 system("rm -rf " + safefile); 89 #endif 90 } -
xapian-omega-1.0.7a/utils.h
diff -u xapian-omega-1.0.7a/utils.h.orig
old new 37 49 /** Converts a string to an int. */ 38 50 int string_to_int(const std::string & s); 39 51 52 void rm_rf(const std::string &filename); 53 40 54 #endif -
xapian-omega-1.0.7a/xapian-omega.spec.in
diff -u xapian-omega-1.0.7a/xapian-omega.spec.in.orig
old new 77 78 /var/www/icons/omega 78 79 %{_datadir}/%{name} 79 80 %config(noreplace) /etc/omega.conf 80 %doc %{_datadir}/doc/%{name}-%{version} 81 %docdir /usr/share/doc/%{name}-%{version} 82 %doc AUTHORS ChangeLog COPYING NEWS README TODO 81 83 # man pages may be gzipped, hence the trailing wildcard. 82 84 %{_mandir}/man1/omindex.1* 83 85 %{_mandir}/man1/scriptindex.1*