Ticket #282: xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-27.patch
File xapian-omega-1.0.7a-from-ticket-285-and-cleaned-up-updated-2010-10-27.patch, 60.9 KB (added by , 14 years ago) |
---|
-
xapian-omega-1.0.7a/ChangeLog
diff -u xapian-omega-1.0.7a/ChangeLog.orig
old new 1526 1526 1527 1527 * configure.ac: Check for strftime. 1528 1528 1529 2006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com> 1530 1531 omega-0.9.6c: 1532 * omindex.cc: Fix wrong timestamp comparison in cache logic 1533 * scriptindex.cc: Add lastmod and size records and values. 1534 * excel2text, outlook2text.in: New scripts 1535 1536 2006-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com> 1537 1538 omega-0.9.6b: 1539 * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks. 1540 Add options --verbose, --silent 1541 1542 2006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com> 1543 1544 omega-0.9.6a: 1545 * omindex.cc: Added cache_dir, cached virtual directories (zip,msg,pst,...). 1546 New option: -c/--nocleanup. 1547 Consistently log stderr to /var/log/omega/omindex-error.log. 1548 * configure.ac: Add HAVE_UNRAR, HAVE_MSGCONVERT, 1549 HAVE_READPST. 1550 * configfile.cc: New cache_dir 1551 * Makefile.am: Link omindex against configfile. 1552 1529 1553 Sun Jul 09 01:40:09 BST 2006 Olly Betts <olly@survex.com> 1530 1554 1531 1555 * docs/omegascript.txt: Note that (by design) an omegascript template -
xapian-omega-1.0.7a/Makefile.am
diff -u xapian-omega-1.0.7a/Makefile.am.orig
old new 61 61 pkglibbindir = $(pkglibdir)/bin 62 62 pkglibbin_PROGRAMS = omega 63 63 bin_PROGRAMS = omindex scriptindex 64 dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl 64 65 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 65 66 66 67 check_PROGRAMS = htmlparsetest md5test utf8converttest … … 92 93 common/safewindows.h\ 93 94 common/stringutils.h 94 95 95 AM_LDFLAGS = $(ICONV_LDFLAGS)96 AM_LDFLAGS = -no-undefined $(ICONV_LDFLAGS) 96 97 97 98 omega_SOURCES = omega.cc query.cc cgiparam.cc utils.cc configfile.cc date.cc\ 98 99 cdb_init.cc cdb_find.cc cdb_hash.cc cdb_unpack.cc loadfile.cc\ … … 102 103 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\ 104 configfile.cc\ 103 105 runfilter.cc freemem.cc common/msvc_dirent.cc 104 106 if NEED_MKDTEMP … … 109 111 scriptindex_SOURCES = scriptindex.cc myhtmlparse.cc htmlparse.cc\ 110 112 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc\ 113 configfile.cc\ 111 114 common/safe.cc common/stringutils.cc utf8convert.cc utf8truncate.cc … … 127 130 MAINTAINERCLEANFILES = $(dist_man_MANS) 128 131 endif 129 132 133 CLEANFILES = $(dist_libexec_SCRIPTS) $(dist_bin_SCRIPTS) 134 135 omega.conf: $(srcdir)/omega.conf.in Makefile 136 sed "s,@localstatedir@,$(localstatedir)," $(srcdir)/omega.conf.in > $@ 137 outlook2text: $(srcdir)/outlook2text.in mimeexplode Makefile 138 sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@ 139 130 140 if DOCUMENTATION_RULES -
xapian-omega-1.0.7a/configfile.cc
diff -u xapian-omega-1.0.7a/configfile.cc.orig
old new 42 42 string template_dir = "/var/lib/omega/templates/"; 43 43 string log_dir = "/var/log/omega/"; 44 44 string cdb_dir = "/var/lib/omega/cdb/"; 45 string cache_dir = "/var/lib/omega/cache/"; 45 46 46 47 /** Return true if the file fname exists. 47 48 */ … … 64 65 } 65 66 66 67 while (in) { 67 char line[1024]; 68 in.getline(line, sizeof(line)); 69 70 char *p = line; 71 while (isspace((unsigned char)*p)) ++p; 72 if (!*p || *p == '#') continue; // Ignore blank line and comments 73 74 char *q = p; 75 while (*q && !isspace((unsigned char)*q)) ++q; 76 string name(p, q - p); 77 78 p = q; 79 while (isspace((unsigned char)*p)) ++p; 80 q = p; 81 while (*q && !isspace((unsigned char)*q)) ++q; 82 string value(p, q - p); 83 84 while (*q && isspace((unsigned char)*q)) ++q; 85 if (value.empty() || *q) { 86 throw string("Bad line in configuration file `") + cfile + "'"; 87 } 88 68 string name, value; 69 in >> name >> value; 70 if (value[value.length()-1] != '/') value += "/"; 89 71 if (name == "database_dir") { 90 database_dir = value + "/";72 database_dir = value; 91 73 } else if (name == "template_dir") { 92 template_dir = value + "/";74 template_dir = value; 93 75 } else if (name == "log_dir") { 94 log_dir = value + "/";76 log_dir = value; 95 77 } else if (name == "cdb_dir") { 96 cdb_dir = value + "/"; 78 cdb_dir = value; 79 } else if (name == "cache_dir") { 80 cache_dir = value; 97 81 } 98 82 } 99 83 -
xapian-omega-1.0.7a/configfile.h
diff -u xapian-omega-1.0.7a/configfile.h.orig
old new 30 30 extern string template_dir; 31 31 extern string log_dir; 32 32 extern string cdb_dir; 33 extern string cache_dir; 33 34 34 35 void read_config_file(); 35 36 -
xapian-omega-1.0.7a/excel2text
diff -u xapian-omega-1.0.7a/excel2text.orig
old new 1 #! /bin/sh 2 # strip numbers, to stdout 3 xls2csv -q0 "$1" | sed -re's/[0123456789.]+,//g' -
xapian-omega-1.0.7a/mimeexplode
diff -u xapian-omega-1.0.7a/mimeexplode.orig
old new 1 #!/usr/bin/perl -w 2 3 =head1 NAME 4 5 mimeexplode - explode one or more MIME messages 6 7 =head1 SYNOPSIS 8 9 mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ... 10 11 someprocess | mimeexplode - 12 13 =head1 DESCRIPTION 14 15 Takes one or more files from the command line that contain MIME 16 messages, and explodes their contents out into subdirectories 17 of the current working directory. The subdirectories are 18 just called C<msg0>, C<msg1>, C<msg2>, etc. Existing directories are 19 skipped over. 20 21 The message information is output to the stdout, like this: 22 23 Message: msg3 (inputfile1.msg) 24 Part: msg3/filename-1.dat (text/plain) 25 Part: msg3/filename-2.dat (text/plain) 26 Message: msg5 (input-file2.msg) 27 Part: msg5/dir.gif (image/gif) 28 Part: msg5/face.jpg (image/jpeg) 29 Message: msg6 (infile3) 30 Part: msg6/filename-1.dat (text/plain) 31 32 This was written as an example of the MIME:: modules in the 33 MIME-parser package I wrote. It may prove useful as a quick-and-dirty 34 way of splitting a MIME message if you need to decode something, and 35 you don't have a MIME mail reader on hand. 36 37 =head1 COMMAND LINE OPTIONS 38 39 -d outdir 40 41 =head1 AUTHOR 42 43 Eryq C<eryq@zeegee.com>, in a big hurry... 44 Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir 45 46 =cut 47 48 #BEGIN { unshift @INC, ".." } # to test MIME:: stuff before installing it! 
49 50 require 5.001; 51 52 use strict; 53 use vars; 54 55 use MIME::Parser; 56 use Getopt::Std; 57 my %opts; 58 my $outbase = ''; 59 my $postfix = ''; 60 61 #------------------------------------------------------------ 62 # make_msg - make and return the name of a msgXXX directory 63 #------------------------------------------------------------ 64 65 #ignored 66 #sub make_msg { 67 # while (-d "msg$Msgno") { 68 # ++$Msgno; 69 # die "self-imposed limit reached" if $Msgno == 256; 70 # } 71 # mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!"; 72 # "msg$Msgno"; 73 #} 74 75 #------------------------------------------------------------ 76 # dump_entity - dump an entity's file info 77 #------------------------------------------------------------ 78 sub dump_entity { 79 my $ent = shift; 80 my @parts = $ent->parts; 81 82 if (@parts) { # multipart... 83 map { dump_entity($_) } @parts; 84 } 85 else { # single part... 86 print " Part: ", $ent->bodyhandle->path, 87 " (", scalar($ent->head->mime_type), ")\n"; 88 } 89 } 90 91 #------------------------------------------------------------ 92 # main 93 #------------------------------------------------------------ 94 sub main { 95 my $file; 96 my $entity; 97 98 # make sure the same message gets exploded into the same dir 99 getopts('d:', \%opts); 100 $outbase = $opts{d} ? 
$opts{d} : "msg0"; 101 my $outdir = $outbase; 102 103 # Go through messages: 104 @ARGV or unshift @ARGV, "-"; 105 while (defined($file = shift @ARGV)) { 106 107 # Sanity: 108 (-d $outdir) or mkdir "$outdir",0755; 109 (-w "$outdir") or die "cwd $outdir not writable!"; 110 #my $msgdir = make_msg(); 111 #print "Message: $msgdir ($file)\n"; 112 113 # Create a new parser object: 114 my $parser = new MIME::Parser; 115 ### $parser->parse_nested_messages('REPLACE'); 116 117 # Optional: set up parameters that will affect how it extracts 118 # documents from the input stream: 119 $parser->output_dir($outdir); 120 121 # Parse an input stream: 122 open FILE, $file or die "couldn't open $file"; 123 $entity = $parser->read(\*FILE) or 124 print STDERR "Couldn't parse MIME in $file; continuing...\n"; 125 close FILE; 126 127 # Congratulations: you now have a (possibly multipart) MIME entity! 128 dump_entity($entity) if $entity; 129 ### $entity->dump_skeleton if $entity; 130 131 $postfix++; 132 $outdir = $outbase.$postfix; 133 } 134 1; 135 } 136 137 exit (&main ? 0 : -1); 138 #------------------------------------------------------------ 139 1; 140 -
xapian-omega-1.0.7a/msgconvert.pl
diff -u xapian-omega-1.0.7a/msgconvert.pl.orig
old new 1 #!/usr/bin/perl -w 2 # 3 # msgconvert.pl: 4 # 5 # Convert .MSG files (made by Outlook (Express)) to multipart MIME messages. 6 # 7 # Copyright 2002, 2004, 2006 Matijs van Zuijlen 8 # 9 # This program is free software; you can redistribute it and/or modify it 10 # under the terms of the GNU General Public License as published by the 11 # Free Software Foundation; either version 2 of the License, or (at your 12 # option) any later version. 13 # 14 # This program is distributed in the hope that it will be useful, but 15 # WITHOUT ANY WARRANTY; without even the implied warranty of 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 # Public License for more details. 18 # 19 # CHANGES: 20 # 20020715 Recognize new items 'Cc', mime type of attachment, long 21 # filename of attachment, and full headers. Attachments turn out 22 # to be numbered, so a regexp is now used to recognize label of 23 # items that are attachments. 24 # 20020831 long file name will definitely be used if present. Full headers 25 # and mime type information are used when present. Created 26 # generic system for specifying known items to be skipped. 27 # Unexpected contents is never reason to bail out anymore. Added 28 # support for usage message and option processing (--verbose). 29 # 20040104 Handle address data slightly better, make From line less fake, 30 # make $verbose and $skippable_entries global vars, handle HTML 31 # variant of body text if present (though not optimally). 32 # 20040214 Fix typos and incorrect comments. 33 # 20040307 - Complete rewrite: All functional parts are now in the package 34 # MSGParser; 35 # - Creation of MIME::Entity object is delayed until the output 36 # routines, which means all data is known; This means I can 37 # create a multipart/alternative body. 38 # - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for 39 # the information). 
40 # 20040514 Check if $self->{HEAD} actually exists before trying to add its 41 # contents to the output Mime object's header data. 42 # (Bug reported by Thomas Ng). 43 # Don't produce multipart messages if not needed. 44 # (Bug reported by Justin B. Scout). 45 # 20040529 Correctly format OLEDATE. 46 # 20040530 - Extract date from property 0047 (thanks, Marc Goodman). 47 # - Use address data to make To: and Cc: lines complete 48 # - Use the in-reply-to property 49 # - More unknown properties named. 50 # - Found another property containing an SMTP address. 51 # - Put non-SMTP type addresses back in output. 52 # 20040825 Replace 'our' to declare globals with 'use vars'. This means 53 # the globals are now properly scoped inside the package and not 54 # the file. 55 # This also fixes the bug that this program did not work on perl 56 # versions below 5.6. (Bug reported by Tim Gustafson) 57 # 20060218 More sensible encoding warnings. 58 # 20060219 Move OLE parsing to main program. 59 # Parse nested MSG files (Bug reported by Christof Lukas). 60 # 20060225 Simplify code. 61 # 62 63 # 64 # Import modules. 
65 # 66 package MSGParser; 67 use strict; 68 use OLE::Storage_Lite; 69 use MIME::Entity; 70 use MIME::Parser; 71 use Date::Format; 72 use POSIX qw(mktime); 73 use constant DIR_TYPE => 1; 74 use constant FILE_TYPE => 2; 75 76 use vars qw($skipproperties $skipheaders); 77 # 78 # Descriptions partially based on mapitags.h 79 # 80 $skipproperties = { 81 # Envelope properties 82 '000B' => "Conversation key?", 83 '001A' => "Type of message", 84 '003B' => "Sender address variant", 85 '003D' => "Contains 'Re: '", 86 '003F' => "'recieved by' id", 87 '0040' => "'recieved by' name", 88 '0041' => "Sender variant address id", 89 '0042' => "Sender variant name", 90 '0043' => "'recieved representing' id", 91 '0044' => "'recieved representing' name", 92 '0046' => "Read receipt address id", 93 '0051' => "'recieved by' search key", 94 '0052' => "'recieved representing' search key", 95 '0053' => "Read receipt search key", 96 '0064' => "Sender variant address type", 97 '0065' => "Sender variant address", 98 '0070' => "Conversation topic", 99 '0071' => "Conversation index", 100 '0075' => "'recieved by' address type", 101 '0076' => "'recieved by' email address", 102 '0077' => "'recieved representing' address type", 103 '0078' => "'recieved representing' email address", 104 '007F' => "something like a message id", 105 # Recipient properties 106 '0C19' => "Reply address variant", 107 '0C1D' => "Reply address variant", 108 '0C1E' => "Reply address type", 109 # Non-transmittable properties 110 '0E02' => "?Should BCC be displayed", 111 '0E0A' => "sent mail id", 112 '0E1D' => "Subject w/o Re", 113 '0E27' => "64 bytes: Unknown", 114 '0FF6' => "Index", 115 '0FF9' => "Index", 116 '0FFF' => "Address variant", 117 # Content properties 118 '1008' => "Summary or something", 119 '1009' => "RTF Compressed", 120 # 'Common property' 121 '3001' => "Display name", 122 '3002' => "Address Type", 123 '300B' => "'Search key'", 124 # Attachment properties 125 '3702' => "Attachment encoding", 126 '3703' => 
"Attachment extension", 127 '3709' => "'Attachment rendering'", # Maybe an icon or something? 128 '3713' => "Icon URL?", 129 # 'Mail user' 130 '3A20' => "Address variant", 131 # 3900 -- 39FF: 'Address book' 132 '39FF' => "7 bit display name", 133 # 'Display table properties' 134 '3FF8' => "Routing data?", 135 '3FF9' => "Routing data?", 136 '3FFA' => "Routing data?", 137 '3FFB' => "Routing data?", 138 # 'Transport-defined envelope property' 139 '4029' => "Sender variant address type", 140 '402A' => "Sender variant address", 141 '402B' => "Sender variant name", 142 '5FF6' => "Recipient name", 143 '5FF7' => "Recipient address variant", 144 # 'Provider-defined internal non-transmittable property' 145 '6740' => "Unknown, binary data", 146 # User defined id's 147 '8000' => "Content Class", 148 '8002' => "Unknown, binary data", 149 }; 150 151 $skipheaders = { 152 "MIME-Version" => 1, 153 "Content-Type" => 1, 154 "Content-Transfer-Encoding" => 1, 155 "X-Mailer" => 1, 156 "X-Msgconvert" => 1, 157 "X-MS-Tnef-Correlator" => 1, 158 "X-MS-Has-Attach" => 1, 159 }; 160 161 use constant ENCODING_UNICODE => '001F'; 162 use constant KNOWN_ENCODINGS => { 163 '000D' => 'Directory', 164 '001F' => 'Unicode', 165 '001E' => 'Ascii?', 166 '0102' => 'Binary', 167 }; 168 169 use constant MAP_ATTACHMENT_FILE => { 170 '3701' => ["DATA", 0], # Data 171 '3704' => ["SHORTNAME", 1], # Short file name 172 '3707' => ["LONGNAME", 1], # Long file name 173 '370E' => ["MIMETYPE", 1], # mime type 174 '3716' => ["DISPOSITION", 1], # disposition 175 }; 176 177 use constant MAP_SUBITEM_FILE => { 178 '1000' => ["BODY_PLAIN", 0], # Body 179 '1013' => ["BODY_HTML", 0], # HTML Version of body 180 '0037' => ["SUBJECT", 1], # Subject 181 '0047' => ["SUBMISSION_ID", 1], # Seems to contain the date 182 '007D' => ["HEAD", 1], # Full headers 183 '0C1A' => ["FROM", 1], # Reply-To: Name 184 '0C1E' => ["FROM_ADDR_TYPE", 1], # From: Address type 185 '0C1F' => ["FROM_ADDR", 1], # Reply-To: Address 186 '0E04' => ["TO", 1], 
# To: Names 187 '0E03' => ["CC", 1], # Cc: Names 188 '1035' => ["MESSAGEID", 1], # Message-Id 189 '1042' => ["INREPLYTO", 1], # In reply to Message-Id 190 }; 191 192 use constant MAP_ADDRESSITEM_FILE => { 193 '3001' => ["NAME", 1], # Real name 194 '3002' => ["TYPE", 1], # Address type 195 '403D' => ["TYPE", 1], # Address type 196 '3003' => ["ADDRESS", 1], # Address 197 '403E' => ["ADDRESS", 1], # Address 198 '39FE' => ["SMTPADDRESS", 1], # SMTP Address variant 199 }; 200 201 # 202 # Main body of module 203 # 204 205 sub new { 206 my $that = shift; 207 my $class = ref $that || $that; 208 209 my $self = { 210 ATTACHMENTS => [], 211 ADDRESSES => [], 212 VERBOSE => 0, 213 HAS_UNICODE => 0, 214 FROM_ADDR_TYPE => "", 215 }; 216 bless $self, $class; 217 } 218 219 # 220 # Main sub: parse the PPS tree, and return 221 # 222 sub parse { 223 my $self = shift; 224 my $PPS = shift or die "Internal error: No PPS tree"; 225 $self->_RootDir($PPS); 226 } 227 228 sub mime_object { 229 my $self = shift; 230 231 my $bodymime; 232 my $mime; 233 234 if ($self->_IsMultiPart) { 235 # Construct a multipart message object 236 237 $mime = MIME::Entity->build(Type => "multipart/mixed"); 238 239 # Set the entity that we'll save the body parts to. If there's more than 240 # one part, it's a new entity, otherwise, it's the main $mime object. 
241 if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) { 242 $bodymime = MIME::Entity->build( 243 Type => "multipart/alternative", 244 Encoding => "8bit", 245 ); 246 $mime->add_part($bodymime); 247 } else { 248 $bodymime = $mime; 249 } 250 if ($self->{BODY_PLAIN}) { 251 $self->_SaveAttachment($bodymime, { 252 MIMETYPE => 'text/plain; charset=ISO-8859-1', 253 ENCODING => '8bit', 254 DATA => $self->{BODY_PLAIN}, 255 DISPOSITION => 'inline', 256 }); 257 } 258 if ($self->{BODY_HTML}) { 259 $self->_SaveAttachment($bodymime, { 260 MIMETYPE => 'text/html', 261 ENCODING => '8bit', 262 DATA => $self->{BODY_HTML}, 263 DISPOSITION => 'inline', 264 }); 265 } 266 foreach my $att (@{$self->{ATTACHMENTS}}) { 267 $self->_SaveAttachment($mime, $att); 268 } 269 } elsif ($self->{BODY_PLAIN}) { 270 # Construct a single part message object with a plain text body 271 $mime = MIME::Entity->build( 272 Type => "text/plain", 273 Data => $self->{BODY_PLAIN} 274 ); 275 } elsif ($self->{BODY_HTML}) { 276 # Construct a single part message object with an HTML body 277 $mime = MIME::Entity->build( 278 Type => "text/html", 279 Data => $self->{BODY_HTML} 280 ); 281 } 282 283 $self->_CopyHeaderData($mime); 284 285 $self->_SetHeaderFields($mime); 286 287 return $mime; 288 } 289 290 # Actually output the message in mbox format 291 sub print { 292 my $self = shift; 293 294 my $mime = $self->mime_object; 295 296 # Construct From line from whatever we know. 297 my $string = ""; 298 $string = ( 299 $self->{FROM_ADDR_TYPE} eq "SMTP" ? 300 $self->{FROM_ADDR} : 301 'someone@somewhere' 302 ); 303 $string =~ s/\n//g; 304 305 # The date used here is not really important. 306 print "From ", $string, " ", scalar localtime, "\n"; 307 $mime->print(\*STDOUT); 308 print "\n"; 309 } 310 311 sub set_verbosity { 312 my ($self, $verbosity) = @_; 313 defined $verbosity or die "Internal error: no verbosity level"; 314 $self->{VERBOSE} = $verbosity; 315 } 316 317 # 318 # Below are functions that walk the PPS tree. 
The *Dir functions handle 319 # processing the directory nodes of the tree (mainly, iterating over the 320 # children), whereas the *Item functions handle processing the items in the 321 # directory (if such an item is itself a directory, it will in turn be 322 # processed by the relevant *Dir function). 323 # 324 325 # 326 # RootItem: Check Root Entry, parse sub-entries. 327 # The OLE file consists of a single entry called Root Entry, which has 328 # several children. These children are parsed in the sub SubItem. 329 # 330 sub _RootDir { 331 my ($self, $PPS) = @_; 332 333 foreach my $child (@{$PPS->{Child}}) { 334 $self->_SubItem($child); 335 } 336 } 337 338 sub _SubItem { 339 my ($self, $PPS) = @_; 340 341 if ($PPS->{Type} == DIR_TYPE) { 342 $self->_SubItemDir($PPS); 343 } elsif ($PPS->{Type} == FILE_TYPE) { 344 $self->_SubItemFile($PPS); 345 } else { 346 warn "Unknown entry type: $PPS->{Type}"; 347 } 348 } 349 350 sub _SubItemDir { 351 my ($self, $PPS) = @_; 352 353 $self->_GetOLEDate($PPS); 354 355 my $name = $self->_GetName($PPS); 356 357 if ($name =~ /__recip_version1 0_ /) { # Address of one recipient 358 $self->_AddressDir($PPS); 359 } elsif ($name =~ '__attach_version1 0_ ') { # Attachment 360 $self->_AttachmentDir($PPS); 361 } else { 362 $self->_UnknownDir($self->_GetName($PPS)); 363 } 364 } 365 366 sub _SubItemFile { 367 my ($self, $PPS) = @_; 368 369 my $name = $self->_GetName($PPS); 370 my ($property, $encoding) = $self->_ParseItemName($name); 371 372 $self->_MapProperty($self, $PPS->{Data}, $property, 373 MAP_SUBITEM_FILE) or $self->_UnknownFile($name); 374 } 375 376 sub _AddressDir { 377 my ($self, $PPS) = @_; 378 379 my $address = { 380 NAME => undef, 381 ADDRESS => undef, 382 TYPE => "", 383 }; 384 foreach my $child (@{$PPS->{Child}}) { 385 $self->_AddressItem($child, $address); 386 } 387 push @{$self->{ADDRESSES}}, $address; 388 } 389 390 sub _AddressItem { 391 my ($self, $PPS, $addr_info) = @_; 392 393 my $name = $self->_GetName($PPS); 394 395 # 
DIR Entries: There should be none. 396 if ($PPS->{Type} == DIR_TYPE) { 397 $self->_UnknownDir($name); 398 } elsif ($PPS->{Type} == FILE_TYPE) { 399 my ($property, $encoding) = $self->_ParseItemName($name); 400 $self->_MapProperty($addr_info, $PPS->{Data}, $property, 401 MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name); 402 } else { 403 warn "Unknown entry type: $PPS->{Type}"; 404 } 405 } 406 407 sub _AttachmentDir { 408 my ($self, $PPS) = @_; 409 410 my $attachment = { 411 SHORTNAME => undef, 412 LONGNAME => undef, 413 MIMETYPE => 'application/octet-stream', 414 ENCODING => 'base64', 415 DISPOSITION => 'attachment', 416 DATA => undef 417 }; 418 foreach my $child (@{$PPS->{Child}}) { 419 $self->_AttachmentItem($child, $attachment); 420 } 421 push @{$self->{ATTACHMENTS}}, $attachment; 422 } 423 424 sub _AttachmentItem { 425 my ($self, $PPS, $att_info) = @_; 426 427 my $name = $self->_GetName($PPS); 428 429 my ($property, $encoding) = $self->_ParseItemName($name); 430 431 if ($PPS->{Type} == DIR_TYPE) { 432 433 if ($property eq '3701') { # Nested MSG file 434 my $msgp = new MSGParser(); 435 $msgp->parse($PPS); 436 my $data = $msgp->mime_object->as_string; 437 $att_info->{DATA} = $data; 438 $att_info->{MIMETYPE} = 'message/rfc822'; 439 $att_info->{ENCODING} = '8bit'; 440 } else { 441 $self->_UnknownDir($name); 442 } 443 444 } elsif ($PPS->{Type} == FILE_TYPE) { 445 $self->_MapProperty($att_info, $PPS->{Data}, $property, 446 MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name); 447 } else { 448 warn "Unknown entry type: $PPS->{Type}"; 449 } 450 } 451 452 sub _MapProperty { 453 my ($self, $hash, $data, $property, $map) = @_; 454 455 defined $property or return 0; 456 my $arr = $map->{$property} or return 0; 457 458 $arr->[1] and $data =~ s/\000//g; 459 $hash->{$arr->[0]} = $data; 460 461 return 1; 462 } 463 464 sub _UnknownDir { 465 my ($self, $name) = @_; 466 467 if ($name eq '__nameid_version1 0') { 468 $self->{VERBOSE} 469 and warn "Skipping DIR entry $name 
(Introductory stuff)\n"; 470 return; 471 } 472 warn "Unknown DIR entry $name\n"; 473 } 474 475 sub _UnknownFile { 476 my ($self, $name) = @_; 477 478 if ($name eq '__properties_version1 0') { 479 $self->{VERBOSE} 480 and warn "Skipping FILE entry $name (Properties)\n"; 481 return; 482 } 483 484 my ($property, $encoding) = $self->_ParseItemName($name); 485 unless (defined $property) { 486 warn "Unknown FILE entry $name\n"; 487 return; 488 } 489 if ($skipproperties->{$property}) { 490 $self->{VERBOSE} 491 and warn "Skipping property $property ($skipproperties->{$property})\n"; 492 return; 493 } elsif ($property =~ /^80/) { 494 $self->{VERBOSE} 495 and warn "Skipping property $property (user-defined property)\n"; 496 return; 497 } else { 498 warn "Unknown property $property\n"; 499 return; 500 } 501 } 502 503 # 504 # Helper functions 505 # 506 507 sub _GetName { 508 my ($self, $PPS) = @_; 509 return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name})); 510 } 511 512 sub _NormalizeWhiteSpace { 513 my ($self, $name) = @_; 514 $name =~ s/\W/ /g; 515 return $name; 516 } 517 518 sub _GetOLEDate { 519 my ($self, $PPS) = @_; 520 unless (defined ($self->{OLEDATE})) { 521 # Make Date 522 my $datearr; 523 $datearr = $PPS->{Time2nd}; 524 $datearr = $PPS->{Time1st} unless($datearr); 525 $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr; 526 } 527 } 528 529 sub _FormatDate { 530 my ($self, $datearr) = @_; 531 532 # TODO: This is a little convoluted. Directly using strftime didn't seem 533 # to work. 534 my $datetime = mktime(@$datearr); 535 return time2str("%a, %d %h %Y %X %z", $datetime); 536 } 537 538 # If we didn't get the date from the original header data, we may be able 539 # to get it from the SUBMISSION_ID: 540 # It seems to have the format of a semicolon-separated list of key=value 541 # pairs. The key l has a value with the format: 542 # <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in 543 # the format YYMMDDHHMMSS. 
544 sub _SubmissionIdDate { 545 my $self = shift; 546 547 my $submission_id = $self->{SUBMISSION_ID} or return undef; 548 $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/ 549 or return undef; 550 my $year = $1; 551 $year += 100 if $year < 20; 552 return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]); 553 } 554 555 sub _ParseItemName { 556 my ($self, $name) = @_; 557 558 if ($name =~ /^__substg1 0_(....)(....)$/) { 559 my ($property, $encoding) = ($1, $2); 560 if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) { 561 warn "This MSG file contains Unicode fields." 562 . " This is currently unsupported.\n"; 563 $self->{HAS_UNICODE} = 1; 564 } elsif (not (KNOWN_ENCODINGS()->{$encoding})) { 565 warn "Unknown encoding $encoding. Results may be strange or wrong.\n"; 566 } 567 return ($property, $encoding); 568 } else { 569 return (undef, undef); 570 } 571 } 572 573 sub _SaveAttachment { 574 my ($self, $mime, $att) = @_; 575 576 my $ent = $mime->attach( 577 Type => $att->{MIMETYPE}, 578 Encoding => $att->{ENCODING}, 579 Data => [], 580 Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}), 581 Disposition => $att->{DISPOSITION} 582 ); 583 584 my $handle; 585 if ($handle = $ent->open("w")) { 586 $handle->print($att->{DATA}); 587 $handle->close; 588 } else { 589 warn "Could not write data!"; 590 } 591 } 592 593 sub _SetAddressPart { 594 my ($self, $adrname, $partname, $data) = @_; 595 596 my $address = $self->{ADDRESSES}->{$adrname}; 597 $data =~ s/\000//g; 598 #warn "Processing address data part $partname : $data\n"; 599 if (defined ($address->{$partname})) { 600 if ($address->{$partname} eq $data) { 601 warn "Skipping duplicate but identical address information for" 602 . 
" $partname\n" if $self->{VERBOSE}; 603 } else { 604 warn "Address information $partname inconsistent:\n"; 605 warn " Original data: $address->{$partname}\n"; 606 warn " New data: $data\n"; 607 } 608 } else { 609 $address->{$partname} = $data; 610 } 611 } 612 613 # Set header fields 614 sub _AddHeaderField { 615 my ($self, $mime, $fieldname, $value) = @_; 616 617 my $oldvalue = $mime->head->get($fieldname); 618 return if $oldvalue; 619 $mime->head->add($fieldname, $value) if $value; 620 } 621 622 sub _Address { 623 my ($self, $tag) = @_; 624 my $name = $self->{$tag} || ""; 625 my $address = $self->{$tag . "_ADDR"} || ""; 626 return "$name <$address>"; 627 } 628 629 # Find SMTP addresses for the given list of names 630 sub _ExpandAddressList { 631 my ($self, $names) = @_; 632 633 my $addresspool = $self->{ADDRESSES}; 634 my @namelist = split /; */, $names; 635 my @result; 636 name: foreach my $name (@namelist) { 637 foreach my $address (@$addresspool) { 638 if ($name eq $address->{NAME}) { 639 my $addresstext = $address->{NAME} . " <"; 640 if (defined ($address->{SMTPADDRESS})) { 641 $addresstext .= $address->{SMTPADDRESS}; 642 } elsif ($address->{TYPE} eq "SMTP") { 643 $addresstext .= $address->{ADDRESS}; 644 } 645 $addresstext .= ">"; 646 push @result, $addresstext; 647 next name; 648 } 649 } 650 push @result, $name; 651 } 652 return join ", ", @result; 653 } 654 655 sub _ParseHead { 656 my ($self, $data) = @_; 657 defined $data or return undef; 658 # Parse full header date if we got that. 
659 my $parser = new MIME::Parser(); 660 $parser->output_to_core(1); 661 $parser->decode_headers(1); 662 $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m; 663 my $entity = $parser->parse_data($data) 664 or warn "Couldn't parse full headers!"; 665 my $head = $entity->head; 666 $head->unfold; 667 return $head; 668 } 669 670 # Find out if we need to construct a multipart message 671 sub _IsMultiPart { 672 my $self = shift; 673 674 return ( 675 ($self->{BODY_HTML} and $self->{BODY_PLAIN}) 676 or @{$self->{ATTACHMENTS}}>0 677 ); 678 } 679 680 # Copy original header data. 681 # Note: This should contain the Date: header. 682 sub _CopyHeaderData { 683 my ($self, $mime) = @_; 684 685 my $head = $self->_ParseHead($self->{HEAD}) or return; 686 687 foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) { 688 foreach my $value ($head->get_all($tag)) { 689 $mime->head->add($tag, $value); 690 } 691 } 692 } 693 694 # Set header fields 695 sub _SetHeaderFields { 696 my ($self, $mime) = @_; 697 698 # If we didn't get the date from the original header data, we may be able 699 # to get it from the SUBMISSION_ID: 700 $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate()); 701 702 # Third and last chance to set the Date: header; this uses the date the 703 # MSG file was saved. 704 $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE}); 705 $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT}); 706 $self->_AddHeaderField($mime, 'From', $self->_Address("FROM")); 707 #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO")); 708 $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO})); 709 $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC})); 710 $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID}); 711 $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO}); 712 } 713 714 package main; 715 use Getopt::Long; 716 use Pod::Usage; 717 718 # Setup command line processing. 
719 my $verbose = ''; 720 my $help = ''; # Print help message and exit. 721 GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2); 722 pod2usage(1) if $help; 723 724 # Get file name 725 my $file = $ARGV[0]; 726 defined $file or pod2usage(2); 727 warn "Will parse file: $file\n" if $verbose; 728 729 # Load and parse MSG file (is OLE) 730 my $Msg = OLE::Storage_Lite->new($file); 731 my $PPS = $Msg->getPpsTree(1); 732 $PPS or die "$file must be an OLE file"; 733 734 # parse PPS tree 735 my $parser = new MSGParser(); 736 $parser->set_verbosity(1) if $verbose; 737 $parser->parse($PPS); 738 $parser->print(); 739 740 # 741 # Usage info follows. 742 # 743 __END__ 744 745 =head1 NAME 746 747 msgconvert.pl - Convert Outlook .msg files to mbox format 748 749 =head1 SYNOPSIS 750 751 msgconvert.pl [options] <file.msg> 752 753 Options: 754 --verbose be verbose 755 --help help message 756 757 =head1 OPTIONS 758 759 =over 8 760 761 =item B<--verbose> 762 763 Print information about skipped parts of the .msg file. 764 765 =item B<--help> 766 767 Print a brief help message. 768 769 =head1 DESCRIPTION 770 771 This program will output the message contained in file.msg in mbox format 772 on stdout. It will complain about unrecognized OLE parts on 773 stderr. 774 775 =head1 BUGS 776 777 Not all data that's in the .MSG file is converted. There simply are some 778 parts whose meaning escapes me. One of these must contain the date the 779 message was sent, for example. Formatting of text messages will also be 780 lost. YMMV. 781 782 =cut -
xapian-omega-1.0.7a/omega.cc
diff -u xapian-omega-1.0.7a/omega.cc.orig
old new 264 264 } 265 265 } 266 266 267 // filter by URL substring 268 val = cgi_params.find("U"); 269 if (val != cgi_params.end()) { 270 string url = val->second; 271 if (!url.empty()) { 272 filters += ("U" + url + "*"); 273 filters += filter_sep; 274 } 275 } 276 267 277 // date range filters 268 278 val = cgi_params.find("START"); 269 279 if (val != cgi_params.end()) date_start = val->second; -
xapian-omega-1.0.7a/omega.conf.in
diff -u xapian-omega-1.0.7a/omega.conf.in.orig
old new 1 # Directory containing Xapian databases: 2 database_dir @localstatedir@/omega/data 3 4 # Directory containing OmegaScript templates: 5 template_dir @localstatedir@/omega/templates 6 7 # Directory to write Omega logs to: 8 log_dir /var/log/omega 9 10 # Directory containing any cdb files for the $lookup OmegaScript command: 11 cdb_dir @localstatedir@/omega/cdb 12 13 # Directory in which omindex caches the contents of extracted archives (zip, rar, msg, pst): 14 cache_dir @localstatedir@/omega/cache -
xapian-omega-1.0.7a/omindex.cc
diff -u xapian-omega-1.0.7a/omindex.cc.orig
old new 4 4 * Copyright 2001,2005 James Aylett 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts 7 * Copyright 2006,2007,2008 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 42 43 #include <xapian.h> 43 44 44 45 #include "commonhelp.h" 46 #include "configfile.h" 45 47 #include "diritor.h" 46 48 #include "hashterm.h" 47 49 #include "loadfile.h" … … 62 64 extern char * mkdtemp(char *); 63 65 #endif 64 66 67 #ifndef LIBEXECDIR 68 // must have ending slash 69 //# define LIBEXECDIR "/usr/lib/omega/bin/" 70 # define LIBEXECDIR "" 71 #endif 72 #ifndef PKGDATADIR 73 // must have ending slash 74 # define PKGDATADIR "/usr/share/omega/" 75 #endif 76 65 77 using namespace std; 66 78 67 79 #define TITLE_SIZE 128 … … 69 81 70 82 #define PROG_NAME "omindex" 71 83 #define PROG_DESC "Index static website data via the filesystem" 84 85 /* used in runfilter.cc */ 86 bool verbose = false; 87 string error_log; 72 88 73 89 static bool skip_duplicates = false; 74 90 static bool follow_symlinks = false; 91 static bool nocleanup = false; 92 static bool silent = false; 75 93 static string dbpath; 76 94 static string root; 77 95 static string indexroot; … … 136 154 155 static void 156 index_cached_directory(size_t depth_limit, 157 const string &file, 158 const string &url, 159 const string &ext, 160 const string &cmd, 161 map<string, string>& mime_map); 162 static 163 int mkdir_p(const string &path, mode_t mode); 164 137 165 inline static bool 138 166 p_notalnum(unsigned int c) 139 167 { … … 184 212 185 cout << "Indexing \"" << url << "\" as " << mimetype << " ... " << flush; 213 if (!silent) 214 cout << "Indexing \"" << url.substr(1) << "\" as " << mimetype << " ... 
" << flush; 186 215 187 216 string urlterm("U"); 188 217 urlterm += baseurl; … … 217 246 // indexing is disallowed 218 247 } 219 248 if (!p.indexing_allowed) { 220 cout << "indexing disallowed by meta tag - skipping" << endl; 249 if (!silent) 250 cout << "indexing disallowed by meta tag - skipping" << endl; 221 251 return; 222 252 } 223 253 dump = p.dump; … … 245 275 return; 246 276 } 247 277 md5_string(dump, md5); 278 #if 0 // FIXME: this won't work as omindex will have the database locked... 279 } else if (mimetype == "message/rfc822") { // // => mbox2script 280 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 281 string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| " 282 "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script"; 283 try { 284 dump = stdout_to_string(cmd); 285 } catch (ReadError) { 286 cout << "\"" << cmd << "\" failed - skipping" << endl; 287 return; 288 } 289 #endif 248 290 } else if (mimetype == "application/pdf") { 249 291 string safefile = shell_protect(file); … … 383 425 } else if (mimetype == "text/rtf") { 384 426 // The --text option unhelpfully converts all non-ASCII characters to 385 427 // "?" so we use --html instead, which produces HTML entities. 386 string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) ;428 string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file) + error_log; 387 429 MyHtmlParser p; 388 430 try { 389 431 p.parse_html(stdout_to_string(cmd)); … … (this hunk was shorter than expected) 536 579 Xapian::docid did = db.replace_document(urlterm, newdocument); 537 580 if (did < updated.size()) { 538 581 updated[did] = true; 582 if (!silent) 539 583 cout << "updated." << endl; 540 584 } else { 585 if (!silent) 541 586 cout << "added." << endl; 542 587 } 543 588 } catch (...) { 544 589 // FIXME: is this ever actually needed? 545 590 db.add_document(newdocument); 591 if (!silent) 546 592 cout << "added (failed re-seek for duplicate)." 
<< endl; 547 593 } 548 594 } else { 549 595 // If this were a duplicate, we'd have skipped it above. 550 596 db.add_document(newdocument); 597 if (!silent) 551 598 cout << "added." << endl; 552 599 } 553 600 } 554 601 602 /* Note: switched to cache_dir as root for virtual directories, 603 because /srcdir/.zip might not be creatable. */ 555 604 static void 556 605 index_directory(size_t depth_limit, const string &dir, 557 606 map<string, string>& mime_map) 558 607 { 559 608 string path = root + indexroot + dir; 560 609 561 cout << "[Entering directory " << dir << "]" << endl; 610 if (!silent) 611 cout << "[Entering directory " << dir.substr(1) << "]" << endl; 562 612 563 613 DirectoryIterator d(follow_symlinks); 564 614 try { 565 d.start(path); 566 while (d.next()) try { 567 string url = dir; 568 if (!url.empty() && url[url.size() - 1] != '/') url += '/'; 569 url += d.leafname(); 570 string file = root + indexroot + url; 571 switch (d.get_type()) { 572 case DirectoryIterator::DIRECTORY: 573 if (depth_limit == 1) continue; 574 try { 575 size_t new_limit = depth_limit; 576 if (new_limit) --new_limit; 577 index_directory(new_limit, url, mime_map); 578 } catch (...) { 579 cout << "Caught unknown exception in index_directory, rethrowing" << endl; 580 throw; 615 d.start(root + indexroot + dir); 616 } catch (const std::string & error) { 617 cout << error << " - skipping" << endl; 618 return; 619 } 620 while (d.next()) try { 621 struct stat statbuf; 622 string url = dir; 623 if (!url.empty() && url[url.size() - 1] != '/') url += '/'; 624 url += d.leafname(); 625 string file = root + indexroot + url; 626 switch (d.get_type()) { 627 case DirectoryIterator::DIRECTORY: 628 if (depth_limit == 1) continue; 629 try { 630 size_t new_limit = depth_limit; 631 if (new_limit) --new_limit; 632 index_directory(new_limit, url, mime_map); 633 } catch (...) 
{ 634 cout << "Caught unknown exception in index_directory, rethrowing" << endl; 635 throw; 636 } 637 continue; 638 case DirectoryIterator::REGULAR_FILE: { 639 string ext; 640 string::size_type dot = url.find_last_of('.'); 641 if (dot != string::npos) ext = url.substr(dot + 1); 642 if (!ext.empty()) { 643 ext = string(ext); // lowercase ext 644 for (unsigned int i=0; i<ext.length(); i++) { 645 ext[i] = tolower(ext[i]); 581 646 } 582 case DirectoryIterator::REGULAR_FILE: { 583 string ext; 584 string::size_type dot = url.find_last_of('.'); 585 if (dot != string::npos) ext = url.substr(dot + 1); 586 587 map<string,string>::iterator mt = mime_map.find(ext); 588 if (mt == mime_map.end()) { 589 // If the extension isn't found, see if the lower-cased 590 // version (if different) is found. 591 bool changed = false; 592 string::iterator i; 593 for (i = ext.begin(); i != ext.end(); ++i) { 594 if (*i >= 'A' && *i <= 'Z') { 595 *i = tolower(*i); 596 changed = true; 597 } 647 } 648 649 if (strcmp(d.leafname(), "mbox") == 0) { 650 // Special filename. 651 off_t size = d.get_size(); 652 time_t mtime = d.get_mtime(); 653 index_file(indexroot + url, "message/rfc822", mtime, size); 654 continue; 655 } 656 657 map<string,string>::iterator mt = mime_map.find(ext); 658 if (mt == mime_map.end()) { 659 // If the extension isn't found, see if the lower-cased 660 // version (if different) is found. 661 bool changed = false; 662 string::iterator i; 663 for (i = ext.begin(); i != ext.end(); ++i) { 664 if (*i >= 'A' && *i <= 'Z') { 665 *i = tolower(*i); 666 changed = true; 667 } 668 } 669 if (changed) mt = mime_map.find(ext); 670 } 671 if (mt != mime_map.end()) { 672 string oldroot = root; 673 // Only check the file size if we recognise the 674 // extension to avoid a call to stat()/lstat() for 675 // files we can't handle when readdir() tells us the 676 // file type. 
677 off_t size = d.get_size(); 678 if (size == 0) { 679 cout << "Skipping empty file: \"" << file << "\"" 680 << endl; 681 continue; 682 } 683 684 #ifndef _MSC_VER 685 // NOTE: unpacking does not work on MSWin32 this way! 686 // we'd really have to pull in utils.cc:rmdir from xapian-core 687 if (ext == "zip") { 688 if (depth_limit == 1) { 689 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 690 continue; 691 } 692 // overwrite 693 string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+indexroot+url+"/"); 694 try { 695 size_t new_limit = depth_limit; 696 if (new_limit) --new_limit; 697 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 698 } catch (ReadError) { 699 cout << "failed " << cmd << " << in index_cached_directory" << endl; 700 root = oldroot; 701 } catch (...) { 702 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 703 root = oldroot; 704 throw; 598 705 } 599 if (changed) mt = mime_map.find(ext);706 continue; 600 707 } 601 if (mt != mime_map.end()) { 602 // Only check the file size if we recognise the 603 // extension to avoid a call to stat()/lstat() for 604 // files we can't handle when readdir() tells us the 605 // file type. 606 off_t size = d.get_size(); 607 if (size == 0) { 608 cout << "Skipping empty file: \"" << file << "\"" 609 << endl; 708 else if (ext == "rar") { 709 if (depth_limit == 1) { 710 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 610 711 continue; 611 712 } 612 613 // It's in our MIME map so we know how to index it. 
614 const string & mimetype = mt->second; 713 // overwrite 714 string cmd = "unrar x -o+ " +shell_protect(file) + " " 715 + shell_protect(cache_dir+"/.rar"+indexroot+url+"/"); 716 try { 717 size_t new_limit = depth_limit; 718 if (new_limit) --new_limit; 719 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 720 } catch (ReadError) { 721 cout << "failed " << cmd << " << in index_cached_directory" << endl; 722 root = oldroot; 723 } catch (...) { 724 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 725 root = oldroot; 726 throw; 727 } 728 continue; 729 } 730 #ifdef HAVE_MSGCONVERT 731 else if (ext == "msg") { 732 struct stat statcache; 733 char olddir[256]; 734 735 if (depth_limit == 1) { 736 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 737 continue; 738 } 739 string cmd = LIBEXECDIR"outlook2text "+shell_protect(file); 740 // unpack multiparts and attachments. so we have to chdir first 741 string fulldir = cache_dir+"/.msg"+indexroot+url; 742 getcwd(olddir,256); 743 #ifdef HAVE_LSTAT 744 lstat(fulldir.c_str(), &statcache); 745 #else 746 stat(fulldir.c_str(), &statcache); 747 #endif 748 if (!S_ISDIR(statcache.st_mode)) { 749 mkdir_p(fulldir, 0755); 750 } 615 751 try { 616 time_t mtime = d.get_mtime(); 617 index_file(indexroot + url, mimetype, mtime, size); 618 } catch (NoSuchFilter) { 619 // FIXME: we ought to ignore by mime-type not 620 // extension. 
621 cout << "Filter for \"" << mimetype 622 << "\" not installed - ignoring extension \"" 623 << ext << "\"" << endl; 624 mime_map.erase(mt); 625 } 626 } else { 627 cout << "Unknown extension: \"" << file 628 << "\" - skipping" << endl; 752 chdir (fulldir.c_str()); 753 size_t new_limit = depth_limit; 754 if (new_limit) --new_limit; 755 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 756 chdir (olddir); 757 } catch (ReadError) { 758 cout << "failed " << cmd << " << in index_cached_directory" << endl; 759 chdir (olddir); 760 root = oldroot; 761 } catch (...) { 762 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 763 chdir (olddir); 764 root = oldroot; 765 throw; 766 } 767 continue; 629 768 } 630 continue; 631 } 632 default: 633 cout << "Not a regular file \"" << file 769 #endif 770 #ifdef HAVE_READPST 771 else if (ext == "pst") { 772 if (depth_limit == 1) { 773 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 774 continue; 775 } 776 // unpack attachments also, together with mbox files 777 string cmd = "readpst -r -cv -w -o " 778 + shell_protect(cache_dir+"/.pst"+indexroot+url+"/")+" "+shell_protect(file); 779 try { 780 size_t new_limit = depth_limit; 781 if (new_limit) --new_limit; 782 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 783 } catch (ReadError) { 784 root = oldroot; 785 cout << "failed " << cmd << " << in index_cached_directory" << endl; 786 } catch (...) { 787 root = oldroot; 788 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 789 throw; 790 } 791 continue; 792 } 793 #endif 794 #endif 795 // It's in our MIME map so we know how to index it. 796 const string & mimetype = mt->second; 797 try { 798 time_t mtime = d.get_mtime(); 799 index_file(indexroot + url, mimetype, mtime, size); 800 } catch (NoSuchFilter) { 801 // FIXME: we ought to ignore by mime-type not 802 // extension. 
803 cout << "Filter for \"" << mimetype 804 << "\" not installed - ignoring extension \"" 805 << ext << "\"" << endl; 806 mime_map.erase(mt); 807 } 808 } else { 809 cout << "Unknown extension: \"" << file 634 810 << "\" - skipping" << endl; 811 } 812 continue; 635 813 } 636 } catch (const std::string & error) {637 cout << error << " - skipping" << endl;638 continue;814 default: 815 cout << "Not a regular file \"" << file 816 << "\" - skipping" << endl; 639 817 } 640 818 } catch (const std::string & error) { 641 cout << error << " - skipping directory" << endl; 642 return; 819 cout << error << " - skipping" << endl; 820 continue; 821 } 822 } 823 824 static 825 int mkdir_p(const string &path, mode_t mode) { 826 #ifdef __WIN32__ 827 stdout_to_string("mkdir \""+shell_protect(path)+"\""); 828 #else 829 stdout_to_string("mkdir -p "+shell_protect(path)); 830 #endif 831 return 0; 832 } 833 834 /* 835 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there 836 */ 837 static void 838 index_cached_directory(size_t depth_limit, 839 const string &file, 840 const string &url, 841 const string &ext, 842 const string &cmd, 843 map<string, string>& mime_map) 844 { 845 string oldroot = root; 846 root = cache_dir; 847 string cache = root+"/."+ext+indexroot; 848 string cachedir = cache+url; 849 struct stat statfile, statcache; 850 bool extract_cache; 851 #ifdef HAVE_LSTAT 852 lstat(file.c_str(), &statfile); 853 lstat(cachedir.c_str(), &statcache); 854 #else 855 stat(file.c_str(), &statfile); 856 stat(cachedir.c_str(), &statcache); 857 #endif 858 extract_cache = true; 859 // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago, 860 // then it was already extracted. 861 if (S_ISDIR(statcache.st_mode) 862 && S_ISREG(statfile.st_mode) 863 && (statfile.st_mtime < statcache.st_mtime) 864 && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call 865 { 866 // but is it in the database also? 
prevent from deleting skipped files 867 if (!silent) 868 cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction " 869 // << statfile.st_mtime << " < " << statcache.st_mtime 870 << endl; 871 extract_cache = false; 872 } 873 if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) 874 { 875 // check database timestamp for cached container, esp. for cleaned up caches. 876 // if already in db we need not to extract again 877 string urlterm("U"); 878 urlterm += baseurl; 879 urlterm += "/."+ext+indexroot+url; 880 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 881 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 882 883 { 884 // at first find the docid with the beginning urlterm and check its timestamp 885 Xapian::docid docid = 0; 886 Xapian::PostingIterator p = db.postlist_begin(urlterm); 887 if (p != db.postlist_end(urlterm)) { 888 docid = *p; 889 } 890 if (docid && !ignore_time) { 891 // new: first search value (1) 892 Xapian::Document doc = db.get_document(docid); 893 string lastmod; 894 if (doc.values_count()) 895 lastmod = doc.get_value(VALUE_LASTMOD); 896 if (!lastmod.empty()) { 897 if (string_to_int(lastmod) >= statfile.st_mtime) { 898 if (!silent) 899 cout << "Cache "<< "."+ext+indexroot+url << " not newer. Ignored." 
<< endl; 900 if (docid < updated.size()) { 901 updated[docid] = true; 902 root = oldroot; 903 return; 904 } 905 } 906 } 907 } 908 } 909 } 910 911 if (extract_cache) { 912 if (!silent) 913 cout << "[EXTRACT into cache " << cachedir << "]" << endl; 914 if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode)) 915 cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" " 916 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL) 917 << endl; 918 if (!S_ISDIR(statcache.st_mode)) 919 mkdir_p(cachedir, 0755); 920 stdout_to_string(cmd); 921 #ifndef __WIN32__ 922 stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir)); 923 #endif 924 #ifdef HAVE_LSTAT 925 lstat(cachedir.c_str(), &statcache); 926 #else 927 stat(cachedir.c_str(), &statcache); 928 #endif 929 } 930 931 if (S_ISDIR(statcache.st_mode)) { 932 if (depth_limit == 1) { 933 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 934 } else { 935 // max loop 5, magic start: /.ext+file 936 index_directory(depth_limit+5, "/."+ext+url, mime_map); 937 if (!nocleanup) { 938 if (!silent) 939 cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl; 940 rm_rf(cachedir); 941 } 942 } 943 } 944 else { // no -p would be fatal here 945 cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl; 643 946 } 947 root = oldroot; 644 948 } 645 949 646 950 int … … 653 958 static const struct option longopts[] = { 654 959 { "help", no_argument, NULL, 'h' }, 655 960 { "version", no_argument, NULL, 'v' }, 961 { "verbose", no_argument, NULL, 'V' }, 962 { "silent", no_argument, NULL, 'S' }, 656 963 { "overwrite", no_argument, NULL, 'o' }, 657 964 { "duplicates", required_argument, NULL, 'd' }, 658 965 { "preserve-nonduplicates", no_argument, NULL, 'p' }, … … 667 974 { "depth-limit",required_argument, NULL, 'l' }, 668 975 { "follow", no_argument, NULL, 'f' }, 669 976 { "stemmer", required_argument, NULL, 's' }, 977 { 
"nocleanup", no_argument, NULL, 'c' }, 978 { "cachedir", required_argument, NULL, 'C' }, 670 979 { 0, 0, NULL, 0 } 671 980 }; 672 981 … … 717 1026 mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template 718 1027 mime_map["ppt"] = "application/vnd.ms-powerpoint"; 719 1028 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 1029 #ifdef HAVE_READPST 1030 // Outlook messager folder 1031 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst) 1032 #endif 1033 #ifdef HAVE_MSGCONVERT 1034 mime_map["msg"] = "application/vnd.ms-outlook"; // outlook2text via msgconvert.pl 1035 #endif 1036 mime_map["mbox"] = "message/rfc822"; // => mbox2omega 720 1037 // Perl: 721 1038 mime_map["pl"] = "text/x-perl"; 722 1039 mime_map["pm"] = "text/x-perl"; … … 727 1044 // DjVu: 728 1045 mime_map["djv"] = "image/vnd.djvu"; 729 1046 mime_map["djvu"] = "image/vnd.djvu"; 1047 #ifndef _MSC_VER 1048 mime_map["zip"] = "application/x-zip"; // recursive scanning 1049 # ifdef HAVE_UNRAR 1050 mime_map["rar"] = "application/x-rar"; // recursive scanning 1051 # endif 1052 #endif 1053 1054 read_config_file(); 730 1055 731 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M: lpf", longopts, NULL)) != -1) {1056 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:C:lpfc", longopts, NULL))!=EOF) { 732 1057 switch (getopt_ret) { 733 1058 case 'h': { 734 1059 cout << PROG_NAME" - "PROG_DESC"\n\n" … … 753 1078 " duplicate replace mode\n" 754 1079 " -D, --db path to database to use\n" 755 1080 " -U, --url base url DIRECTORY represents (default: /)\n" 1081 " -C, --cachedir path to local cache to use (default from omega.conf)\n" 756 1082 " -M, --mime-type additional MIME mapping ext:type\n" 757 1083 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n" 758 1084 " -f, --follow follow symbolic links\n" 1085 " -c, --nocleanup keep cache, don't delete temporary .zip,.rar,.pst,.msg cache folders\n" 759 1086 " --overwrite create the 
database anew (the default is to update\n" 760 " if the database already exists)" << endl; 1087 " if the database already exists)" 1088 " --verbose Print commands also\n" 1089 " --silent Print only errors\n"; 761 1090 print_stemmer_help(" "); 762 1091 print_help_and_version_help(" "); 763 1092 return 0; … … 785 1114 case 'p': // don't delete unupdated documents 786 1115 preserve_unupdated = true; 787 1116 break; 1117 case 'V': 1118 verbose = true; 1119 break; 1120 case 'c': 1121 nocleanup = true; 1122 break; 788 1123 case 'l': { // Set recursion limit 789 1124 int arg = atoi(optarg); 790 1125 if (arg < 0) arg = 0; … … 817 1152 case 'U': 818 1153 baseurl = optarg; 819 1154 break; 1155 case 'C': 1156 cache_dir = optarg; 1157 break; 820 1158 case 'o': // --overwrite 821 1159 overwrite = true; 822 1160 break; … … 844 1182 if (baseurl.empty()) { 845 1183 cerr << PROG_NAME": --url not specified, assuming `/'.\n"; 846 1184 } 1185 error_log = " 2>>"+log_dir+"omindex-error.log"; 847 1186 // baseurl mustn't end '/' or you end up with the wrong URL 848 1187 // (//thing is different to /thing). We could probably make this 849 1188 // safe a different way, by ensuring that we don't put a leading '/' … … 869 1208 } else { 870 1209 indexroot = ""; // index the whole of root 871 1210 } 1211 // add the db basename to cache_dir 1212 { 1213 const char *p = strrchr(dbpath.c_str(), '/'); 1214 // on windows only 1215 if (!p) p = strrchr(dbpath.c_str(), '\\'); 1216 if (p) { p++; } else { p = dbpath.c_str(); } 1217 cache_dir += p; 1218 } 872 1219 873 1220 int exitcode = 1; 874 1221 try { -
xapian-omega-1.0.7a/outlook2text.in
diff -u xapian-omega-1.0.7a/outlook2text.in.orig
#! /bin/sh
# Converts an Outlook .msg file to mbox and extracts its attachments.
#
# Usage: outlook2text FILE.msg [OUTPUT_DIR]
#
#   FILE.msg    the message to convert
#   OUTPUT_DIR  optional directory handed to mimeexplode via -d; when it is
#               omitted the caller is expected to already be in the cache
#               dir and the basename of FILE.msg (without .msg) is used.
#
# "$2" must be quoted: an unquoted empty $2 turns the test into `[ -n ]`,
# which is always true, so the fallback branch would never run.
if [ -n "$2" ]; then
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
else
    # already is in the cache dir
    base=`basename "$1" .msg`
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
fi
xapian-omega-1.0.7a/query.cc
diff -u xapian-omega-1.0.7a/query.cc.orig
old new 141 141 switch (t[0]) { 142 142 case 'a': 143 143 return (t == "a" || t == "about" || t == "an" || t == "and" || 144 t == "are" || t == "as" || t == "at"); 144 t == "are" || t == "as" || t == "at" || t == "according" || 145 t == "again" || t == "against" || t == "ah" || t == "all" || 146 t == "although" || t == "always" || t == "anyone" || t == "after" || 147 t == "also" || t == "any"); 145 148 case 'b': 146 149 return (t == "be" || t == "by"); 147 150 case 'e': -
xapian-omega-1.0.7a/runfilter.cc
diff -u xapian-omega-1.0.7a/runfilter.cc.orig
old new 60 60 61 61 using namespace std; 62 62 63 extern string error_log; 64 extern bool verbose; 65 63 66 string 64 67 stdout_to_string(const string &cmd) 65 68 { … … 97 100 setrlimit(RLIMIT_AS, &ram_limit); 98 101 } 99 102 100 execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL); 103 string tmp; 104 tmp = cmd + error_log; 105 if (verbose) { 106 cout << " Executing '" << tmp << "'..." << endl; 107 } 108 109 execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL); 101 110 _exit(-1); 102 111 } 103 112 … … 134 143 throw ReadError(); 135 144 } 136 145 #else 137 FILE * fh = popen(cmd.c_str(), "r"); 146 string tmp; 147 tmp = cmd + error_log; 148 if (verbose) { 149 cout << " Executing '" << tmp << "'..." << endl; 150 } 151 FILE * fh = popen(tmp.c_str(), "r"); 138 152 if (fh == NULL) throw ReadError(); 139 153 while (!feof(fh)) { 140 154 char buf[4096]; -
xapian-omega-1.0.7a/scriptindex.cc
diff -u xapian-omega-1.0.7a/scriptindex.cc.orig
old new 4 4 * Copyright 2001 Sam Liddicott 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 7 * Copyright 2006,2007 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 38 39 #include <stdio.h> 39 40 #include <time.h> 40 41 #include "safeunistd.h" 42 #include <sys/stat.h> 41 43 42 44 #include "commonhelp.h" 45 #include "configfile.h" 43 46 #include "hashterm.h" 44 47 #include "loadfile.h" 45 48 #include "myhtmlparse.h" 46 49 #include "stringutils.h" 47 50 #include "utf8truncate.h" 48 51 #include "utils.h" 52 #include "values.h" 49 53 50 54 #include "gnu_getopt.h" … … 422 426 { 423 427 string line; 424 428 size_t line_no = 0; 429 time_t last_mod = 0; 430 long file_size = 0; 431 432 if (strcmp(fname,"<stdin>") != 0) { 433 struct stat statbuf; 434 stat(fname, &statbuf); 435 if (! statbuf.st_size) { 436 cout << "Empty \"" << fname << "\" - skipping\n"; 437 return false; 438 } 439 file_size = statbuf.st_size; 440 last_mod = statbuf.st_mtime; 441 } 425 442 while (!stream.eof() && getline(stream, line)) { 426 443 ++line_no; 427 444 Xapian::Document doc; … … 638 655 for (i = fields.begin(); i != fields.end(); ++i) { 639 656 list<string>::const_iterator j; 640 657 for (j = i->second.begin(); j != i->second.end(); j++) { 658 if (i->first == "lastmod") last_mod = 0; 659 if (i->first == "size") file_size = 0; 641 660 data += i->first; 642 661 data += '='; 643 662 data += *j; 644 663 data += '\n'; 645 664 } 646 665 } 666 // provide some extra fields if not already provided by the script 667 if (last_mod) { // if indexed per filename 668 data += "lastmod="+int_to_string(last_mod)+'\n'; 669 doc.add_value(VALUE_LASTMOD, int_to_string(last_mod)); 670 } 671 if (file_size) { // if indexed per filename 672 data += "size="+int_to_string(file_size)+'\n'; 673 doc.add_value(VALUE_FILESIZE, int_to_string(file_size)); 674 } 647 675 648 
676 // Put the data in the document 649 677 doc.set_data(data); -
xapian-omega-1.0.7a/utils.cc
diff -u xapian-omega-1.0.7a/utils.cc.orig
old new 30 30 31 31 using namespace std; 32 32 33 #ifdef __WIN32__ 34 #include "safewindows.h" 35 #endif 36 33 37 // This ought to be enough for any of the conversions below. 34 38 #define BUFSIZE 100 35 39 40 /// Allow system to work directly on C++ strings. 41 inline int system(const string &command) { return system(command.c_str()); } 42 43 /// Remove a directory and contents. 44 void 45 rm_rf(const string &filename) 46 { 47 // Check filename exists and is actually a directory 48 struct stat sb; 49 if (stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode)) return; 50 51 string safefile = shell_protect(filename); 52 #ifdef __WIN32__ 53 # if 1 54 static int win95 = -1; 55 if (win95 == -1) { 56 OSVERSIONINFO info; 57 memset(&info, 0, sizeof(OSVERSIONINFO)); 58 info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); 59 if (GetVersionEx(&info)) { 60 win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS); 61 } 62 } 63 64 if (win95) { 65 // for 95 like systems: 66 system("deltree /y \"" + safefile + "\""); 67 } else { 68 // for NT like systems: 69 system("rd /s /q \"" + safefile + "\""); 70 } 71 # else 72 safefile.append("\0", 2); 73 SHFILEOPSTRUCT shfo; 74 memset((void*)&shfo, 0, sizeof(shfo)); 75 shfo.hwnd = 0; 76 shfo.wFunc = FO_DELETE; 77 shfo.pFrom = safefile.data(); 78 shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT; 79 (void)SHFileOperation(&shfo); 80 # endif 81 #else 82 system("rm -rf " + safefile); 83 #endif 84 } -
xapian-omega-1.0.7a/utils.h
diff -u xapian-omega-1.0.7a/utils.h.orig
old new 22 22 23 23 #include <string> 24 24 25 #include <stdlib.h> 26 #include <sys/stat.h> 27 #include <sys/types.h> 28 #ifdef _MSC_VER 29 # include <direct.h> 30 # include <io.h> 31 #else 32 # include <unistd.h> 33 #endif 34 #include <ctype.h> 35 #include <fcntl.h> 36 25 37 /** Converts year, month, day into an 8 character string like: "20061031". */ 26 38 std::string date_to_string(int year, int month, int day); 27 39 … … 37 49 /** Converts a string to an int. */ 38 50 int string_to_int(const std::string & s); 39 51 52 void rm_rf(const std::string &filename); 53 40 54 #endif -
xapian-omega-1.0.7a/xapian-omega.spec.in
diff -u xapian-omega-1.0.7a/xapian-omega.spec.in.orig
old new 45 45 # Create /var directories 46 46 mkdir -p %{buildroot}%{contentdir}/omega/data 47 47 mkdir -p %{buildroot}%{contentdir}/omega/cdb 48 mkdir -p %{buildroot}%{contentdir}/omega/cache 48 49 mkdir -p %{buildroot}%{logdir}/omega 49 50 # Default templates 50 51 mkdir -p %{buildroot}%{contentdir}/omega/templates … … 77 78 /var/www/icons/omega 78 79 %{_datadir}/%{name} 79 80 %config(noreplace) /etc/omega.conf 80 %doc %{_datadir}/doc/%{name}-%{version} 81 %docdir /usr/share/doc/%{name}-%{version} 82 %doc AUTHORS ChangeLog COPYING NEWS README TODO 81 83 # man pages may be gzipped, hence the trailing wildcard. 82 84 %{_mandir}/man1/omindex.1* 83 85 %{_mandir}/man1/scriptindex.1*