Ticket #282: xapian-omega-1.2.5-from-ticket-285-and-cleaned-up-updated-2011-05-13.patch
File xapian-omega-1.2.5-from-ticket-285-and-cleaned-up-updated-2011-05-13.patch, 49.1 KB (added by , 13 years ago) |
---|
-
utils.h
31 31 /** Converts a string to an int. */ 32 32 int string_to_int(const std::string & s); 33 33 34 void rm_rf(const std::string &filename); 35 34 36 #endif -
omindex.cc
5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts 7 7 * Copyright 2009 Frank J Bruzzaniti 8 * Copyright 2006,2007,2008 AVL List GesmbH 8 9 * 9 10 * This program is free software; you can redistribute it and/or 10 11 * modify it under the terms of the GNU General Public License as … … 68 69 extern char * mkdtemp(char *); 69 70 #endif 70 71 72 #ifndef LIBEXECDIR 73 // must have ending slash 74 //# define LIBEXECDIR "/usr/lib/omega/bin/" 75 # define LIBEXECDIR "" 76 #endif 77 #ifndef PKGDATADIR 78 // must have ending slash 79 # define PKGDATADIR "/usr/share/omega/" 80 #endif 81 71 82 using namespace std; 72 83 73 84 #define TITLE_SIZE 128 … … 81 92 static bool ignore_exclusions = false; 82 93 static bool spelling = false; 83 94 static bool verbose = false; 95 string error_log; /* used in runfilter.cc */ 96 static string baseurl; 97 static string dbpath; 98 static string cache_dir; 84 99 static enum { 85 100 EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP 86 101 } empty_body = EMPTY_BODY_WARN; … … 103 118 // text are common, so we handle these with a std::map. 
104 119 static map<string, string> commands; 105 120 121 static void 122 index_directory(const string &path, const string &url_, size_t depth_limit, 123 map<string, string>& mime_map); 124 106 125 inline static bool 107 126 p_notalnum(unsigned int c) 108 127 { … … 328 347 skip(file, "unknown MIME type '" + mimetype + "'"); 329 348 } 330 349 350 static 351 void mkdir_p(const string &path, mode_t mode) { 352 (void)mode; // FIXME 353 #ifdef __WIN32__ 354 system(("mkdir \"" + shell_protect(path) + "\"").c_str()); 355 #else 356 if (system(("mkdir -p " + shell_protect(path)).c_str()) < 0) { /* FIXME */ } 357 #endif 358 } 359 360 /* 361 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there 362 */ 363 static void 364 index_cached_directory(size_t depth_limit, 365 const string &file, 366 const string &url, 367 const string &ext, 368 const string &cmd, 369 map<string, string>& mime_map) 370 { 371 string oldroot = root; 372 root = cache_dir; 373 string cache = root+"/."+ext; 374 string cachedir = cache+url; 375 struct stat statfile, statcache; 376 bool extract_cache; 377 #ifdef HAVE_LSTAT 378 lstat(file.c_str(), &statfile); 379 lstat(cachedir.c_str(), &statcache); 380 #else 381 stat(file.c_str(), &statfile); 382 stat(cachedir.c_str(), &statcache); 383 #endif 384 extract_cache = true; 385 // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago, 386 // then it was already extracted. 387 if (S_ISDIR(statcache.st_mode) 388 && S_ISREG(statfile.st_mode) 389 && (statfile.st_mtime < statcache.st_mtime) 390 && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call 391 { 392 // but is it in the database also? 
prevent from deleting skipped files 393 if (verbose) 394 cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction " 395 // << statfile.st_mtime << " < " << statcache.st_mtime 396 << endl; 397 extract_cache = false; 398 } 399 if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) { 400 // If last_mod > last_mod_max, we know for sure that the file is new 401 // or updated. 402 if (statfile.st_mtime <= last_mod_max) { 403 // check database timestamp for cached container, esp. for cleaned up caches. 404 // if already in db we need not to extract again 405 string urlterm("U"); 406 urlterm += baseurl; 407 urlterm += "/."+ext+url; 408 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 409 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 410 411 Xapian::PostingIterator p = db.postlist_begin(urlterm); 412 if (p != db.postlist_end(urlterm)) { 413 Xapian::docid docid = *p; 414 Xapian::Document doc = db.get_document(docid); 415 string value = doc.get_value(VALUE_LASTMOD); 416 time_t old_last_mod = binary_string_to_int(value); 417 if (statfile.st_mtime <= old_last_mod) { 418 if (verbose) 419 cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl; 420 // The docid should be in updated - the only valid 421 // exception is if the URL was long and hashed to the 422 // same URL as an existing document indexed in the same 423 // batch. 
424 if (usual(docid < updated.size() && !updated[docid])) { 425 updated[docid] = true; 426 --old_docs_not_seen; 427 } 428 root = oldroot; 429 return; 430 } 431 } 432 } 433 } 434 435 if (extract_cache) { 436 if (verbose) 437 cout << "[EXTRACT into cache " << cachedir << "]" << endl; 438 if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode)) 439 cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" " 440 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL) 441 << endl; 442 if (!S_ISDIR(statcache.st_mode)) 443 mkdir_p(cachedir, 0755); 444 stdout_to_string(cmd); 445 #ifndef __WIN32__ 446 stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir)); 447 #endif 448 #ifdef HAVE_LSTAT 449 lstat(cachedir.c_str(), &statcache); 450 #else 451 stat(cachedir.c_str(), &statcache); 452 #endif 453 } 454 455 if (S_ISDIR(statcache.st_mode)) { 456 if (depth_limit == 1) { 457 cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl; 458 } else { 459 // max loop 5, magic start: /.ext+file 460 index_directory(file + "/."+ext+url, url, depth_limit + 5, mime_map); 461 if (verbose) 462 cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl; 463 rm_rf(cachedir); 464 } 465 } 466 else { // no -p would be fatal here 467 cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl; 468 } 469 root = oldroot; 470 } 471 331 472 static void 332 473 index_file(const string &file, const string &url, DirectoryIterator & d, 333 map<string, string>& mime_map )474 map<string, string>& mime_map, size_t depth_limit) 334 475 { 335 476 string ext; 336 477 const char * dot_ptr = strrchr(d.leafname(), '.'); … … 358 499 359 500 string mimetype; 360 501 if (mt == mime_map.end()) { 502 if (strcasecmp(d.leafname(), "mbox") == 0) { 503 // Special filename. 
504 mimetype = "message/rfc822"; 505 goto got_mimetype; 506 } 507 361 508 mimetype = d.get_magic_mimetype(); 362 509 if (mimetype.empty()) { 363 510 skip(file, "Unknown extension and unrecognised format", … … 370 517 mimetype = mt->second; 371 518 } 372 519 520 got_mimetype: 521 373 522 if (verbose) 374 523 cout << "Indexing \"" << file.substr(root.size()) << "\" as " 375 524 << mimetype << " ... "; … … 429 578 } 430 579 } 431 580 } 581 // add the db basename to cache_dir 582 { 583 ensure_tmpdir(); // FIXME: be lazy! 584 cache_dir = tmpdir; 585 const char *p = strrchr(dbpath.c_str(), '/'); 586 // on windows only 587 if (!p) p = strrchr(dbpath.c_str(), '\\'); 588 if (p) { p++; } else { p = dbpath.c_str(); } 589 cache_dir += p; 590 } 432 591 433 592 if (verbose) cout << flush; 434 593 … … 493 652 } else { 494 653 // FIXME: What charset is the file? Look at contents? 495 654 } 655 #if 0 // FIXME: this won't work as omindex will have the database locked... 656 } else if (mimetype == "message/rfc822") { // // => mbox2script 657 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 658 string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| " 659 "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script"; 660 try { 661 dump = stdout_to_string(cmd); 662 } catch (ReadError) { 663 cout << "\"" << cmd << "\" failed - skipping" << endl; 664 return; 665 } 666 #endif 496 667 } else if (mimetype == "application/pdf") { 497 668 string safefile = shell_protect(file); 498 669 string cmd = "pdftotext -enc UTF-8 " + safefile + " -"; … … 721 892 722 893 generate_sample_from_csv(dump, sample); 723 894 } else if (mimetype == "application/vnd.ms-outlook") { 724 string cmd = get_pkglibbindir() + "/outlookmsg2html " + shell_protect(file); 725 MyHtmlParser p; 726 p.ignore_metarobots(); 895 string oldroot = root; 896 struct stat statcache; 897 char olddir[256]; 898 899 if (depth_limit == 1) { 900 skip(file, "Recursion limit reached for 
compound file", SKIP_SHOW_FILENAME); 901 return; 902 } 903 string cmd = LIBEXECDIR"outlook2text "+shell_protect(file); 904 // unpack multiparts and attachments. so we have to chdir first 905 string fulldir = cache_dir+"/.msg"+url; 906 if (getcwd(olddir, 256) == NULL) { /* FIXME */ } 907 #ifdef HAVE_LSTAT 908 lstat(fulldir.c_str(), &statcache); 909 #else 910 stat(fulldir.c_str(), &statcache); 911 #endif 912 if (!S_ISDIR(statcache.st_mode)) { 913 mkdir_p(fulldir, 0755); 914 } 727 915 try { 728 dump = stdout_to_string(cmd); 729 // FIXME: what should the default charset be? 730 p.parse_html(dump, "iso-8859-1", false); 731 } catch (const string & newcharset) { 732 p.reset(); 733 p.ignore_metarobots(); 734 p.parse_html(dump, newcharset, true); 916 if (chdir(fulldir.c_str()) < 0) { /* FIXME */ } 917 size_t new_limit = depth_limit; 918 if (new_limit) --new_limit; 919 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 920 if (chdir(olddir) < 0) { /* FIXME */ } 735 921 } catch (ReadError) { 736 skip_cmd_failed(file, cmd); 737 return; 922 cout << "failed " << cmd << " << in index_cached_directory" << endl; 923 if (chdir(olddir) < 0) { /* FIXME */ } 924 root = oldroot; 925 } catch (...) { 926 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 927 if (chdir(olddir) < 0) { /* FIXME */ } 928 root = oldroot; 929 throw; 738 930 } 739 dump = p.dump; 740 title = p.title; 741 keywords = p.keywords; 742 sample = p.sample; 743 author = p.author; 931 return; 744 932 } else if (mimetype == "image/svg+xml") { 745 933 SvgParser svgparser; 746 934 svgparser.parse_html(d.file_to_string()); … … 769 957 if (idx != string::npos) { 770 958 dump.assign(desc, idx + 1, string::npos); 771 959 } 960 } else if (mimetype == "application/x-zip") { 961 string oldroot = root; 962 if (depth_limit == 1) { 963 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 964 return; 965 } 966 // overwrite 967 string cmd = "unzip -u -P. 
-o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+url+"/"); 968 try { 969 size_t new_limit = depth_limit; 970 if (new_limit) --new_limit; 971 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 972 } catch (ReadError) { 973 cout << "failed " << cmd << " << in index_cached_directory" << endl; 974 root = oldroot; 975 } catch (...) { 976 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 977 root = oldroot; 978 throw; 979 } 980 return; 981 } else if (mimetype == "application/x-rar") { 982 string oldroot = root; 983 if (depth_limit == 1) { 984 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 985 return; 986 } 987 988 // overwrite 989 string cmd = "unrar x -o+ " +shell_protect(file) + " " 990 + shell_protect(cache_dir+"/.rar"+url+"/"); 991 try { 992 size_t new_limit = depth_limit; 993 if (new_limit) --new_limit; 994 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 995 } catch (ReadError) { 996 cout << "failed " << cmd << " << in index_cached_directory" << endl; 997 root = oldroot; 998 } catch (...) { 999 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1000 root = oldroot; 1001 throw; 1002 } 1003 return; 1004 } else if (mimetype == "application/vnd.ms-outlook-pst") { 1005 string oldroot = root; 1006 if (depth_limit == 1) { 1007 skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME); 1008 return; 1009 } 1010 // unpack attachments also, together with mbox files 1011 string cmd = "readpst -r -cv -w -o " 1012 + shell_protect(cache_dir+"/.pst"+url+"/")+" "+shell_protect(file); 1013 try { 1014 size_t new_limit = depth_limit; 1015 if (new_limit) --new_limit; 1016 index_cached_directory(new_limit, file, url, ext, cmd, mime_map); 1017 } catch (ReadError) { 1018 root = oldroot; 1019 cout << "failed " << cmd << " << in index_cached_directory" << endl; 1020 } catch (...) 
{ 1021 root = oldroot; 1022 cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl; 1023 throw; 1024 } 1025 return; 772 1026 } else { 773 1027 // Don't know how to index this type. 774 1028 skip_unknown_mimetype(file, mimetype); … … 985 1239 break; 986 1240 } 987 1241 case DirectoryIterator::REGULAR_FILE: 988 index_file(file, url, d, mime_map );1242 index_file(file, url, d, mime_map, depth_limit); 989 1243 break; 990 1244 default: 991 1245 skip(file, "Not a regular file", … … 1009 1263 bool overwrite = false; 1010 1264 // If delete_removed_documents is true, delete any documents we don't see. 1011 1265 bool delete_removed_documents = true; 1012 string baseurl;1013 1266 size_t depth_limit = 0; 1014 1267 1015 1268 static const struct option longopts[] = { … … 1133 1386 mime_map["ppt"] = "application/vnd.ms-powerpoint"; 1134 1387 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 1135 1388 mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email 1389 #ifdef HAVE_READPST 1390 // Outlook messager folder 1391 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst) 1392 #endif 1393 1394 // Misc compound formats: 1395 mime_map["mbox"] = "message/rfc822"; // => mbox2omega 1396 mime_map["mbx"] = "message/rfc822"; // => mbox2omega 1397 #ifndef _MSC_VER 1398 mime_map["zip"] = "application/x-zip"; // recursive scanning 1399 # ifdef HAVE_UNRAR 1400 mime_map["rar"] = "application/x-rar"; // recursive scanning 1401 # endif 1402 #endif 1136 1403 1137 1404 // Perl: 1138 1405 mime_map["pl"] = "text/x-perl"; … … 1190 1457 argv[1] = const_cast<char *>("--version"); 1191 1458 } 1192 1459 1193 string dbpath;1194 1460 int getopt_ret; 1195 1461 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:i", 1196 1462 longopts, NULL)) != -1) { … … 1340 1606 baseurl += '/'; 1341 1607 } 1342 1608 1609 string log_dir = "./"; // FIXME: need to set log_dir to something appropriate. 
1610 error_log = " 2>>"+log_dir+"omindex-error.log"; 1611 1343 1612 if (optind >= argc || optind + 2 < argc) { 1344 1613 cerr << PROG_NAME": you must specify a directory to index.\n" 1345 1614 "Do this either as a single directory (corresponding to the base URL)\n" -
outlook2text.in
#! /bin/sh
# outlook2text: convert an Outlook .msg file to a MIME message and explode its
# parts and attachments into a directory.
#
# Usage: outlook2text MSGFILE [OUTDIR]
#   MSGFILE  the .msg file to convert
#   OUTDIR   directory to extract into.  When omitted (or empty) the caller is
#            expected to already be inside the cache directory, and a
#            directory named after MSGFILE (without its ".msg" suffix) is used.
#
# @MSGCONVERT@ and @MIMEEXPLODE@ are placeholders for the converter commands;
# presumably they are substituted when this .in template is processed.

# "$2" must be quoted: unquoted, a missing argument turns the test into
# `[ -n ]`, which is always true, so the fallback branch below never ran and
# an empty directory name was handed to the exploder.
if [ -n "$2" ]; then
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
else
    # No output directory given: we are already in the cache dir.
    base=`basename "$1" .msg`
    @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
fi
runfilter.cc
54 54 55 55 using namespace std; 56 56 57 extern string error_log; 58 57 59 string 58 60 stdout_to_string(const string &cmd) 59 61 { 60 62 string out; 63 string tmp = cmd; 64 tmp += error_log; 61 65 #if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT 62 66 // We want to be able to get the exit status of the child process. 63 67 signal(SIGCHLD, SIG_DFL); … … 100 104 } 101 105 #endif 102 106 103 execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);107 execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL); 104 108 _exit(-1); 105 109 } 106 110 … … 139 143 throw ReadError(); 140 144 } 141 145 #else 142 FILE * fh = popen( cmd.c_str(), "r");146 FILE * fh = popen(tmp.c_str(), "r"); 143 147 if (fh == NULL) throw ReadError(); 144 148 while (!feof(fh)) { 145 149 char buf[4096]; -
utils.cc
25 25 26 26 #include <stdio.h> // for sprintf/snprintf 27 27 #include <cstdlib> 28 #include <cstring> 29 #include "safesysstat.h" 28 30 29 31 #include <string> 30 32 31 33 using namespace std; 32 34 35 #ifdef __WIN32__ 36 #include "safewindows.h" 37 #endif 38 33 39 // This ought to be enough for any of the conversions below. 34 40 #define BUFSIZE 100 35 41 … … 39 45 int len = SNPRINTF(buf, BUFSIZE, (FMT), val);\ 40 46 if (len == -1 || len > BUFSIZE) return string(buf, BUFSIZE);\ 41 47 return string(buf, len); 48 /// Allow system to work directly on C++ strings. 49 inline int system(const string &command) { return system(command.c_str()); } 50 51 // Duplicated from omindex.cc - FIXME 52 static string 53 shell_protect(const string & file) 54 { 55 string safefile = file; 56 #ifdef __WIN32__ 57 bool need_to_quote = false; 58 for (string::iterator i = safefile.begin(); i != safefile.end(); ++i) { 59 unsigned char ch = *i; 60 if (!isalnum(ch) && ch < 128) { 61 if (ch == '/') { 62 // Convert Unix path separators to backslashes. C library 63 // functions understand "/" in paths, but external commands 64 // generally don't, and also may interpret a leading '/' as 65 // introducing a command line option. 66 *i = '\\'; 67 } else if (ch == ' ') { 68 need_to_quote = true; 69 } else if (ch < 32 || strchr("<>\"|*?", ch)) { 70 // Check for invalid characters in the filename. 71 string m("Invalid character '"); 72 m += ch; 73 m += "' in filename \""; 74 m += file; 75 m += '"'; 76 throw m; 77 } 78 } 79 } 80 if (safefile[0] == '-') { 81 // If the filename starts with a '-', protect it from being treated as 82 // an option by prepending ".\". 83 safefile.insert(0, ".\\"); 84 } 85 if (need_to_quote) { 86 safefile.insert(0, "\""); 87 safefile += '"'; 88 } 89 #else 90 string::size_type p = 0; 91 if (!safefile.empty() && safefile[0] == '-') { 92 // If the filename starts with a '-', protect it from being treated as 93 // an option by prepending "./". 
94 safefile.insert(0, "./"); 95 p = 2; 96 } 97 while (p < safefile.size()) { 98 // Don't escape some safe characters which are common in filenames. 99 unsigned char ch = safefile[p]; 100 if (!isalnum(ch) && strchr("/._-", ch) == NULL) { 101 safefile.insert(p, "\\"); 102 ++p; 103 } 104 ++p; 105 } 106 #endif 107 return safefile; 108 } 109 110 /// Remove a directory and contents. 111 void 112 rm_rf(const string &filename) 113 { 114 // Check filename exists and is actually a directory 115 struct stat sb; 116 if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return; 117 118 string safefile = shell_protect(filename); 119 #ifdef __WIN32__ 120 # if 1 121 static int win95 = -1; 122 if (win95 == -1) { 123 OSVERSIONINFO info; 124 memset(&info, 0, sizeof(OSVERSIONINFO)); 125 info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); 126 if (GetVersionEx(&info)) { 127 win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS); 128 } 129 } 130 131 if (win95) { 132 // for 95 like systems: 133 system("deltree /y \"" + safefile + "\""); 134 } else { 135 // for NT like systems: 136 system("rd /s /q \"" + safefile + "\""); 137 } 138 # else 139 safefile.append("\0", 2); 140 SHFILEOPSTRUCT shfo; 141 memset((void*)&shfo, 0, sizeof(shfo)); 142 shfo.hwnd = 0; 143 shfo.wFunc = FO_DELETE; 144 shfo.pFrom = safefile.data(); 145 shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT; 146 (void)SHFileOperation(&shfo); 147 # endif 148 #else 149 system("rm -rf " + safefile); 150 #endif 151 } 42 152 #else 43 153 #define CONVERT_TO_STRING(FMT) \ 44 154 char buf[BUFSIZE];\ -
ChangeLog
1 2006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com> 2 3 omega-0.9.6c: 4 * omindex.cc: Fix wrong timestamp comparison in cache logic 5 * scriptindex.cc: Add lastmod and size records and values. 6 * excel2text, outlook2text.in: New scripts 7 8 2006-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com> 9 10 omega-0.9.6b: 11 * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks. 12 13 2006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com> 14 15 omega-0.9.6a: 16 * omindex.cc: Added cached virtual directories (zip,msg,pst,...). 17 Consistently log stderr to /var/log/omega/omindex-error.log. 18 1 19 Wed Apr 20 07:00:56 GMT 2011 Olly Betts <olly@survex.com> 2 20 3 21 * NEWS: Fix typo; clarify wording. -
scriptindex.cc
4 4 * Copyright 2001 Sam Liddicott 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts 7 * Copyright 2006,2007 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 39 40 #include <cstdio> 40 41 #include <ctime> 41 42 #include "safeunistd.h" 43 #include <sys/stat.h> 42 44 43 45 #include "commonhelp.h" 44 46 #include "hashterm.h" 45 47 #include "loadfile.h" 46 48 #include "myhtmlparse.h" 49 #include "str.h" 47 50 #include "stringutils.h" 48 51 #include "utf8truncate.h" 49 52 #include "utils.h" 53 #include "values.h" 50 54 51 55 #include "gnu_getopt.h" 52 56 … … 431 435 { 432 436 string line; 433 437 size_t line_no = 0; 438 time_t last_mod = 0; 439 long file_size = 0; 440 441 if (strcmp(fname,"<stdin>") != 0) { 442 struct stat statbuf; 443 stat(fname, &statbuf); 444 if (! statbuf.st_size) { 445 cout << "Empty \"" << fname << "\" - skipping\n"; 446 return false; 447 } 448 file_size = statbuf.st_size; 449 last_mod = statbuf.st_mtime; 450 } 434 451 while (!stream.eof() && getline(stream, line)) { 435 452 ++line_no; 436 453 Xapian::Document doc; … … 677 694 for (i = fields.begin(); i != fields.end(); ++i) { 678 695 list<string>::const_iterator j; 679 696 for (j = i->second.begin(); j != i->second.end(); j++) { 697 if (i->first == "lastmod") last_mod = 0; 698 if (i->first == "size") file_size = 0; 680 699 data += i->first; 681 700 data += '='; 682 701 data += *j; 683 702 data += '\n'; 684 703 } 685 704 } 705 // provide some extra fields if not already provided by the script 706 if (last_mod) { // if indexed per filename 707 data += "lastmod="+str(last_mod)+'\n'; 708 doc.add_value(VALUE_LASTMOD, int_to_binary_string((uint32_t)last_mod)); 709 } 710 if (file_size) { // if indexed per filename 711 data += "size="+str(file_size)+'\n'; 712 doc.add_value(VALUE_SIZE, Xapian::sortable_serialise(file_size)); 713 } 
686 714 687 715 // Put the data in the document 688 716 doc.set_data(data); -
excel2text
#! /bin/sh
# excel2text: write the spreadsheet named by the first argument to stdout as
# CSV, with number-only fields stripped.
#
# Usage: excel2text FILE
#
# Stage 1: xls2csv converts FILE to CSV (-q0 is passed through unchanged;
#          presumably it controls quoting -- confirm against xls2csv(1)).
# Stage 2: sed deletes every run of digits and dots that is immediately
#          followed by a comma, i.e. purely numeric fields and their separator.
xls2csv -q0 "$1" |
    sed -r -e 's/[0123456789.]+,//g'
mimeexplode
#!/usr/bin/perl -w

=head1 NAME

mimeexplode - explode one or more MIME messages

=head1 SYNOPSIS

    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...

    someprocess | mimeexplode -

=head1 DESCRIPTION

Takes one or more files from the command line that contain MIME
messages, and explodes their contents out into subdirectories
of the current working directory.

Without C<-d> the subdirectories are called C<msg0>, C<msg1>, C<msg2>,
etc.  With C<-d dir> the first message is exploded into C<dir> and any
further messages into C<dir1>, C<dir2>, etc.  Existing directories are
reused, so the same message is always exploded into the same directory.

With no file arguments, or with a file argument of C<->, the message is
read from standard input.

Information about each extracted part is output to the stdout, like this:

    Part: msg3/filename-1.dat (text/plain)
    Part: msg3/filename-2.dat (text/plain)
    Part: msg5/dir.gif (image/gif)
    Part: msg5/face.jpg (image/jpeg)

This was written as an example of the MIME:: modules in the
MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
way of splitting a MIME message if you need to decode something, and
you don't have a MIME mail reader on hand.

=head1 COMMAND LINE OPTIONS

    -d outdir

=head1 AUTHOR

Eryq C<eryq@zeegee.com>, in a big hurry...
Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir

=cut

#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!

# Three-argument open and lexical file handles need perl 5.6.
require 5.006;

use strict;

use MIME::Parser;
use Getopt::Std;

my %opts;

#------------------------------------------------------------
# dump_entity - dump an entity's file info
#
# Recurses through multipart entities and prints one "Part:" line per leaf
# part, giving the file it was written to and its MIME type.
#------------------------------------------------------------
sub dump_entity {
    my $ent = shift;
    my @parts = $ent->parts;

    if (@parts) {                     # multipart...
        dump_entity($_) foreach @parts;
    }
    else {                            # single part...
        # A part can lack a body handle (or an on-disk path); don't die
        # on a method call on undef in that case.
        my $body = $ent->bodyhandle;
        my $path = $body ? $body->path : undef;
        print " Part: ", (defined $path ? $path : "(no body file)"),
              " (", scalar($ent->head->mime_type), ")\n";
    }
}

#------------------------------------------------------------
# outdir_for - name of the output directory for the Nth message
#
# $base is the -d argument (may be undef/empty), $index counts messages
# from 0.  With a base: base, base1, base2, ...  Without: msg0, msg1, ...
#------------------------------------------------------------
sub outdir_for {
    my ($base, $index) = @_;

    if (defined $base && length $base) {
        return $index ? $base . $index : $base;
    }
    return "msg" . $index;
}

#------------------------------------------------------------
# main
#------------------------------------------------------------
sub main {
    # make sure the same message gets exploded into the same dir
    getopts('d:', \%opts);

    # Go through messages (default to stdin when none are named):
    @ARGV or unshift @ARGV, "-";
    my $index = 0;
    foreach my $file (@ARGV) {
        my $outdir = outdir_for($opts{d}, $index++);

        # Sanity:
        (-d $outdir) or mkdir($outdir, 0755)
            or die "couldn't make $outdir: $!";
        (-w $outdir) or die "cwd $outdir not writable!";

        # Create a new parser object:
        my $parser = MIME::Parser->new;
        ### $parser->parse_nested_messages('REPLACE');

        # Optional: set up parameters that will affect how it extracts
        # documents from the input stream:
        $parser->output_dir($outdir);

        # Open the input.  "-" means stdin; everything else is opened
        # read-only with three-argument open so that characters such as
        # "|", ">" or surrounding whitespace in a file name are never
        # interpreted as a mode or a command.
        my $fh;
        if ($file eq '-') {
            $fh = \*STDIN;
        }
        else {
            open($fh, '<', $file) or die "couldn't open $file: $!";
        }

        # Parse the input stream.  The parser may report failure by
        # throwing, so trap that in order to really continue with the
        # next message as the diagnostic promises.
        my $entity = eval { $parser->parse($fh) };
        unless ($entity) {
            my $why = $@ ? " ($@)" : "";
            $why =~ s/\s+\)$/)/;
            print STDERR "Couldn't parse MIME in $file; continuing...$why\n";
        }
        close $fh unless $file eq '-';

        # Congratulations: you now have a (possibly multipart) MIME entity!
        dump_entity($entity) if $entity;
        ### $entity->dump_skeleton if $entity;
    }
    1;
}

exit(main() ? 0 : -1);
msgconvert.pl
1 #!/usr/bin/perl -w 2 # 3 # msgconvert.pl: 4 # 5 # Convert .MSG files (made by Outlook (Express)) to multipart MIME messages. 6 # 7 # Copyright 2002, 2004, 2006 Matijs van Zuijlen 8 # 9 # This program is free software; you can redistribute it and/or modify it 10 # under the terms of the GNU General Public License as published by the 11 # Free Software Foundation; either version 2 of the License, or (at your 12 # option) any later version. 13 # 14 # This program is distributed in the hope that it will be useful, but 15 # WITHOUT ANY WARRANTY; without even the implied warranty of 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 17 # Public License for more details. 18 # 19 # CHANGES: 20 # 20020715 Recognize new items 'Cc', mime type of attachment, long 21 # filename of attachment, and full headers. Attachments turn out 22 # to be numbered, so a regexp is now used to recognize label of 23 # items that are attachments. 24 # 20020831 long file name will definitely be used if present. Full headers 25 # and mime type information are used when present. Created 26 # generic system for specifying known items to be skipped. 27 # Unexpected contents is never reason to bail out anymore. Added 28 # support for usage message and option processing (--verbose). 29 # 20040104 Handle address data slightly better, make From line less fake, 30 # make $verbose and $skippable_entries global vars, handle HTML 31 # variant of body text if present (though not optimally). 32 # 20040214 Fix typos and incorrect comments. 33 # 20040307 - Complete rewrite: All functional parts are now in the package 34 # MSGParser; 35 # - Creation of MIME::Entity object is delayed until the output 36 # routines, which means all data is known; This means I can 37 # create a multipart/alternative body. 38 # - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for 39 # the information). 
40 # 20040514 Check if $self->{HEAD} actually exists before trying to add its 41 # contents to the output Mime object's header data. 42 # (Bug reported by Thomas Ng). 43 # Don't produce multipart messages if not needed. 44 # (Bug reported by Justin B. Scout). 45 # 20040529 Correctly format OLEDATE. 46 # 20040530 - Extract date from property 0047 (thanks, Marc Goodman). 47 # - Use address data to make To: and Cc: lines complete 48 # - Use the in-reply-to property 49 # - More unknown properties named. 50 # - Found another property containing an SMTP address. 51 # - Put non-SMTP type addresses back in output. 52 # 20040825 Replace 'our' to declare globals with 'use vars'. This means 53 # the globals are now properly scoped inside the package and not 54 # the file. 55 # This also fixes the bug that this program did not work on perl 56 # versions below 5.6. (Bug reported by Tim Gustafson) 57 # 20060218 More sensible encoding warnings. 58 # 20060219 Move OLE parsing to main program. 59 # Parse nested MSG files (Bug reported by Christof Lukas). 60 # 20060225 Simplify code. 61 # 62 63 # 64 # Import modules. 
65 # 66 package MSGParser; 67 use strict; 68 use OLE::Storage_Lite; 69 use MIME::Entity; 70 use MIME::Parser; 71 use Date::Format; 72 use POSIX qw(mktime); 73 use constant DIR_TYPE => 1; 74 use constant FILE_TYPE => 2; 75 76 use vars qw($skipproperties $skipheaders); 77 # 78 # Descriptions partially based on mapitags.h 79 # 80 $skipproperties = { 81 # Envelope properties 82 '000B' => "Conversation key?", 83 '001A' => "Type of message", 84 '003B' => "Sender address variant", 85 '003D' => "Contains 'Re: '", 86 '003F' => "'recieved by' id", 87 '0040' => "'recieved by' name", 88 '0041' => "Sender variant address id", 89 '0042' => "Sender variant name", 90 '0043' => "'recieved representing' id", 91 '0044' => "'recieved representing' name", 92 '0046' => "Read receipt address id", 93 '0051' => "'recieved by' search key", 94 '0052' => "'recieved representing' search key", 95 '0053' => "Read receipt search key", 96 '0064' => "Sender variant address type", 97 '0065' => "Sender variant address", 98 '0070' => "Conversation topic", 99 '0071' => "Conversation index", 100 '0075' => "'recieved by' address type", 101 '0076' => "'recieved by' email address", 102 '0077' => "'recieved representing' address type", 103 '0078' => "'recieved representing' email address", 104 '007F' => "something like a message id", 105 # Recipient properties 106 '0C19' => "Reply address variant", 107 '0C1D' => "Reply address variant", 108 '0C1E' => "Reply address type", 109 # Non-transmittable properties 110 '0E02' => "?Should BCC be displayed", 111 '0E0A' => "sent mail id", 112 '0E1D' => "Subject w/o Re", 113 '0E27' => "64 bytes: Unknown", 114 '0FF6' => "Index", 115 '0FF9' => "Index", 116 '0FFF' => "Address variant", 117 # Content properties 118 '1008' => "Summary or something", 119 '1009' => "RTF Compressed", 120 # 'Common property' 121 '3001' => "Display name", 122 '3002' => "Address Type", 123 '300B' => "'Search key'", 124 # Attachment properties 125 '3702' => "Attachment encoding", 126 '3703' => 
"Attachment extension", 127 '3709' => "'Attachment rendering'", # Maybe an icon or something? 128 '3713' => "Icon URL?", 129 # 'Mail user' 130 '3A20' => "Address variant", 131 # 3900 -- 39FF: 'Address book' 132 '39FF' => "7 bit display name", 133 # 'Display table properties' 134 '3FF8' => "Routing data?", 135 '3FF9' => "Routing data?", 136 '3FFA' => "Routing data?", 137 '3FFB' => "Routing data?", 138 # 'Transport-defined envelope property' 139 '4029' => "Sender variant address type", 140 '402A' => "Sender variant address", 141 '402B' => "Sender variant name", 142 '5FF6' => "Recipient name", 143 '5FF7' => "Recipient address variant", 144 # 'Provider-defined internal non-transmittable property' 145 '6740' => "Unknown, binary data", 146 # User defined id's 147 '8000' => "Content Class", 148 '8002' => "Unknown, binary data", 149 }; 150 151 $skipheaders = { 152 "MIME-Version" => 1, 153 "Content-Type" => 1, 154 "Content-Transfer-Encoding" => 1, 155 "X-Mailer" => 1, 156 "X-Msgconvert" => 1, 157 "X-MS-Tnef-Correlator" => 1, 158 "X-MS-Has-Attach" => 1, 159 }; 160 161 use constant ENCODING_UNICODE => '001F'; 162 use constant KNOWN_ENCODINGS => { 163 '000D' => 'Directory', 164 '001F' => 'Unicode', 165 '001E' => 'Ascii?', 166 '0102' => 'Binary', 167 }; 168 169 use constant MAP_ATTACHMENT_FILE => { 170 '3701' => ["DATA", 0], # Data 171 '3704' => ["SHORTNAME", 1], # Short file name 172 '3707' => ["LONGNAME", 1], # Long file name 173 '370E' => ["MIMETYPE", 1], # mime type 174 '3716' => ["DISPOSITION", 1], # disposition 175 }; 176 177 use constant MAP_SUBITEM_FILE => { 178 '1000' => ["BODY_PLAIN", 0], # Body 179 '1013' => ["BODY_HTML", 0], # HTML Version of body 180 '0037' => ["SUBJECT", 1], # Subject 181 '0047' => ["SUBMISSION_ID", 1], # Seems to contain the date 182 '007D' => ["HEAD", 1], # Full headers 183 '0C1A' => ["FROM", 1], # Reply-To: Name 184 '0C1E' => ["FROM_ADDR_TYPE", 1], # From: Address type 185 '0C1F' => ["FROM_ADDR", 1], # Reply-To: Address 186 '0E04' => ["TO", 1], 
# To: Names 187 '0E03' => ["CC", 1], # Cc: Names 188 '1035' => ["MESSAGEID", 1], # Message-Id 189 '1042' => ["INREPLYTO", 1], # In reply to Message-Id 190 }; 191 192 use constant MAP_ADDRESSITEM_FILE => { 193 '3001' => ["NAME", 1], # Real name 194 '3002' => ["TYPE", 1], # Address type 195 '403D' => ["TYPE", 1], # Address type 196 '3003' => ["ADDRESS", 1], # Address 197 '403E' => ["ADDRESS", 1], # Address 198 '39FE' => ["SMTPADDRESS", 1], # SMTP Address variant 199 }; 200 201 # 202 # Main body of module 203 # 204 205 sub new { 206 my $that = shift; 207 my $class = ref $that || $that; 208 209 my $self = { 210 ATTACHMENTS => [], 211 ADDRESSES => [], 212 VERBOSE => 0, 213 HAS_UNICODE => 0, 214 FROM_ADDR_TYPE => "", 215 }; 216 bless $self, $class; 217 } 218 219 # 220 # Main sub: parse the PPS tree, and return 221 # 222 sub parse { 223 my $self = shift; 224 my $PPS = shift or die "Internal error: No PPS tree"; 225 $self->_RootDir($PPS); 226 } 227 228 sub mime_object { 229 my $self = shift; 230 231 my $bodymime; 232 my $mime; 233 234 if ($self->_IsMultiPart) { 235 # Construct a multipart message object 236 237 $mime = MIME::Entity->build(Type => "multipart/mixed"); 238 239 # Set the entity that we'll save the body parts to. If there's more than 240 # one part, it's a new entity, otherwise, it's the main $mime object. 
241 if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) { 242 $bodymime = MIME::Entity->build( 243 Type => "multipart/alternative", 244 Encoding => "8bit", 245 ); 246 $mime->add_part($bodymime); 247 } else { 248 $bodymime = $mime; 249 } 250 if ($self->{BODY_PLAIN}) { 251 $self->_SaveAttachment($bodymime, { 252 MIMETYPE => 'text/plain; charset=ISO-8859-1', 253 ENCODING => '8bit', 254 DATA => $self->{BODY_PLAIN}, 255 DISPOSITION => 'inline', 256 }); 257 } 258 if ($self->{BODY_HTML}) { 259 $self->_SaveAttachment($bodymime, { 260 MIMETYPE => 'text/html', 261 ENCODING => '8bit', 262 DATA => $self->{BODY_HTML}, 263 DISPOSITION => 'inline', 264 }); 265 } 266 foreach my $att (@{$self->{ATTACHMENTS}}) { 267 $self->_SaveAttachment($mime, $att); 268 } 269 } elsif ($self->{BODY_PLAIN}) { 270 # Construct a single part message object with a plain text body 271 $mime = MIME::Entity->build( 272 Type => "text/plain", 273 Data => $self->{BODY_PLAIN} 274 ); 275 } elsif ($self->{BODY_HTML}) { 276 # Construct a single part message object with an HTML body 277 $mime = MIME::Entity->build( 278 Type => "text/html", 279 Data => $self->{BODY_HTML} 280 ); 281 } 282 283 $self->_CopyHeaderData($mime); 284 285 $self->_SetHeaderFields($mime); 286 287 return $mime; 288 } 289 290 # Actually output the message in mbox format 291 sub print { 292 my $self = shift; 293 294 my $mime = $self->mime_object; 295 296 # Construct From line from whatever we know. 297 my $string = ""; 298 $string = ( 299 $self->{FROM_ADDR_TYPE} eq "SMTP" ? 300 $self->{FROM_ADDR} : 301 'someone@somewhere' 302 ); 303 $string =~ s/\n//g; 304 305 # The date used here is not really important. 306 print "From ", $string, " ", scalar localtime, "\n"; 307 $mime->print(\*STDOUT); 308 print "\n"; 309 } 310 311 sub set_verbosity { 312 my ($self, $verbosity) = @_; 313 defined $verbosity or die "Internal error: no verbosity level"; 314 $self->{VERBOSE} = $verbosity; 315 } 316 317 # 318 # Below are functions that walk the PPS tree. 
# The *Dir functions handle processing the directory nodes of the tree
# (mainly, iterating over the children), whereas the *Item functions handle
# processing the items in the directory (if such an item is itself a
# directory, it will in turn be processed by the relevant *Dir function).
#

#
# _RootDir: walk the Root Entry.
# The OLE file consists of a single entry called Root Entry, which has
# several children. Each child is handed to _SubItem.
#
sub _RootDir {
  my ($self, $PPS) = @_;

  foreach my $child (@{$PPS->{Child} || []}) {
    $self->_SubItem($child);
  }
}

# Dispatch one child of the root entry on its PPS type (directory or file).
sub _SubItem {
  my ($self, $PPS) = @_;

  if ($PPS->{Type} == DIR_TYPE) {
    $self->_SubItemDir($PPS);
  } elsif ($PPS->{Type} == FILE_TYPE) {
    $self->_SubItemFile($PPS);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Handle a directory below the root: a recipient, an attachment, or
# something we do not know about. Also records the OLE date of the first
# directory seen (see _GetOLEDate).
sub _SubItemDir {
  my ($self, $PPS) = @_;

  $self->_GetOLEDate($PPS);

  # Names have been through _NormalizeWhiteSpace, so every non-word
  # character (such as the '.' and '#') shows up as a space here.
  my $name = $self->_GetName($PPS);

  if ($name =~ /__recip_version1 0_ /) {          # Address of one recipient
    $self->_AddressDir($PPS);
  } elsif ($name =~ /__attach_version1 0_ /) {    # Attachment
    $self->_AttachmentDir($PPS);
  } else {
    $self->_UnknownDir($name);
  }
}

# Handle a file below the root: store its data on $self under the key
# given by MAP_SUBITEM_FILE, or report it as unknown.
sub _SubItemFile {
  my ($self, $PPS) = @_;

  my $name = $self->_GetName($PPS);
  my ($property, $encoding) = $self->_ParseItemName($name);

  $self->_MapProperty($self, $PPS->{Data}, $property,
    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
}

# Collect the properties of one recipient directory into a fresh hash and
# append it to $self->{ADDRESSES}.
sub _AddressDir {
  my ($self, $PPS) = @_;

  my $address = {
    NAME    => undef,
    ADDRESS => undef,
    TYPE    => "",
  };
  foreach my $child (@{$PPS->{Child} || []}) {
    $self->_AddressItem($child, $address);
  }
  push @{$self->{ADDRESSES}}, $address;
}

# Process one entry of a recipient directory, filling in $addr_info.
sub _AddressItem {
  my ($self, $PPS, $addr_info) = @_;

  my $name = $self->_GetName($PPS);

  # DIR Entries: There should be none.
  if ($PPS->{Type} == DIR_TYPE) {
    $self->_UnknownDir($name);
  } elsif ($PPS->{Type} == FILE_TYPE) {
    my ($property, $encoding) = $self->_ParseItemName($name);
    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Collect the properties of one attachment directory into a fresh hash
# (with generic binary defaults) and append it to $self->{ATTACHMENTS}.
sub _AttachmentDir {
  my ($self, $PPS) = @_;

  my $attachment = {
    SHORTNAME   => undef,
    LONGNAME    => undef,
    MIMETYPE    => 'application/octet-stream',
    ENCODING    => 'base64',
    DISPOSITION => 'attachment',
    DATA        => undef
  };
  foreach my $child (@{$PPS->{Child} || []}) {
    $self->_AttachmentItem($child, $attachment);
  }
  push @{$self->{ATTACHMENTS}}, $attachment;
}

# Process one entry of an attachment directory, filling in $att_info.
# A directory carrying property 3701 is an embedded MSG file: it is parsed
# recursively and attached as message/rfc822.
sub _AttachmentItem {
  my ($self, $PPS, $att_info) = @_;

  my $name = $self->_GetName($PPS);

  my ($property, $encoding) = $self->_ParseItemName($name);

  if ($PPS->{Type} == DIR_TYPE) {

    # $property is undef when the name is not a __substg1.0_ entry.
    if (defined $property and $property eq '3701') {	# Nested MSG file
      my $msgp = MSGParser->new();
      # Let the nested parser report skipped parts exactly like the outer
      # one does; otherwise --verbose output silently stops at the first
      # embedded message.
      $msgp->set_verbosity($self->{VERBOSE} || 0);
      $msgp->parse($PPS);
      my $data = $msgp->mime_object->as_string;
      $att_info->{DATA} = $data;
      $att_info->{MIMETYPE} = 'message/rfc822';
      $att_info->{ENCODING} = '8bit';
    } else {
      $self->_UnknownDir($name);
    }

  } elsif ($PPS->{Type} == FILE_TYPE) {
    $self->_MapProperty($att_info, $PPS->{Data}, $property,
      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
  } else {
    warn "Unknown entry type: $PPS->{Type}";
  }
}

# Store $data in $hash under the key that $map assigns to $property.
# Each $map entry is [KEY, STRIP]; when STRIP is true, NUL bytes are
# removed from the data first.
# Returns 1 if the property was mapped, 0 if it is undefined or not in $map.
sub _MapProperty {
  my ($self, $hash, $data, $property, $map) = @_;

  defined $property or return 0;
  my $arr = $map->{$property} or return 0;

  $data =~ s/\000//g if $arr->[1] and defined $data;
  $hash->{$arr->[0]} = $data;

  return 1;
}

# Report a directory entry we do not handle. The name-id directory is
# expected, so it is only mentioned in verbose mode.
sub _UnknownDir {
  my ($self, $name) = @_;

  if ($name eq '__nameid_version1 0') {
    $self->{VERBOSE}
      and warn "Skipping DIR entry $name (Introductory stuff)\n";
    return;
  }
  warn "Unknown DIR entry $name\n";
}

# Report a file entry we do not handle. Entries that are known to be
# uninteresting (the properties stream, anything listed in
# $skipproperties, and 80xx properties) are only mentioned in verbose mode.
sub _UnknownFile {
  my ($self, $name) = @_;

  if ($name eq '__properties_version1 0') {
    $self->{VERBOSE}
      and warn "Skipping FILE entry $name (Properties)\n";
    return;
  }

  my ($property, $encoding) = $self->_ParseItemName($name);
  unless (defined $property) {
    warn "Unknown FILE entry $name\n";
    return;
  }
  if ($skipproperties->{$property}) {
    $self->{VERBOSE}
      and warn "Skipping property $property ($skipproperties->{$property})\n";
    return;
  } elsif ($property =~ /^80/) {
    $self->{VERBOSE}
      and warn "Skipping property $property (user-defined property)\n";
    return;
  } else {
    warn "Unknown property $property\n";
    return;
  }
}

#
# Helper functions
#

# Return the entry name converted from UCS-2 and with every non-word
# character replaced by a space.
sub _GetName {
  my ($self, $PPS) = @_;
  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
}

# Replace every non-word character in $name by a single space.
sub _NormalizeWhiteSpace {
  my ($self, $name) = @_;
  $name =~ s/\W/ /g;
  return $name;
}

# Remember the date of $PPS in $self->{OLEDATE}, unless one was stored
# already. Time2nd is preferred over Time1st; nothing is stored if the
# entry has neither.
sub _GetOLEDate {
  my ($self, $PPS) = @_;
  unless (defined ($self->{OLEDATE})) {
    # Make Date
    my $datearr;
    $datearr = $PPS->{Time2nd};
    $datearr = $PPS->{Time1st} unless($datearr);
    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
  }
}

# Turn a (sec, min, hour, mday, mon, year, ...) array reference into a
# date string of the form used in mail headers.
sub _FormatDate {
  my ($self, $datearr) = @_;

  # TODO: This is a little convoluted. Directly using strftime didn't seem
  # to work.
  my $datetime = mktime(@$datearr);
  return time2str("%a, %d %h %Y %X %z", $datetime);
}

# If we didn't get the date from the original header data, we may be able
# to get it from the SUBMISSION_ID:
# It seems to have the format of a semicolon-separated list of key=value
# pairs. The key l has a value with the format:
# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
# the format YYMMDDHHMMSS.
# Returns a formatted date string taken from the SUBMISSION_ID property, or
# undef if there is no SUBMISSION_ID or it does not contain a timestamp.
sub _SubmissionIdDate {
  my $self = shift;

  # 'return undef' (not a bare return) is deliberate: this sub is called
  # inside an argument list and must always contribute exactly one value.
  my $submission_id = $self->{SUBMISSION_ID} or return undef;
  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
    or return undef;
  # The year has only two digits. Use the POSIX %y window (00-68 => 20xx,
  # 69-99 => 19xx); the previous pivot of 20 dated every message sent from
  # 2020 onwards in the 1920s.
  my $year = $1;
  $year += 100 if $year < 69;
  # NOTE(review): the trailing 'Z' suggests this timestamp is UTC, while
  # _FormatDate goes through mktime -- confirm the intended time zone.
  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
}

# Split a (whitespace-normalized) '__substg1 0_PPPPEEEE' entry name into
# its property id and encoding id, both four characters.
# Warns once per parser about Unicode fields, and about any encoding not
# listed in KNOWN_ENCODINGS.
# Returns (undef, undef) if the name does not have the expected form.
sub _ParseItemName {
  my ($self, $name) = @_;

  if ($name =~ /^__substg1 0_(....)(....)$/) {
    my ($property, $encoding) = ($1, $2);
    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
      warn "This MSG file contains Unicode fields."
	. " This is currently unsupported.\n";
      $self->{HAS_UNICODE} = 1;
    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
    }
    return ($property, $encoding);
  } else {
    return (undef, undef);
  }
}

# Attach $att (a hash with MIMETYPE, ENCODING, DATA, DISPOSITION and
# optionally LONGNAME/SHORTNAME) as a new part of the entity $mime.
sub _SaveAttachment {
  my ($self, $mime, $att) = @_;

  my $ent = $mime->attach(
    Type => $att->{MIMETYPE},
    Encoding => $att->{ENCODING},
    Data => [],
    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
    Disposition => $att->{DISPOSITION}
  );

  # The part is created empty and then filled through its body handle.
  my $handle;
  if ($handle = $ent->open("w")) {
    $handle->print($att->{DATA});
    $handle->close;
  } else {
    warn "Could not write data!";
  }
}

# Record one part ($partname) of the address stored under $adrname,
# warning if it contradicts a value recorded earlier.
# NOTE(review): ADDRESSES is built as an array reference elsewhere in this
# file but is dereferenced as a hash here, and nothing in the visible code
# calls this sub -- confirm before relying on it.
sub _SetAddressPart {
  my ($self, $adrname, $partname, $data) = @_;

  my $address = $self->{ADDRESSES}->{$adrname};
  $data =~ s/\000//g;
  #warn "Processing address data part $partname : $data\n";
  if (defined ($address->{$partname})) {
    if ($address->{$partname} eq $data) {
      warn "Skipping duplicate but identical address information for"
	. " $partname\n" if $self->{VERBOSE};
    } else {
      warn "Address information $partname inconsistent:\n";
      warn "    Original data: $address->{$partname}\n";
      warn "    New data: $data\n";
    }
  } else {
    $address->{$partname} = $data;
  }
}

# Add header $fieldname with $value to $mime, unless the header already has
# a value or $value is empty. This is what makes the fallbacks in
# _SetHeaderFields work: the first source that yields a value wins.
sub _AddHeaderField {
  my ($self, $mime, $fieldname, $value) = @_;

  my $oldvalue = $mime->head->get($fieldname);
  return if $oldvalue;
  $mime->head->add($fieldname, $value) if $value;
}

# Build a 'Name <address>' string from $self->{$tag} and
# $self->{"${tag}_ADDR"}; missing parts are left empty.
sub _Address {
  my ($self, $tag) = @_;
  my $name = $self->{$tag} || "";
  my $address = $self->{$tag . "_ADDR"} || "";
  return "$name <$address>";
}

# Find SMTP addresses for the given list of names.
# $names is a ';'-separated list of display names. Each name that matches a
# collected recipient is expanded to 'Name <address>'; other names are
# passed through unchanged. Returns the result joined with ', ', which is
# the empty string when $names is undefined.
sub _ExpandAddressList {
  my ($self, $names) = @_;

  # No To:/Cc: property in the message: nothing to expand.
  defined $names or return "";

  my $addresspool = $self->{ADDRESSES};
  my @namelist = split /; */, $names;
  my @result;
  name: foreach my $name (@namelist) {
    foreach my $address (@$addresspool) {
      # NAME and ADDRESS start out undefined and stay that way when the
      # recipient entry lacks the corresponding property.
      next unless defined $address->{NAME} and $name eq $address->{NAME};
      my $addresstext = $address->{NAME} . " <";
      if (defined ($address->{SMTPADDRESS})) {
	$addresstext .= $address->{SMTPADDRESS};
      } elsif ($address->{TYPE} eq "SMTP" and defined $address->{ADDRESS}) {
	$addresstext .= $address->{ADDRESS};
      }
      $addresstext .= ">";
      push @result, $addresstext;
      next name;
    }
    push @result, $name;
  }
  return join ", ", @result;
}

# Parse the full header block of the original message.
# Returns the unfolded header object, or undef if there is no header data
# or it could not be parsed.
sub _ParseHead {
  my ($self, $data) = @_;
  defined $data or return undef;
  # Parse full header data if we got that.
  my $parser = MIME::Parser->new();
  $parser->output_to_core(1);
  $parser->decode_headers(1);
  # Turn a leading 'Microsoft Mail ...' line into a header line (it is
  # dropped again later through $skipheaders).
  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
  my $entity = $parser->parse_data($data);
  unless ($entity) {
    # Without an entity there is no head to return; going on would call a
    # method on an undefined value.
    warn "Couldn't parse full headers!";
    return undef;
  }
  my $head = $entity->head;
  $head->unfold;
  return $head;
}

# Find out if we need to construct a multipart message: true if there are
# both an HTML and a plain text body, or at least one attachment.
sub _IsMultiPart {
  my $self = shift;

  return (
    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
      or @{$self->{ATTACHMENTS}}>0
  );
}

# Copy original header data, except for the headers in $skipheaders.
# Note: This should contain the Date: header.
sub _CopyHeaderData {
  my ($self, $mime) = @_;

  my $head = $self->_ParseHead($self->{HEAD}) or return;

  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
    foreach my $value ($head->get_all($tag)) {
      $mime->head->add($tag, $value);
    }
  }
}

# Set header fields from the MSG properties. Headers that were already
# copied from the original header data are left alone (see
# _AddHeaderField).
sub _SetHeaderFields {
  my ($self, $mime) = @_;

  # If we didn't get the date from the original header data, we may be able
  # to get it from the SUBMISSION_ID:
  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());

  # Third and last chance to set the Date: header; this uses the date the
  # MSG file was saved.
  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
}

package main;
use Getopt::Long;
use Pod::Usage;

# Setup command line processing.
my $verbose = '';
my $help = '';	    # Print help message and exit.
GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
pod2usage(1) if $help;

# Get file name; it is the only positional argument and it is mandatory.
my $file = $ARGV[0];
defined $file or pod2usage(2);
warn "Will parse file: $file\n" if $verbose;

# Load and parse MSG file (is OLE)
my $Msg = OLE::Storage_Lite->new($file);
my $PPS = $Msg->getPpsTree(1);
# The trailing newline keeps Perl from appending the script line number to
# what is a user-facing message.
$PPS or die "$file must be an OLE file\n";

# Parse the PPS tree and write the resulting message to stdout.
my $parser = MSGParser->new();
$parser->set_verbosity(1) if $verbose;
$parser->parse($PPS);
$parser->print();

#
# Usage info follows.
#
__END__

=head1 NAME

msgconvert.pl - Convert Outlook .msg files to mbox format

=head1 SYNOPSIS

msgconvert.pl [options] <file.msg>

  Options:
    --verbose	be verbose
    --help	help message

=head1 OPTIONS

=over 8

=item B<--verbose>

    Print information about skipped parts of the .msg file.

=item B<--help>

    Print a brief help message.

=back

=head1 DESCRIPTION

This program will output the message contained in file.msg in mbox format
on stdout. It will complain about unrecognized OLE parts on
stderr.

=head1 BUGS

Not all data that's in the .MSG file is converted. There simply are some
parts whose meaning escapes me. One of these must contain the date the
message was sent, for example. Formatting of text messages will also be
lost. YMMV.

=cut
Makefile.am
78 78 pkglibbin_PROGRAMS = omega 79 79 dist_pkglibbin_SCRIPTS = outlookmsg2html 80 80 bin_PROGRAMS = omindex scriptindex 81 dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl 81 82 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 82 83 83 84 check_PROGRAMS = htmlparsetest md5test utf8converttest … … 160 161 MAINTAINERCLEANFILES = $(dist_man_MANS) 161 162 endif 162 163 164 CLEANFILES = outlook2text 165 166 outlook2text: $(srcdir)/outlook2text.in Makefile 167 sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@ 168 163 169 if DOCUMENTATION_RULES 164 170 omindex.1: omindex$(EXEEXT) makemanpage 165 171 ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1