Ticket #282: xapian-omega-1.2.5-from-ticket-285-and-cleaned-up-updated-2011-05-13.patch

File xapian-omega-1.2.5-from-ticket-285-and-cleaned-up-updated-2011-05-13.patch, 49.1 KB (added by Olly Betts, 13 years ago)

Patch against trunk shortly after 1.2.5

  • utils.h

     
    3131/** Converts a string to an int. */
    3232int string_to_int(const std::string & s);
    3333
     34void rm_rf(const std::string &filename);
     35
    3436#endif
  • omindex.cc

     
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
    77 * Copyright 2009 Frank J Bruzzaniti
     8 * Copyright 2006,2007,2008 AVL List GesmbH
    89 *
    910 * This program is free software; you can redistribute it and/or
    1011 * modify it under the terms of the GNU General Public License as
     
    6869extern char * mkdtemp(char *);
    6970#endif
    7071
     72#ifndef LIBEXECDIR
     73// must have ending slash
     74//# define LIBEXECDIR "/usr/lib/omega/bin/"
     75# define LIBEXECDIR ""
     76#endif
     77#ifndef PKGDATADIR
     78// must have ending slash
     79# define PKGDATADIR "/usr/share/omega/"
     80#endif
     81
    7182using namespace std;
    7283
    7384#define TITLE_SIZE 128
     
    8192static bool ignore_exclusions = false;
    8293static bool spelling = false;
    8394static bool verbose = false;
     95string error_log; /* used in runfilter.cc */
     96static string baseurl;
     97static string dbpath;
     98static string cache_dir;
    8499static enum {
    85100    EMPTY_BODY_WARN, EMPTY_BODY_INDEX, EMPTY_BODY_SKIP
    86101} empty_body = EMPTY_BODY_WARN;
     
    103118// text are common, so we handle these with a std::map.
    104119static map<string, string> commands;
    105120
     121static void
     122index_directory(const string &path, const string &url_, size_t depth_limit,
     123                map<string, string>& mime_map);
     124
    106125inline static bool
    107126p_notalnum(unsigned int c)
    108127{
     
    328347    skip(file, "unknown MIME type '" + mimetype + "'");
    329348}
    330349
     350static
     351void mkdir_p(const string &path, mode_t mode) {
     352    (void)mode; // FIXME
     353#ifdef __WIN32__
     354    system(("mkdir \"" + shell_protect(path) + "\"").c_str());
     355#else
     356    if (system(("mkdir -p " + shell_protect(path)).c_str()) < 0) { /* FIXME */ }
     357#endif
     358}
     359
     360/*
     361 * unpack .msg/.pst/.rar/.zip into local cache dir and recurse there
     362 */
     363static void
     364index_cached_directory(size_t depth_limit,
     365                       const string &file,
     366                       const string &url,
     367                       const string &ext,
     368                       const string &cmd,
     369                       map<string, string>& mime_map)
     370{
     371    string oldroot = root;
     372    root = cache_dir;
     373    string cache = root+"/."+ext;
     374    string cachedir = cache+url;
     375    struct stat statfile, statcache;
     376    bool extract_cache;
     377#ifdef HAVE_LSTAT
     378    lstat(file.c_str(), &statfile);
     379    lstat(cachedir.c_str(), &statcache);
     380#else
     381    stat(file.c_str(), &statfile);
     382    stat(cachedir.c_str(), &statcache);
     383#endif
     384    extract_cache = true;
     385    // if cachedir exists and if file is older as cachedir and if cachedir existed 5 secs ago,
     386    // then it was already extracted.
     387    if (S_ISDIR(statcache.st_mode)
     388        && S_ISREG(statfile.st_mode)
     389        && (statfile.st_mtime < statcache.st_mtime)
     390        && (statcache.st_mtime < (time_t)(time(NULL)-500))) // not created by nested mkdir call
     391    {
     392        // but is it in the database also? prevent from deleting skipped files
     393        if (verbose)
     394            cout << "Unchanged cache \"" << cachedir << "\" - \"" << file << "\" - skip extraction "
     395                 // << statfile.st_mtime << " < " << statcache.st_mtime
     396                 << endl;
     397        extract_cache = false;
     398    }
     399    if (S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode) ) {
     400        // If last_mod > last_mod_max, we know for sure that the file is new
     401        // or updated.
     402        if (statfile.st_mtime <= last_mod_max) {
     403            // check database timestamp for cached container, esp. for cleaned up caches.
     404            // if already in db we need not to extract again
     405            string urlterm("U");
     406            urlterm += baseurl;
     407            urlterm += "/."+ext+url;
     408            if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
     409                urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
     410       
     411            Xapian::PostingIterator p = db.postlist_begin(urlterm);
     412            if (p != db.postlist_end(urlterm)) {
     413                Xapian::docid docid = *p;
     414                Xapian::Document doc = db.get_document(docid);
     415                string value = doc.get_value(VALUE_LASTMOD);
     416                time_t old_last_mod = binary_string_to_int(value);
     417                if (statfile.st_mtime <= old_last_mod) {
     418                    if (verbose)
     419                        cout << "Cache "<< "."+ext+url << " not newer. Ignored." << endl;
     420                    // The docid should be in updated - the only valid
     421                    // exception is if the URL was long and hashed to the
     422                    // same URL as an existing document indexed in the same
     423                    // batch.
     424                    if (usual(docid < updated.size() && !updated[docid])) {
     425                        updated[docid] = true;
     426                        --old_docs_not_seen;
     427                    }
     428                    root = oldroot;
     429                    return;
     430                }
     431            }
     432        }
     433    }
     434
     435    if (extract_cache) {
     436        if (verbose)
     437            cout << "[EXTRACT into cache " << cachedir << "]" << endl;
     438        if (verbose && S_ISDIR(statcache.st_mode) && S_ISREG(statfile.st_mode))
     439            cout << " ...changed cache \"" << cachedir << "\" - \"" << file << "\" "
     440                 << statfile.st_mtime << " < " << statcache.st_mtime << " time: " << time(NULL)
     441                 << endl;
     442        if (!S_ISDIR(statcache.st_mode))
     443            mkdir_p(cachedir, 0755);
     444        stdout_to_string(cmd);
     445#ifndef __WIN32__
     446        stdout_to_string("chmod -R u+rwx " + shell_protect(cachedir));
     447#endif
     448#ifdef HAVE_LSTAT
     449        lstat(cachedir.c_str(), &statcache);
     450#else
     451        stat(cachedir.c_str(), &statcache);
     452#endif
     453    }
     454
     455    if (S_ISDIR(statcache.st_mode)) {
     456        if (depth_limit == 1) {
     457            cout << "Recursion limit reached for \""<< url << "\" - skipping " << endl;
     458        } else {
     459            // max loop 5, magic start: /.ext+file
     460            index_directory(file + "/."+ext+url, url, depth_limit + 5, mime_map);
     461            if (verbose)
     462                cout << "[CLEANUP " << "rm -rf " << shell_protect(cachedir) << "]" << endl;
     463            rm_rf(cachedir);
     464        }
     465    }
     466    else { // no -p would be fatal here
     467        cout << "cachedir " << shell_protect(cachedir) << " does not exist - skipped" << endl;
     468    }
     469    root = oldroot;
     470}
     471
    331472static void
    332473index_file(const string &file, const string &url, DirectoryIterator & d,
    333            map<string, string>& mime_map)
     474           map<string, string>& mime_map, size_t depth_limit)
    334475{
    335476    string ext;
    336477    const char * dot_ptr = strrchr(d.leafname(), '.');
     
    358499
    359500    string mimetype;
    360501    if (mt == mime_map.end()) {
     502        if (strcasecmp(d.leafname(), "mbox") == 0) {
     503            // Special filename.
     504            mimetype = "message/rfc822";
     505            goto got_mimetype;
     506        }
     507
    361508        mimetype = d.get_magic_mimetype();
    362509        if (mimetype.empty()) {
    363510            skip(file, "Unknown extension and unrecognised format",
     
    370517        mimetype = mt->second;
    371518    }
    372519
     520got_mimetype:
     521
    373522    if (verbose)
    374523        cout << "Indexing \"" << file.substr(root.size()) << "\" as "
    375524             << mimetype << " ... ";
     
    429578            }
    430579        }
    431580    }
     581    // add the db basename to cache_dir
     582    {
     583        ensure_tmpdir(); // FIXME: be lazy!
     584        cache_dir = tmpdir;
     585        const char *p = strrchr(dbpath.c_str(), '/');
     586        // on windows only
     587        if (!p) p = strrchr(dbpath.c_str(), '\\');
     588        if (p) { p++; } else { p = dbpath.c_str(); }
     589        cache_dir += p;
     590    }
    432591
    433592    if (verbose) cout << flush;
    434593
     
    493652            } else {
    494653                // FIXME: What charset is the file?  Look at contents?
    495654            }
     655#if 0 // FIXME: this won't work as omindex will have the database locked...
     656    } else if (mimetype == "message/rfc822") { // // => mbox2script
     657        //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla)
     658        string cmd = LIBEXECDIR"mbox2omega " + shell_protect(file) + error_log+"| "
     659            "scriptindex " + shell_protect(dbpath) + " "PKGDATADIR"mbox2script.script";
     660        try {
     661            dump = stdout_to_string(cmd);
     662        } catch (ReadError) {
     663            cout << "\"" << cmd << "\" failed - skipping" << endl;
     664            return;
     665        }
     666#endif
    496667        } else if (mimetype == "application/pdf") {
    497668            string safefile = shell_protect(file);
    498669            string cmd = "pdftotext -enc UTF-8 " + safefile + " -";
     
    721892
    722893            generate_sample_from_csv(dump, sample);
    723894        } else if (mimetype == "application/vnd.ms-outlook") {
    724             string cmd = get_pkglibbindir() + "/outlookmsg2html " + shell_protect(file);
    725             MyHtmlParser p;
    726             p.ignore_metarobots();
     895            string oldroot = root;
     896            struct stat statcache;
     897            char olddir[256];
     898           
     899            if (depth_limit == 1) {
     900                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     901                return;
     902            }
     903            string cmd = LIBEXECDIR"outlook2text "+shell_protect(file);
     904            // unpack multiparts and attachments. so we have to chdir first
     905            string fulldir = cache_dir+"/.msg"+url;
     906            if (getcwd(olddir, 256) == NULL) { /* FIXME */ }
     907#ifdef HAVE_LSTAT
     908            lstat(fulldir.c_str(), &statcache);
     909#else
     910            stat(fulldir.c_str(), &statcache);
     911#endif
     912            if (!S_ISDIR(statcache.st_mode)) {
     913                mkdir_p(fulldir, 0755);
     914            }
    727915            try {
    728                 dump = stdout_to_string(cmd);
    729                 // FIXME: what should the default charset be?
    730                 p.parse_html(dump, "iso-8859-1", false);
    731             } catch (const string & newcharset) {
    732                 p.reset();
    733                 p.ignore_metarobots();
    734                 p.parse_html(dump, newcharset, true);
     916                if (chdir(fulldir.c_str()) < 0) { /* FIXME */ }
     917                size_t new_limit = depth_limit;
     918                if (new_limit) --new_limit;
     919                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     920                if (chdir(olddir) < 0) { /* FIXME */ }
    735921            } catch (ReadError) {
    736                 skip_cmd_failed(file, cmd);
    737                 return;
     922                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     923                if (chdir(olddir) < 0) { /* FIXME */ }
     924                root = oldroot;
     925            } catch (...) {
     926                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     927                if (chdir(olddir) < 0) { /* FIXME */ }
     928                root = oldroot;
     929                throw;
    738930            }
    739             dump = p.dump;
    740             title = p.title;
    741             keywords = p.keywords;
    742             sample = p.sample;
    743             author = p.author;
     931            return;
    744932        } else if (mimetype == "image/svg+xml") {
    745933            SvgParser svgparser;
    746934            svgparser.parse_html(d.file_to_string());
     
    769957            if (idx != string::npos) {
    770958                dump.assign(desc, idx + 1, string::npos);
    771959            }
     960        } else if (mimetype == "application/x-zip") {
     961            string oldroot = root;
     962            if (depth_limit == 1) {
     963                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     964                return;
     965            }
     966            // overwrite
     967            string cmd = "unzip -u -P. -o " +shell_protect(file) + " -d " +shell_protect(cache_dir+"/.zip"+url+"/");
     968            try {
     969                size_t new_limit = depth_limit;
     970                if (new_limit) --new_limit;
     971                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     972            } catch (ReadError) {
     973                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     974                root = oldroot;
     975            } catch (...) {
     976                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     977                root = oldroot;
     978                throw;
     979            }
     980            return;
     981        } else if (mimetype == "application/x-rar") {
     982            string oldroot = root;
     983            if (depth_limit == 1) {
     984                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     985                return;
     986            }
     987
     988            // overwrite
     989            string cmd = "unrar x -o+ " +shell_protect(file) + " "
     990                + shell_protect(cache_dir+"/.rar"+url+"/");
     991            try {
     992                size_t new_limit = depth_limit;
     993                if (new_limit) --new_limit;
     994                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     995            } catch (ReadError) {
     996                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     997                root = oldroot;
     998            } catch (...) {
     999                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1000                root = oldroot;
     1001                throw;
     1002            }
     1003            return;
     1004        } else if (mimetype == "application/vnd.ms-outlook-pst") {
     1005            string oldroot = root;
     1006            if (depth_limit == 1) {
     1007                skip(file, "Recursion limit reached for compound file", SKIP_SHOW_FILENAME);
     1008                return;
     1009            }
     1010            // unpack attachments also, together with mbox files
     1011            string cmd = "readpst -r -cv -w -o "
     1012                + shell_protect(cache_dir+"/.pst"+url+"/")+" "+shell_protect(file);
     1013            try {
     1014                size_t new_limit = depth_limit;
     1015                if (new_limit) --new_limit;
     1016                index_cached_directory(new_limit, file, url, ext, cmd, mime_map);
     1017            } catch (ReadError) {
     1018                root = oldroot;
     1019                cout << "failed " << cmd << " << in index_cached_directory" << endl;
     1020            } catch (...) {
     1021                root = oldroot;
     1022                cout << "Caught unknown exception in index_cached_directory, rethrowing" << endl;
     1023                throw;
     1024            }
     1025            return;
    7721026        } else {
    7731027            // Don't know how to index this type.
    7741028            skip_unknown_mimetype(file, mimetype);
     
    9851239                        break;
    9861240                    }
    9871241                    case DirectoryIterator::REGULAR_FILE:
    988                         index_file(file, url, d, mime_map);
     1242                        index_file(file, url, d, mime_map, depth_limit);
    9891243                        break;
    9901244                    default:
    9911245                        skip(file, "Not a regular file",
     
    10091263    bool overwrite = false;
    10101264    // If delete_removed_documents is true, delete any documents we don't see.
    10111265    bool delete_removed_documents = true;
    1012     string baseurl;
    10131266    size_t depth_limit = 0;
    10141267
    10151268    static const struct option longopts[] = {
     
    11331386    mime_map["ppt"] = "application/vnd.ms-powerpoint";
    11341387    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
    11351388    mime_map["msg"] = "application/vnd.ms-outlook"; // Outlook .msg email
     1389#ifdef HAVE_READPST
     1390    // Outlook message folder (.pst)
     1391    mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | uudeview (libpst)
     1392#endif
     1393
     1394    // Misc compound formats:
     1395    mime_map["mbox"] = "message/rfc822";                // => mbox2omega
     1396    mime_map["mbx"] = "message/rfc822";                // => mbox2omega
     1397#ifndef _MSC_VER
     1398    mime_map["zip"] = "application/x-zip"; // recursive scanning
     1399#  ifdef HAVE_UNRAR
     1400    mime_map["rar"] = "application/x-rar"; // recursive scanning
     1401#  endif
     1402#endif
    11361403
    11371404    // Perl:
    11381405    mime_map["pl"] = "text/x-perl";
     
    11901457        argv[1] = const_cast<char *>("--version");
    11911458    }
    11921459
    1193     string dbpath;
    11941460    int getopt_ret;
    11951461    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfSVe:i",
    11961462                                         longopts, NULL)) != -1) {
     
    13401606        baseurl += '/';
    13411607    }
    13421608
     1609    string log_dir = "./"; // FIXME: need to set log_dir to something appropriate.
     1610    error_log = " 2>>"+log_dir+"omindex-error.log";
     1611
    13431612    if (optind >= argc || optind + 2 < argc) {
    13441613        cerr << PROG_NAME": you must specify a directory to index.\n"
    13451614"Do this either as a single directory (corresponding to the base URL)\n"
  • outlook2text.in

     
     1#! /bin/sh
     2# converts msg to mbox and extract attachments
     3# either be in the cache dir, or accept it as 2nd arg
     4if [ -n $2 ]; then
     5  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "$2"
     6else
     7  # already is in the cache dir
     8  base=`basename "$1" .msg`
     9  @MSGCONVERT@ "$1" | @MIMEEXPLODE@ -d "${base}"
     10fi
  • runfilter.cc

     
    5454
    5555using namespace std;
    5656
     57extern string error_log;
     58
    5759string
    5860stdout_to_string(const string &cmd)
    5961{
    6062    string out;
     63    string tmp = cmd;
     64    tmp += error_log;
    6165#if defined HAVE_FORK && defined HAVE_SOCKETPAIR && defined HAVE_SETRLIMIT
    6266    // We want to be able to get the exit status of the child process.
    6367    signal(SIGCHLD, SIG_DFL);
     
    100104        }
    101105#endif
    102106
    103         execl("/bin/sh", "/bin/sh", "-c", cmd.c_str(), (void*)NULL);
     107        execl("/bin/sh", "/bin/sh", "-c", tmp.c_str(), (void*)NULL);
    104108        _exit(-1);
    105109    }
    106110
     
    139143        throw ReadError();
    140144    }
    141145#else
    142     FILE * fh = popen(cmd.c_str(), "r");
     146    FILE * fh = popen(tmp.c_str(), "r");
    143147    if (fh == NULL) throw ReadError();
    144148    while (!feof(fh)) {
    145149        char buf[4096];
  • utils.cc

     
    2525
    2626#include <stdio.h> // for sprintf/snprintf
    2727#include <cstdlib>
     28#include <cstring>
     29#include "safesysstat.h"
    2830
    2931#include <string>
    3032
    3133using namespace std;
    3234
     35#ifdef __WIN32__
     36#include "safewindows.h"
     37#endif
     38
    3339// This ought to be enough for any of the conversions below.
    3440#define BUFSIZE 100
    3541
     
    3945    int len = SNPRINTF(buf, BUFSIZE, (FMT), val);\
    4046    if (len == -1 || len > BUFSIZE) return string(buf, BUFSIZE);\
    4147    return string(buf, len);
     48/// Overload of system() taking a C++ string: forwards command.c_str() to the C library system() and returns its result.
     49inline int system(const string &command) { return system(command.c_str()); }
     50
     51// Duplicated from omindex.cc - FIXME
     52static string
     53shell_protect(const string & file)
     54{
     55    string safefile = file;
     56#ifdef __WIN32__
     57    bool need_to_quote = false;
     58    for (string::iterator i = safefile.begin(); i != safefile.end(); ++i) {
     59        unsigned char ch = *i;
     60        if (!isalnum(ch) && ch < 128) {
     61            if (ch == '/') {
     62                // Convert Unix path separators to backslashes.  C library
     63                // functions understand "/" in paths, but external commands
     64                // generally don't, and also may interpret a leading '/' as
     65                // introducing a command line option.
     66                *i = '\\';
     67            } else if (ch == ' ') {
     68                need_to_quote = true;
     69            } else if (ch < 32 || strchr("<>\"|*?", ch)) {
     70                // Check for invalid characters in the filename.
     71                string m("Invalid character '");
     72                m += ch;
     73                m += "' in filename \"";
     74                m += file;
     75                m += '"';
     76                throw m;
     77            }
     78        }
     79    }
     80    if (safefile[0] == '-') {
     81        // If the filename starts with a '-', protect it from being treated as
     82        // an option by prepending ".\".
     83        safefile.insert(0, ".\\");
     84    }
     85    if (need_to_quote) {
     86        safefile.insert(0, "\"");
     87        safefile += '"';
     88    }
     89#else
     90    string::size_type p = 0;
     91    if (!safefile.empty() && safefile[0] == '-') {
     92        // If the filename starts with a '-', protect it from being treated as
     93        // an option by prepending "./".
     94        safefile.insert(0, "./");
     95        p = 2;
     96    }
     97    while (p < safefile.size()) {
     98        // Don't escape some safe characters which are common in filenames.
     99        unsigned char ch = safefile[p];
     100        if (!isalnum(ch) && strchr("/._-", ch) == NULL) {
     101            safefile.insert(p, "\\");
     102            ++p;
     103        }
     104        ++p;
     105    }
     106#endif
     107    return safefile;
     108}
     109
     110/// Remove a directory and contents.
     111void
     112rm_rf(const string &filename)
     113{
     114    // Check filename exists and is actually a directory
     115    struct stat sb;
     116    if (stat(filename.c_str(), &sb) != 0 || !S_ISDIR(sb.st_mode)) return;
     117
     118    string safefile = shell_protect(filename);
     119#ifdef __WIN32__
     120# if 1
     121    static int win95 = -1;
     122    if (win95 == -1) {
     123        OSVERSIONINFO info;
     124        memset(&info, 0, sizeof(OSVERSIONINFO));
     125        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
     126        if (GetVersionEx(&info)) {
     127            win95 = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
     128        }
     129    }
     130
     131    if (win95) {
     132        // for 95 like systems:
     133        system("deltree /y \"" + safefile + "\"");
     134    } else {
     135        // for NT like systems:
     136        system("rd /s /q \"" + safefile + "\"");
     137    }
     138# else
     139    safefile.append("\0", 2);
     140    SHFILEOPSTRUCT shfo;
     141    memset((void*)&shfo, 0, sizeof(shfo));
     142    shfo.hwnd = 0;
     143    shfo.wFunc = FO_DELETE;
     144    shfo.pFrom = safefile.data();
     145    shfo.fFlags = FOF_NOCONFIRMATION|FOF_NOERRORUI|FOF_SILENT;
     146    (void)SHFileOperation(&shfo);
     147# endif
     148#else
     149    system("rm -rf " + safefile);
     150#endif
     151}
    42152#else
    43153#define CONVERT_TO_STRING(FMT) \
    44154    char buf[BUFSIZE];\
  • ChangeLog

     
     12006-08-22 09:30:12 Reini Urban <reinhard.urban@avl.com>
     2
     3        omega-0.9.6c:
     4        * omindex.cc: Fix wrong timestamp comparison in cache logic
     5        * scriptindex.cc: Add lastmod and size records and values.
     6        * excel2text, outlook2text.in: New scripts
     7
     82006-08-18 15:13:32 Reini Urban <reinhard.urban@avl.com>
     9
     10        omega-0.9.6b:
     11        * omindex.cc: Add HAVE_UNRAR, HAVE_MSGCONVERT, HAVE_READPST checks.
     12       
     132006-08-17 18:06:26 Reini Urban <reinhard.urban@avl.com>
     14
     15        omega-0.9.6a:
     16        * omindex.cc: Added cached virtual directories (zip, msg, pst, ...).
     17        Consistently log stderr to /var/log/omega/omindex-error.log.
     18
    119Wed Apr 20 07:00:56 GMT 2011  Olly Betts <olly@survex.com>
    220
    321        * NEWS: Fix typo; clarify wording.
  • scriptindex.cc

     
    44 * Copyright 2001 Sam Liddicott
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
     7 * Copyright 2006,2007 AVL List GesmbH
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
     
    3940#include <cstdio>
    4041#include <ctime>
    4142#include "safeunistd.h"
     43#include <sys/stat.h>
    4244
    4345#include "commonhelp.h"
    4446#include "hashterm.h"
    4547#include "loadfile.h"
    4648#include "myhtmlparse.h"
     49#include "str.h"
    4750#include "stringutils.h"
    4851#include "utf8truncate.h"
    4952#include "utils.h"
     53#include "values.h"
    5054
    5155#include "gnu_getopt.h"
    5256
     
    431435{
    432436    string line;
    433437    size_t line_no = 0;
     438    time_t last_mod = 0;
     439    long   file_size = 0;
     440
     441    if (strcmp(fname,"<stdin>") != 0) {
     442        struct stat statbuf;
     443        stat(fname, &statbuf);
     444        if (! statbuf.st_size) {
     445            cout << "Empty \"" << fname << "\" - skipping\n";
     446            return false;
     447        }
     448        file_size = statbuf.st_size;
     449        last_mod = statbuf.st_mtime;
     450    }
    434451    while (!stream.eof() && getline(stream, line)) {
    435452        ++line_no;
    436453        Xapian::Document doc;
     
    677694            for (i = fields.begin(); i != fields.end(); ++i) {
    678695                list<string>::const_iterator j;
    679696                for (j = i->second.begin(); j != i->second.end(); j++) {
     697                    if (i->first == "lastmod")  last_mod = 0;
     698                    if (i->first == "size")     file_size = 0;
    680699                    data += i->first;
    681700                    data += '=';
    682701                    data += *j;
    683702                    data += '\n';
    684703                }
    685704            }
     705            // provide some extra fields if not already provided by the script
     706            if (last_mod) {        // if indexed per filename
     707                data += "lastmod="+str(last_mod)+'\n';
     708                doc.add_value(VALUE_LASTMOD, int_to_binary_string((uint32_t)last_mod));
     709            }
     710            if (file_size) {        // if indexed per filename
     711                data += "size="+str(file_size)+'\n';
     712                doc.add_value(VALUE_SIZE, Xapian::sortable_serialise(file_size));
     713            }
    686714
    687715            // Put the data in the document
    688716            doc.set_data(data);
  • excel2text

     
     1#! /bin/sh
     2# strip numbers, to stdout
     3xls2csv -q0 "$1" | sed -re's/[0123456789.]+,//g'
  • mimeexplode

     
     1#!/usr/bin/perl -w
     2
     3=head1 NAME
     4
     5mimeexplode - explode one or more MIME messages
     6
     7=head1 SYNOPSIS
     8
     9    mimeexplode [-d <dir>] <mime-msg-file> <mime-msg-file> ...
     10
     11    someprocess | mimeexplode -
     12
     13=head1 DESCRIPTION
     14
     15Takes one or more files from the command line that contain MIME
     16messages, and explodes their contents out into subdirectories
     17of the current working directory.  The subdirectories are
     18just called C<msg0>, C<msg1>, C<msg2>, etc.  Existing directories are
     19skipped over.
     20
     21The message information is output to the stdout, like this:
     22
     23    Message: msg3 (inputfile1.msg)
     24        Part: msg3/filename-1.dat (text/plain)
     25        Part: msg3/filename-2.dat (text/plain)
     26    Message: msg5 (input-file2.msg)
     27        Part: msg5/dir.gif (image/gif)
     28        Part: msg5/face.jpg (image/jpeg)
     29    Message: msg6 (infile3)
     30        Part: msg6/filename-1.dat (text/plain)
     31
     32This was written as an example of the MIME:: modules in the
     33MIME-parser package I wrote.  It may prove useful as a quick-and-dirty
     34way of splitting a MIME message if you need to decode something, and
     35you don't have a MIME mail reader on hand.
     36
     37=head1 COMMAND LINE OPTIONS
     38
     39-d outdir
     40
     41=head1 AUTHOR
     42
     43Eryq C<eryq@zeegee.com>, in a big hurry...
     44Reini Urban C<rurban@x-ray.at>: -d option to always explode into the same dir
     45
     46=cut
     47
     48#BEGIN { unshift @INC, ".." }    # to test MIME:: stuff before installing it!
     49
     50require 5.001;
     51
     52use strict;
     53use vars;
     54
     55use MIME::Parser;
     56use Getopt::Std;
     57my %opts;
     58my $outbase = '';
     59my $postfix = '';
     60
     61#------------------------------------------------------------
     62# make_msg - make and return the name of a msgXXX directory
     63#------------------------------------------------------------
     64
     65#ignored
     66#sub make_msg {
     67#    while (-d "msg$Msgno") {
     68#       ++$Msgno;
     69#       die "self-imposed limit reached" if $Msgno == 256;
     70#    }
     71#   mkdir "msg$Msgno",0755 or die "couldn't make msg$Msgno: $!";
     72#    "msg$Msgno";
     73#}
     74
     75#------------------------------------------------------------
     76# dump_entity - dump an entity's file info
     77#------------------------------------------------------------
     78sub dump_entity {
     79    my $ent = shift;
     80    my @parts = $ent->parts;
     81
     82    if (@parts) {        # multipart...
     83        map { dump_entity($_) } @parts;
     84    }
     85    else {               # single part...
     86        print "    Part: ", $ent->bodyhandle->path,
     87              " (", scalar($ent->head->mime_type), ")\n";
     88    }
     89}
     90
     91#------------------------------------------------------------
     92# main
     93#------------------------------------------------------------
     94sub main {
     95    my $file;
     96    my $entity;
     97
     98    # make sure the same message gets exploded into the same dir
     99    getopts('d:', \%opts);
     100    $outbase = $opts{d} ? $opts{d} : "msg0";
     101    my $outdir = $outbase;
     102
     103    # Go through messages:
     104    @ARGV or unshift @ARGV, "-";
     105    while (defined($file = shift @ARGV)) {
     106
     107      # Sanity:
     108      (-d $outdir) or mkdir "$outdir",0755;
     109      (-w "$outdir") or die "cwd $outdir not writable!";
     110      #my $msgdir = make_msg();
     111      #print "Message: $msgdir ($file)\n";
     112
     113      # Create a new parser object:
     114      my $parser = new MIME::Parser;
     115      ### $parser->parse_nested_messages('REPLACE');
     116
     117      # Optional: set up parameters that will affect how it extracts
     118      #   documents from the input stream:
     119      $parser->output_dir($outdir);
     120
     121      # Parse an input stream:
     122      open FILE, $file or die "couldn't open $file";
     123      $entity = $parser->read(\*FILE) or
     124        print STDERR "Couldn't parse MIME in $file; continuing...\n";
     125      close FILE;
     126
     127      # Congratulations: you now have a (possibly multipart) MIME entity!
     128      dump_entity($entity) if $entity;
     129      ### $entity->dump_skeleton if $entity;
     130
     131      $postfix++;
     132      $outdir = $outbase.$postfix;
     133    }
     134    1;
     135}
     136
     137exit (&main ? 0 : -1);
     138#------------------------------------------------------------
     1391;
     140
  • msgconvert.pl

     
     1#!/usr/bin/perl -w
     2#
     3# msgconvert.pl:
     4#
     5# Convert .MSG files (made by Outlook (Express)) to multipart MIME messages.
     6#
     7# Copyright 2002, 2004, 2006 Matijs van Zuijlen
     8#
     9# This program is free software; you can redistribute it and/or modify it
     10# under the terms of the GNU General Public License as published by the
     11# Free Software Foundation; either version 2 of the License, or (at your
     12# option) any later version.
     13#
     14# This program is distributed in the hope that it will be useful, but
     15# WITHOUT ANY WARRANTY; without even the implied warranty of
     16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
     17# Public License for more details.
     18#
     19# CHANGES:
     20# 20020715  Recognize new items 'Cc', mime type of attachment, long
     21#           filename of attachment, and full headers. Attachments turn out
     22#           to be numbered, so a regexp is now used to recognize label of
     23#           items that are attachments.
     24# 20020831  long file name will definitely be used if present. Full headers
     25#           and mime type information are used when present. Created
     26#           generic system for specifying known items to be skipped.
     27#           Unexpected contents is never reason to bail out anymore. Added
     28#           support for usage message and option processing (--verbose).
     29# 20040104  Handle address data slightly better, make From line less fake,
     30#           make $verbose and $skippable_entries global vars, handle HTML
     31#           variant of body text if present (though not optimally).
     32# 20040214  Fix typos and incorrect comments.
     33# 20040307  - Complete rewrite: All functional parts are now in the package
     34#             MSGParser;
     35#           - Creation of MIME::Entity object is delayed until the output
     36#             routines, which means all data is known; This means I can
     37#             create a multipart/alternative body.
     38#           - Item names are parsed (thanks to bfrederi@alumni.sfu.ca for
     39#             the information).
     40# 20040514  Check if $self->{HEAD} actually exists before trying to add its
     41#           contents to the output Mime object's header data.
     42#           (Bug reported by Thomas Ng).
     43#           Don't produce multipart messages if not needed.
     44#           (Bug reported by Justin B. Scout).
     45# 20040529  Correctly format OLEDATE.
     46# 20040530  - Extract date from property 0047 (thanks, Marc Goodman).
     47#           - Use address data to make To: and Cc: lines complete
     48#           - Use the in-reply-to property
     49#           - More unknown properties named.
     50#           - Found another property containing an SMTP address.
     51#           - Put non-SMTP type addresses back in output.
     52# 20040825  Replace 'our' to declare globals with 'use vars'. This means
     53#           the globals are now properly scoped inside the package and not
     54#           the file.
     55#           This also fixes the bug that this program did not work on perl
     56#           versions below 5.6. (Bug reported by Tim Gustafson)
     57# 20060218  More sensible encoding warnings.
     58# 20060219  Move OLE parsing to main program.
     59#           Parse nested MSG files (Bug reported by Christof Lukas).
     60# 20060225  Simplify code.
     61#
     62
     63#
     64# Import modules.
     65#
     66package MSGParser;
     67use strict;
     68use OLE::Storage_Lite;
     69use MIME::Entity;
     70use MIME::Parser;
     71use Date::Format;
     72use POSIX qw(mktime);
     73use constant DIR_TYPE => 1;
     74use constant FILE_TYPE => 2;
     75
     76use vars qw($skipproperties $skipheaders);
     77#
     78# Descriptions partially based on mapitags.h
     79#
     80$skipproperties = {
     81  # Envelope properties
     82  '000B' => "Conversation key?",
     83  '001A' => "Type of message",
     84  '003B' => "Sender address variant",
     85  '003D' => "Contains 'Re: '",
     86  '003F' => "'recieved by' id",
     87  '0040' => "'recieved by' name",
     88  '0041' => "Sender variant address id",
     89  '0042' => "Sender variant name",
     90  '0043' => "'recieved representing' id",
     91  '0044' => "'recieved representing' name",
     92  '0046' => "Read receipt address id",
     93  '0051' => "'recieved by' search key",
     94  '0052' => "'recieved representing' search key",
     95  '0053' => "Read receipt search key",
     96  '0064' => "Sender variant address type",
     97  '0065' => "Sender variant address",
     98  '0070' => "Conversation topic",
     99  '0071' => "Conversation index",
     100  '0075' => "'recieved by' address type",
     101  '0076' => "'recieved by' email address",
     102  '0077' => "'recieved representing' address type",
     103  '0078' => "'recieved representing' email address",
     104  '007F' => "something like a message id",
     105  # Recipient properties
     106  '0C19' => "Reply address variant",
     107  '0C1D' => "Reply address variant",
     108  '0C1E' => "Reply address type",
     109  # Non-transmittable properties
     110  '0E02' => "?Should BCC be displayed",
     111  '0E0A' => "sent mail id",
     112  '0E1D' => "Subject w/o Re",
     113  '0E27' => "64 bytes: Unknown",
     114  '0FF6' => "Index",
     115  '0FF9' => "Index",
     116  '0FFF' => "Address variant",
     117  # Content properties
     118  '1008' => "Summary or something",
     119  '1009' => "RTF Compressed",
     120  # 'Common property'
     121  '3001' => "Display name",
     122  '3002' => "Address Type",
     123  '300B' => "'Search key'",
     124  # Attachment properties
     125  '3702' => "Attachment encoding",
     126  '3703' => "Attachment extension",
     127  '3709' => "'Attachment rendering'", # Maybe an icon or something?
     128  '3713' => "Icon URL?",
     129  # 'Mail user'
     130  '3A20' => "Address variant",
     131  # 3900 -- 39FF: 'Address book'
     132  '39FF' => "7 bit display name",
     133  # 'Display table properties'
     134  '3FF8' => "Routing data?",
     135  '3FF9' => "Routing data?",
     136  '3FFA' => "Routing data?",
     137  '3FFB' => "Routing data?",
     138  # 'Transport-defined envelope property'
     139  '4029' => "Sender variant address type",
     140  '402A' => "Sender variant address",
     141  '402B' => "Sender variant name",
     142  '5FF6' => "Recipient name",
     143  '5FF7' => "Recipient address variant",
     144  # 'Provider-defined internal non-transmittable property'
     145  '6740' => "Unknown, binary data",
     146  # User defined id's
     147  '8000' => "Content Class",
     148  '8002' => "Unknown, binary data",
     149};
     150
     151$skipheaders = {
     152  "MIME-Version" => 1,
     153  "Content-Type" => 1,
     154  "Content-Transfer-Encoding" => 1,
     155  "X-Mailer" => 1,
     156  "X-Msgconvert" => 1,
     157  "X-MS-Tnef-Correlator" => 1,
     158  "X-MS-Has-Attach" => 1,
     159};
     160
     161use constant ENCODING_UNICODE => '001F';
     162use constant KNOWN_ENCODINGS => {
     163    '000D' => 'Directory',
     164    '001F' => 'Unicode',
     165    '001E' => 'Ascii?',
     166    '0102' => 'Binary',
     167};
     168
     169use constant MAP_ATTACHMENT_FILE => {
     170  '3701' => ["DATA",        0], # Data
     171  '3704' => ["SHORTNAME",   1], # Short file name
     172  '3707' => ["LONGNAME",    1], # Long file name
     173  '370E' => ["MIMETYPE",    1], # mime type
     174  '3716' => ["DISPOSITION", 1], # disposition
     175};
     176
     177use constant MAP_SUBITEM_FILE => {
     178  '1000' => ["BODY_PLAIN",      0], # Body
     179  '1013' => ["BODY_HTML",       0], # HTML Version of body
     180  '0037' => ["SUBJECT",         1], # Subject
     181  '0047' => ["SUBMISSION_ID",   1], # Seems to contain the date
     182  '007D' => ["HEAD",            1], # Full headers
     183  '0C1A' => ["FROM",            1], # Reply-To: Name
     184  '0C1E' => ["FROM_ADDR_TYPE",  1], # From: Address type
     185  '0C1F' => ["FROM_ADDR",       1], # Reply-To: Address
     186  '0E04' => ["TO",              1], # To: Names
     187  '0E03' => ["CC",              1], # Cc: Names
     188  '1035' => ["MESSAGEID",       1], # Message-Id
     189  '1042' => ["INREPLYTO",       1], # In reply to Message-Id
     190};
     191
     192use constant MAP_ADDRESSITEM_FILE => {
     193  '3001' => ["NAME",            1], # Real name
     194  '3002' => ["TYPE",            1], # Address type
     195  '403D' => ["TYPE",            1], # Address type
     196  '3003' => ["ADDRESS",         1], # Address
     197  '403E' => ["ADDRESS",         1], # Address
     198  '39FE' => ["SMTPADDRESS",     1], # SMTP Address variant
     199};
     200
     201#
     202# Main body of module
     203#
     204
     205sub new {
     206  my $that = shift;
     207  my $class = ref $that || $that;
     208
     209  my $self = {
     210    ATTACHMENTS => [],
     211    ADDRESSES => [],
     212    VERBOSE => 0,
     213    HAS_UNICODE => 0,
     214    FROM_ADDR_TYPE => "",
     215  };
     216  bless $self, $class;
     217}
     218
     219#
     220# Main sub: parse the PPS tree, and return
     221#
     222sub parse {
     223  my $self = shift;
     224  my $PPS = shift or die "Internal error: No PPS tree";
     225  $self->_RootDir($PPS);
     226}
     227
     228sub mime_object {
     229  my $self = shift;
     230
     231  my $bodymime;
     232  my $mime;
     233
     234  if ($self->_IsMultiPart) {
     235    # Construct a multipart message object
     236
     237    $mime = MIME::Entity->build(Type => "multipart/mixed");
     238
     239    # Set the entity that we'll save the body parts to. If there's more than
     240    # one part, it's a new entity, otherwise, it's the main $mime object.
     241    if ($self->{BODY_HTML} and $self->{BODY_PLAIN}) {
     242      $bodymime = MIME::Entity->build(
     243        Type => "multipart/alternative",
     244        Encoding => "8bit",
     245      );
     246      $mime->add_part($bodymime);
     247    } else {
     248      $bodymime = $mime;
     249    }
     250    if ($self->{BODY_PLAIN}) {
     251      $self->_SaveAttachment($bodymime, {
     252        MIMETYPE => 'text/plain; charset=ISO-8859-1',
     253        ENCODING => '8bit',
     254        DATA => $self->{BODY_PLAIN},
     255        DISPOSITION => 'inline',
     256      });
     257    }
     258    if ($self->{BODY_HTML}) {
     259      $self->_SaveAttachment($bodymime, {
     260        MIMETYPE => 'text/html',
     261        ENCODING => '8bit',
     262        DATA => $self->{BODY_HTML},
     263        DISPOSITION => 'inline',
     264      });
     265    }
     266    foreach my $att (@{$self->{ATTACHMENTS}}) {
     267      $self->_SaveAttachment($mime, $att);
     268    }
     269  } elsif ($self->{BODY_PLAIN}) {
     270    # Construct a single part message object with a plain text body
     271    $mime = MIME::Entity->build(
     272      Type => "text/plain",
     273      Data => $self->{BODY_PLAIN}
     274    );
     275  } elsif ($self->{BODY_HTML}) {
     276    # Construct a single part message object with an HTML body
     277    $mime = MIME::Entity->build(
     278      Type => "text/html",
     279      Data => $self->{BODY_HTML}
     280    );
     281  }
     282
     283  $self->_CopyHeaderData($mime);
     284
     285  $self->_SetHeaderFields($mime);
     286
     287  return $mime;
     288}
     289
     290# Actually output the message in mbox format
     291sub print {
     292  my $self = shift;
     293
     294  my $mime = $self->mime_object;
     295
     296  # Construct From line from whatever we know.
     297  my $string = "";
     298  $string = (
     299    $self->{FROM_ADDR_TYPE} eq "SMTP" ?
     300    $self->{FROM_ADDR} :
     301    'someone@somewhere'
     302  );
     303  $string =~ s/\n//g;
     304
     305  # The date used here is not really important.
     306  print "From ", $string, " ", scalar localtime, "\n";
     307  $mime->print(\*STDOUT);
     308  print "\n";
     309}
     310
     311sub set_verbosity {
     312  my ($self, $verbosity) = @_;
     313  defined $verbosity or die "Internal error: no verbosity level";
     314  $self->{VERBOSE} = $verbosity;
     315}
     316
     317#
     318# Below are functions that walk the PPS tree. The *Dir functions handle
     319# processing the directory nodes of the tree (mainly, iterating over the
     320# children), whereas the *Item functions handle processing the items in the
     321# directory (if such an item is itself a directory, it will in turn be
     322# processed by the relevant *Dir function).
     323#
     324
     325#
     326# RootItem: Check Root Entry, parse sub-entries.
     327# The OLE file consists of a single entry called Root Entry, which has
     328# several children. These children are parsed in the sub SubItem.
     329#
     330sub _RootDir {
     331  my ($self, $PPS) = @_;
     332
     333  foreach my $child (@{$PPS->{Child}}) {
     334    $self->_SubItem($child);
     335  }
     336}
     337
     338sub _SubItem {
     339  my ($self, $PPS) = @_;
     340 
     341  if ($PPS->{Type} == DIR_TYPE) {
     342    $self->_SubItemDir($PPS);
     343  } elsif ($PPS->{Type} == FILE_TYPE) {
     344    $self->_SubItemFile($PPS);
     345  } else {
     346    warn "Unknown entry type: $PPS->{Type}";
     347  }
     348}
     349
     350sub _SubItemDir {
     351  my ($self, $PPS) = @_;
     352
     353  $self->_GetOLEDate($PPS);
     354
     355  my $name = $self->_GetName($PPS);
     356
     357  if ($name =~ /__recip_version1 0_ /) { # Address of one recipient
     358    $self->_AddressDir($PPS);
     359  } elsif ($name =~ '__attach_version1 0_ ') { # Attachment
     360    $self->_AttachmentDir($PPS);
     361  } else {
     362    $self->_UnknownDir($self->_GetName($PPS));
     363  }
     364}
     365
     366sub _SubItemFile {
     367  my ($self, $PPS) = @_;
     368
     369  my $name = $self->_GetName($PPS);
     370  my ($property, $encoding) = $self->_ParseItemName($name);
     371
     372  $self->_MapProperty($self, $PPS->{Data}, $property,
     373    MAP_SUBITEM_FILE) or $self->_UnknownFile($name);
     374}
     375
     376sub _AddressDir {
     377  my ($self, $PPS) = @_;
     378
     379  my $address = {
     380    NAME        => undef,
     381    ADDRESS     => undef,
     382    TYPE        => "",
     383  };
     384  foreach my $child (@{$PPS->{Child}}) {
     385    $self->_AddressItem($child, $address);
     386  }
     387  push @{$self->{ADDRESSES}}, $address;
     388}
     389
     390sub _AddressItem {
     391  my ($self, $PPS, $addr_info) = @_;
     392
     393  my $name = $self->_GetName($PPS);
     394
     395  # DIR Entries: There should be none.
     396  if ($PPS->{Type} == DIR_TYPE) {
     397    $self->_UnknownDir($name);
     398  } elsif ($PPS->{Type} == FILE_TYPE) {
     399    my ($property, $encoding) = $self->_ParseItemName($name);
     400    $self->_MapProperty($addr_info, $PPS->{Data}, $property,
     401      MAP_ADDRESSITEM_FILE) or $self->_UnknownFile($name);
     402  } else {
     403    warn "Unknown entry type: $PPS->{Type}";
     404  }
     405}
     406
     407sub _AttachmentDir {
     408  my ($self, $PPS) = @_;
     409
     410  my $attachment = {
     411    SHORTNAME   => undef,
     412    LONGNAME    => undef,
     413    MIMETYPE    => 'application/octet-stream',
     414    ENCODING    => 'base64',
     415    DISPOSITION => 'attachment',
     416    DATA        => undef
     417  };
     418  foreach my $child (@{$PPS->{Child}}) {
     419    $self->_AttachmentItem($child, $attachment);
     420  }
     421  push @{$self->{ATTACHMENTS}}, $attachment;
     422}
     423
     424sub _AttachmentItem {
     425  my ($self, $PPS, $att_info) = @_;
     426
     427  my $name = $self->_GetName($PPS);
     428
     429  my ($property, $encoding) = $self->_ParseItemName($name);
     430
     431  if ($PPS->{Type} == DIR_TYPE) {
     432
     433    if ($property eq '3701') {  # Nested MSG file
     434      my $msgp = new MSGParser();
     435      $msgp->parse($PPS);
     436      my $data = $msgp->mime_object->as_string;
     437      $att_info->{DATA} = $data;
     438      $att_info->{MIMETYPE} = 'message/rfc822';
     439      $att_info->{ENCODING} = '8bit';
     440    } else {
     441      $self->_UnknownDir($name);
     442    }
     443
     444  } elsif ($PPS->{Type} == FILE_TYPE) {
     445    $self->_MapProperty($att_info, $PPS->{Data}, $property,
     446      MAP_ATTACHMENT_FILE) or $self->_UnknownFile($name);
     447  } else {
     448    warn "Unknown entry type: $PPS->{Type}";
     449  }
     450}
     451
     452sub _MapProperty {
     453  my ($self, $hash, $data, $property, $map) = @_;
     454
     455  defined $property or return 0;
     456  my $arr = $map->{$property} or return 0;
     457
     458  $arr->[1] and $data =~ s/\000//g;
     459  $hash->{$arr->[0]} = $data;
     460
     461  return 1;
     462}
     463
     464sub _UnknownDir {
     465  my ($self, $name) = @_;
     466
     467  if ($name eq '__nameid_version1 0') {
     468    $self->{VERBOSE}
     469      and warn "Skipping DIR entry $name (Introductory stuff)\n";
     470    return;
     471  }
     472  warn "Unknown DIR entry $name\n";
     473}
     474
     475sub _UnknownFile {
     476  my ($self, $name) = @_;
     477
     478  if ($name eq '__properties_version1 0') {
     479    $self->{VERBOSE}
     480      and warn "Skipping FILE entry $name (Properties)\n";
     481    return;
     482  }
     483
     484  my ($property, $encoding) = $self->_ParseItemName($name);
     485  unless (defined $property) {
     486    warn "Unknown FILE entry $name\n";
     487    return;
     488  }
     489  if ($skipproperties->{$property}) {
     490    $self->{VERBOSE}
     491      and warn "Skipping property $property ($skipproperties->{$property})\n";
     492    return;
     493  } elsif ($property =~ /^80/) {
     494    $self->{VERBOSE}
     495      and warn "Skipping property $property (user-defined property)\n";
     496    return;
     497  } else {
     498    warn "Unknown property $property\n";
     499    return;
     500  }
     501}
     502
     503#
     504# Helper functions
     505#
     506
     507sub _GetName {
     508  my ($self, $PPS) = @_;
     509  return $self->_NormalizeWhiteSpace(OLE::Storage_Lite::Ucs2Asc($PPS->{Name}));
     510}
     511
     512sub _NormalizeWhiteSpace {
     513  my ($self, $name) = @_;
     514  $name =~ s/\W/ /g;
     515  return $name;
     516}
     517
     518sub _GetOLEDate {
     519  my ($self, $PPS) = @_;
     520  unless (defined ($self->{OLEDATE})) {
     521    # Make Date
     522    my $datearr;
     523    $datearr = $PPS->{Time2nd};
     524    $datearr = $PPS->{Time1st} unless($datearr);
     525    $self->{OLEDATE} = $self->_FormatDate($datearr) if $datearr;
     526  }
     527}
     528
     529sub _FormatDate {
     530  my ($self, $datearr) = @_;
     531
     532  # TODO: This is a little convoluted. Directly using strftime didn't seem
     533  # to work.
     534  my $datetime = mktime(@$datearr);
     535  return time2str("%a, %d %h %Y %X %z", $datetime);
     536}
     537
     538# If we didn't get the date from the original header data, we may be able
     539# to get it from the SUBMISSION_ID:
     540# It seems to have the format of a semicolon-separated list of key=value
     541# pairs. The key l has a value with the format:
     542# <SERVER>-<DATETIME>Z-<NUMBER>, where DATETIME is the date and time in
     543# the format YYMMDDHHMMSS.
     544sub _SubmissionIdDate {
     545  my $self = shift;
     546
     547  my $submission_id = $self->{SUBMISSION_ID} or return undef;
     548  $submission_id =~ m/l=.*-(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)Z-.*/
     549    or return undef;
     550  my $year = $1;
     551  $year += 100 if $year < 20;
     552  return $self->_FormatDate([$6,$5,$4,$3,$2-1,$year]);
     553}
     554
     555sub _ParseItemName {
     556  my ($self, $name) = @_;
     557
     558  if ($name =~ /^__substg1 0_(....)(....)$/) {
     559    my ($property, $encoding) = ($1, $2);
     560    if ($encoding eq ENCODING_UNICODE and not ($self->{HAS_UNICODE})) {
     561      warn "This MSG file contains Unicode fields."
     562        . " This is currently unsupported.\n";
     563      $self->{HAS_UNICODE} = 1;
     564    } elsif (not (KNOWN_ENCODINGS()->{$encoding})) {
     565      warn "Unknown encoding $encoding. Results may be strange or wrong.\n";
     566    }
     567    return ($property, $encoding);
     568  } else {
     569    return (undef, undef);
     570  }
     571}
     572
     573sub _SaveAttachment {
     574  my ($self, $mime, $att) = @_;
     575
     576  my $ent = $mime->attach(
     577    Type => $att->{MIMETYPE},
     578    Encoding => $att->{ENCODING},
     579    Data => [],
     580    Filename => ($att->{LONGNAME} ? $att->{LONGNAME} : $att->{SHORTNAME}),
     581    Disposition => $att->{DISPOSITION}
     582  );
     583
     584  my $handle;
     585  if ($handle = $ent->open("w")) {
     586    $handle->print($att->{DATA});
     587    $handle->close;
     588  } else {
     589    warn "Could not write data!";
     590  }
     591}
     592
     593sub _SetAddressPart {
     594  my ($self, $adrname, $partname, $data) = @_;
     595  # NOTE(review): no caller in this file; new() sets ADDRESSES to an array ref.
     596  my $address = $self->{ADDRESSES}->{$adrname};
     597  $data =~ s/\000//g;
     598  #warn "Processing address data part $partname : $data\n";
     599  if (defined ($address->{$partname})) {
     600    if ($address->{$partname} eq $data) {
     601      warn "Skipping duplicate but identical address information for"
     602      . " $partname\n" if $self->{VERBOSE};
     603    } else {
     604      warn "Address information $partname inconsistent:\n";
     605      warn "    Original data: $address->{$partname}\n";
     606      warn "    New data: $data\n";
     607    }
     608  } else {
     609    $address->{$partname} = $data;
     610  }
     611}
     612
     613# Set header fields
     614sub _AddHeaderField {
     615  my ($self, $mime, $fieldname, $value) = @_;
     616
     617  my $oldvalue = $mime->head->get($fieldname);
     618  return if $oldvalue;
     619  $mime->head->add($fieldname, $value) if $value;
     620}
     621
     622sub _Address {
     623  my ($self, $tag) = @_;
     624  my $name = $self->{$tag} || "";
     625  my $address = $self->{$tag . "_ADDR"} || "";
     626  return "$name <$address>";
     627}
     628
     629# Find SMTP addresses for the given list of names
     630sub _ExpandAddressList {
     631  my ($self, $names) = @_;
     632
     633  my $addresspool = $self->{ADDRESSES};
     634  my @namelist = split /; */, $names;
     635  my @result;
     636  name: foreach my $name (@namelist) {
     637    foreach my $address (@$addresspool) {
     638      if ($name eq $address->{NAME}) {
     639        my $addresstext = $address->{NAME} . " <";
     640        if (defined ($address->{SMTPADDRESS})) {
     641          $addresstext .= $address->{SMTPADDRESS};
     642        } elsif ($address->{TYPE} eq "SMTP") {
     643          $addresstext .= $address->{ADDRESS};
     644        }
     645        $addresstext .= ">";
     646        push @result, $addresstext;
     647        next name;
     648      }
     649    }
     650    push @result, $name;
     651  }
     652  return join ", ", @result;
     653}
     654
     655sub _ParseHead {
     656  my ($self, $data) = @_;
     657  defined $data or return undef;
     658  # Parse full header date if we got that.
     659  my $parser = new MIME::Parser();
     660  $parser->output_to_core(1);
     661  $parser->decode_headers(1);
     662  $data =~ s/^Microsoft Mail.*$/X-MSGConvert: yes/m;
     663  my $entity = $parser->parse_data($data)
     664    or warn "Couldn't parse full headers!";
     665  my $head = $entity->head;
     666  $head->unfold;
     667  return $head;
     668}
     669
     670# Find out if we need to construct a multipart message
     671sub _IsMultiPart {
     672  my $self = shift;
     673
     674  return (
     675    ($self->{BODY_HTML} and $self->{BODY_PLAIN})
     676      or @{$self->{ATTACHMENTS}}>0
     677  );
     678}
     679
     680# Copy original header data.
     681# Note: This should contain the Date: header.
     682sub _CopyHeaderData {
     683  my ($self, $mime) = @_;
     684
     685  my $head = $self->_ParseHead($self->{HEAD}) or return;
     686
     687  foreach my $tag (grep {!$skipheaders->{$_}} $head->tags) {
     688    foreach my $value ($head->get_all($tag)) {
     689      $mime->head->add($tag, $value);
     690    }
     691  }
     692}
     693
     694# Set header fields
     695sub _SetHeaderFields {
     696  my ($self, $mime) = @_;
     697
     698  # If we didn't get the date from the original header data, we may be able
     699  # to get it from the SUBMISSION_ID:
     700  $self->_AddHeaderField($mime, 'Date', $self->_SubmissionIdDate());
     701
     702  # Third and last chance to set the Date: header; this uses the date the
     703  # MSG file was saved.
     704  $self->_AddHeaderField($mime, 'Date', $self->{OLEDATE});
     705  $self->_AddHeaderField($mime, 'Subject', $self->{SUBJECT});
     706  $self->_AddHeaderField($mime, 'From', $self->_Address("FROM"));
     707  #$self->_AddHeaderField($mime, 'Reply-To', $self->_Address("REPLYTO"));
     708  $self->_AddHeaderField($mime, 'To', $self->_ExpandAddressList($self->{TO}));
     709  $self->_AddHeaderField($mime, 'Cc', $self->_ExpandAddressList($self->{CC}));
     710  $self->_AddHeaderField($mime, 'Message-Id', $self->{MESSAGEID});
     711  $self->_AddHeaderField($mime, 'In-Reply-To', $self->{INREPLYTO});
     712}
     713
     714package main;
     715use Getopt::Long;
     716use Pod::Usage;
     717
     718# Setup command line processing.
     719my $verbose = '';   # Make the parser report the entries it skips.
     720my $help = '';      # Print help message and exit.
     721GetOptions('verbose' => \$verbose, 'help|?' => \$help) or pod2usage(2);
     722pod2usage(1) if $help;
     723
     724# Get file name (only the first argument is used)
     725my $file = $ARGV[0];
     726defined $file or pod2usage(2);
     727warn "Will parse file: $file\n" if $verbose;
     728
     729# Load the MSG file (an OLE file) and fetch its PPS tree
     730my $Msg = OLE::Storage_Lite->new($file);
     731my $PPS = $Msg->getPpsTree(1);
     732$PPS or die "$file must be an OLE file";
     733
     734# Walk the PPS tree, then write the converted message to stdout
     735my $parser = new MSGParser();
     736$parser->set_verbosity(1) if $verbose;
     737$parser->parse($PPS);
     738$parser->print();
     739
     740#
     741# Usage info follows.
     742#
     743__END__
     744
     745=head1 NAME
     746
     747msgconvert.pl - Convert Outlook .msg files to mbox format
     748
     749=head1 SYNOPSIS
     750
     751msgconvert.pl [options] <file.msg>
     752
     753  Options:
     754    --verbose   be verbose
     755    --help      help message
     756
     757=head1 OPTIONS
     758
     759=over 8
     760
     761=item B<--verbose>
     762
     763    Print information about skipped parts of the .msg file.
     764
     765=item B<--help>
     766
     767    Print a brief help message.
     768
     =back
     
     769=head1 DESCRIPTION
     770
     771This program will output the message contained in file.msg in mbox format
     772on stdout. It will complain about unrecognized OLE parts on
     773stderr.
     774
     775=head1 BUGS
     776
     777Not all data that's in the .MSG file is converted. There simply are some
     778parts whose meaning escapes me. One of these must contain the date the
     779message was sent, for example. Formatting of text messages will also be
     780lost. YMMV.
     781
     782=cut
  • Makefile.am

     
    7878pkglibbin_PROGRAMS = omega
    7979dist_pkglibbin_SCRIPTS = outlookmsg2html
    8080bin_PROGRAMS = omindex scriptindex
     81dist_libexec_SCRIPTS = outlook2text excel2text mimeexplode msgconvert.pl
    8182dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
    8283
    8384check_PROGRAMS = htmlparsetest md5test utf8converttest
     
    160161MAINTAINERCLEANFILES = $(dist_man_MANS)
    161162endif
    162163
     164CLEANFILES = outlook2text
     165
     166outlook2text: $(srcdir)/outlook2text.in Makefile
     167        sed "s,@MSGCONVERT@,$(MSGCONVERT),;s,@MIMEEXPLODE@,$(pkglibbindir)/mimeexplode," $(srcdir)/outlook2text.in > $@
     168
    163169if DOCUMENTATION_RULES
    164170omindex.1: omindex$(EXEEXT) makemanpage
    165171        ./makemanpage ./omindex $(srcdir)/omindex.cc omindex.1