Ticket #290: office2007.patch

File office2007.patch, 6.5 KB (added by Frank J Bruzzaniti, 15 years ago)

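This patch adds support for the Office 2007 file formats .docx, .dotx, .xlsx, .xltx, .pptx, .potx and .ppsx. It also adds handling for Outlook .msg files and a new -i/--ignore-time option that skips the timestamp comparison when updating.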

  • .cc

    old new  
    7272
    7373static bool skip_duplicates = false;
    7474static bool follow_symlinks = false;
     75static bool ignore_time = false;
    7576static string dbpath;
    7677static string root;
    7778static string indexroot;
     
    194195
    195196    if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
    196197        urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
    197 
    198     if (skip_duplicates && db.term_exists(urlterm)) {
    199         cout << "duplicate. Ignored." << endl;
    200         return;
    201     }
    202 
     198      {
     199        // First find the docid with the urlterm.
     200        Xapian::docid docid = 0;
     201        Xapian::PostingIterator p = db.postlist_begin(urlterm);
     202        if (p != db.postlist_end(urlterm)) {
     203            docid = *p;
     204            if (skip_duplicates) {
     205                cout << "duplicate. Ignored." << endl;
     206                return;
     207            }
     208        }
     209        if (docid && !ignore_time) {
     210            // Check the timestamp.
     211            Xapian::Document doc = db.get_document(docid);
     212            string value = doc.get_value(VALUE_LASTMOD);
     213            time_t old_last_mod = binary_string_to_int(value);
     214            if (old_last_mod >= last_mod) {
     215                cout << "not newer. Ignored." << endl;
     216            if (docid < updated.size()) {
     217                updated[docid] = true;
     218                return;
     219            }}
     220        }
     221     }
     222   
    203223    string md5;
    204224    if (mimetype == "text/html") {
    205225        string text;
     
    351371            cout << "\"" << cmd << "\" failed - skipping\n";
    352372            return;
    353373        }
     374    } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.wordprocessingml.")) {
     375    string safefile = shell_protect(file);
     376    string cmd = "unzip -p " + safefile + " word/document.xml";
     377    try {
     378        XmlParser xmlparser;
     379        xmlparser.parse_html(stdout_to_string(cmd));
     380        dump = xmlparser.dump;
     381    } catch (ReadError) {
     382        cout << "\"" << cmd << "\" failed - skipping\n";
     383        return;
     384    }
     385    } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.spreadsheetml.")) {
     386    string safefile = shell_protect(file);
     387    string cmd = "unzip -p " + safefile + " xl/sharedStrings.xml";
     388    try {
     389        XmlParser xmlparser;
     390        xmlparser.parse_html(stdout_to_string(cmd));
     391        dump = xmlparser.dump;
     392    } catch (ReadError) {
     393        cout << "\"" << cmd << "\" failed - skipping\n";
     394        return;
     395    }
     396    } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.presentationml.")) {
     397    string safefile = shell_protect(file);
     398    string cmd = "unzip -p " + safefile + " ppt/slides/slide*.xml";
     399    try {
     400        XmlParser xmlparser;
     401        xmlparser.parse_html(stdout_to_string(cmd));
     402        dump = xmlparser.dump;
     403    } catch (ReadError) {
     404        cout << "\"" << cmd << "\" failed - skipping\n";
     405        return;
     406    }
    354407    } else if (mimetype == "application/vnd.wordperfect") {
    355408        // Looking at the source of wpd2html and wpd2text I think both output
    356409        // utf-8, but it's hard to be sure without sample Unicode .wpd files
     
    362415            cout << "\"" << cmd << "\" failed - skipping\n";
    363416            return;
    364417        }
     418    } else if (mimetype == "application/vnd.ms-outlook") {
     419    string cmd = "msg2txt.pl " + shell_protect(file) + " | strings";
     420    try {
     421        dump = stdout_to_string(cmd);
     422    } catch (ReadError) {
     423        cout << "\"" << cmd << "\" failed - skipping\n";
     424        return;
     425    }
    365426    } else if (mimetype == "application/vnd.ms-works") {
    366427        // wps2text produces UTF-8 output from the sample files I've tested.
    367428        string cmd = "wps2text " + shell_protect(file);
     
    677738        { "depth-limit",required_argument,      NULL, 'l' },
    678739        { "follow",     no_argument,            NULL, 'f' },
    679740        { "stemmer",    required_argument,      NULL, 's' },
     741        { "ignore-time",no_argument,            NULL, 'i' },
    680742        { 0, 0, NULL, 0 }
    681743    };
    682744
     
    730792    // Some other word processor formats:
    731793    mime_map["doc"] = "application/msword";
    732794    mime_map["dot"] = "application/msword"; // Word template
     795    mime_map["docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; //Word 2007
     796    mime_map["dotx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; //Word 2007 template
    733797    mime_map["wpd"] = "application/vnd.wordperfect";
    734798    mime_map["wps"] = "application/vnd.ms-works";
    735799    mime_map["wpt"] = "application/vnd.ms-works"; // Works template
     
    740804    mime_map["xls"] = "application/vnd.ms-excel";
    741805    mime_map["xlb"] = "application/vnd.ms-excel";
    742806    mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template
     807    mime_map["xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; //Excel 2007
     808    mime_map["xltx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; //Excel 2007 template
    743809    mime_map["ppt"] = "application/vnd.ms-powerpoint";
    744810    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
     811    mime_map["pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007
     812    mime_map["ppsx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007 slideshow
     813    mime_map["potx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007 template
     814    mime_map["msg"] = "application/vnd.ms-outlook"; //Outlook .msg
    745815    // Perl:
    746816    mime_map["pl"] = "text/x-perl";
    747817    mime_map["pm"] = "text/x-perl";
     
    752822    mime_map["djv"] = "image/vnd.djvu";
    753823    mime_map["djvu"] = "image/vnd.djvu";
    754824
    755     while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpf", longopts, NULL)) != -1) {
     825    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpfi", longopts, NULL)) != -1) {
    756826        switch (getopt_ret) {
    757827        case 'h': {
    758828            cout << PROG_NAME" - "PROG_DESC"\n\n"
     
    766836"  -M, --mime-type          additional MIME mapping ext:type\n"
    767837"  -l, --depth-limit=LIMIT  set recursion limit (0 = unlimited)\n"
    768838"  -f, --follow             follow symbolic links\n"
     839"  -i, --ignore-time        ignore timestamp comparison\n"
    769840"      --overwrite          create the database anew (the default is to update\n"
    770841"                           if the database already exists)" << endl;
    771842            print_stemmer_help("     ");
     
    795866        case 'p': // don't delete unupdated documents
    796867            preserve_unupdated = true;
    797868            break;
     869        case 'i': // --ignore-time: on updates parse the file again
     870            ignore_time = true;
     871            break;
    798872        case 'l': { // Set recursion limit
    799873            int arg = atoi(optarg);
    800874            if (arg < 0) arg = 0;