Ticket #290: office2007.patch
File office2007.patch, 6.5 KB (added 15 years ago) |
---|
-
.cc
old new 72 72 73 73 static bool skip_duplicates = false; 74 74 static bool follow_symlinks = false; 75 static bool ignore_time = false; 75 76 static string dbpath; 76 77 static string root; 77 78 static string indexroot; … … 194 195 195 196 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 196 197 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 197 198 if (skip_duplicates && db.term_exists(urlterm)) { 199 cout << "duplicate. Ignored." << endl; 200 return; 201 } 202 198 { 199 // First find the docid with the urlterm. 200 Xapian::docid docid = 0; 201 Xapian::PostingIterator p = db.postlist_begin(urlterm); 202 if (p != db.postlist_end(urlterm)) { 203 docid = *p; 204 if (skip_duplicates) { 205 cout << "duplicate. Ignored." << endl; 206 return; 207 } 208 } 209 if (docid && !ignore_time) { 210 // Check the timestamp. 211 Xapian::Document doc = db.get_document(docid); 212 string value = doc.get_value(VALUE_LASTMOD); 213 time_t old_last_mod = binary_string_to_int(value); 214 if (old_last_mod >= last_mod) { 215 cout << "not newer. Ignored." 
<< endl; 216 if (docid < updated.size()) { 217 updated[docid] = true; 218 return; 219 }} 220 } 221 } 222 203 223 string md5; 204 224 if (mimetype == "text/html") { 205 225 string text; … … 351 371 cout << "\"" << cmd << "\" failed - skipping\n"; 352 372 return; 353 373 } 374 } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.wordprocessingml.")) { 375 string safefile = shell_protect(file); 376 string cmd = "unzip -p " + safefile + " word/document.xml"; 377 try { 378 XmlParser xmlparser; 379 xmlparser.parse_html(stdout_to_string(cmd)); 380 dump = xmlparser.dump; 381 } catch (ReadError) { 382 cout << "\"" << cmd << "\" failed - skipping\n"; 383 return; 384 } 385 } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.spreadsheetml.")) { 386 string safefile = shell_protect(file); 387 string cmd = "unzip -p " + safefile + " xl/sharedStrings.xml"; 388 try { 389 XmlParser xmlparser; 390 xmlparser.parse_html(stdout_to_string(cmd)); 391 dump = xmlparser.dump; 392 } catch (ReadError) { 393 cout << "\"" << cmd << "\" failed - skipping\n"; 394 return; 395 } 396 } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.presentationml.")) { 397 string safefile = shell_protect(file); 398 string cmd = "unzip -p " + safefile + " ppt/slides/slide*.xml"; 399 try { 400 XmlParser xmlparser; 401 xmlparser.parse_html(stdout_to_string(cmd)); 402 dump = xmlparser.dump; 403 } catch (ReadError) { 404 cout << "\"" << cmd << "\" failed - skipping\n"; 405 return; 406 } 354 407 } else if (mimetype == "application/vnd.wordperfect") { 355 408 // Looking at the source of wpd2html and wpd2text I think both output 356 409 // utf-8, but it's hard to be sure without sample Unicode .wpd files … … 362 415 cout << "\"" << cmd << "\" failed - skipping\n"; 363 416 return; 364 417 } 418 } else if (mimetype == "application/vnd.ms-outlook") { 419 string cmd = "msg2txt.pl " + shell_protect(file) + " | strings"; 420 try { 421 dump = 
stdout_to_string(cmd); 422 } catch (ReadError) { 423 cout << "\"" << cmd << "\" failed - skipping\n"; 424 return; 425 } 365 426 } else if (mimetype == "application/vnd.ms-works") { 366 427 // wps2text produces UTF-8 output from the sample files I've tested. 367 428 string cmd = "wps2text " + shell_protect(file); … … 677 738 { "depth-limit",required_argument, NULL, 'l' }, 678 739 { "follow", no_argument, NULL, 'f' }, 679 740 { "stemmer", required_argument, NULL, 's' }, 741 { "ignore-time",no_argument, NULL, 'i' }, 680 742 { 0, 0, NULL, 0 } 681 743 }; 682 744 … … 730 792 // Some other word processor formats: 731 793 mime_map["doc"] = "application/msword"; 732 794 mime_map["dot"] = "application/msword"; // Word template 795 mime_map["docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; //Word 2007 796 mime_map["dotx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; //Word 2007 template 733 797 mime_map["wpd"] = "application/vnd.wordperfect"; 734 798 mime_map["wps"] = "application/vnd.ms-works"; 735 799 mime_map["wpt"] = "application/vnd.ms-works"; // Works template … … 740 804 mime_map["xls"] = "application/vnd.ms-excel"; 741 805 mime_map["xlb"] = "application/vnd.ms-excel"; 742 806 mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template 807 mime_map["xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; //Excel 2007 808 mime_map["xltx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; //Excel 2007 template 743 809 mime_map["ppt"] = "application/vnd.ms-powerpoint"; 744 810 mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow 811 mime_map["pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007 812 mime_map["ppsx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007 slideshow 813 mime_map["potx"] = 
"application/vnd.openxmlformats-officedocument.presentationml.presentation"; //PowerPoint 2007 template 814 mime_map["msg"] = "application/vnd.ms-outlook"; //Outlook .msg 745 815 // Perl: 746 816 mime_map["pl"] = "text/x-perl"; 747 817 mime_map["pm"] = "text/x-perl"; … … 752 822 mime_map["djv"] = "image/vnd.djvu"; 753 823 mime_map["djvu"] = "image/vnd.djvu"; 754 824 755 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpf ", longopts, NULL)) != -1) {825 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpfi", longopts, NULL)) != -1) { 756 826 switch (getopt_ret) { 757 827 case 'h': { 758 828 cout << PROG_NAME" - "PROG_DESC"\n\n" … … 766 836 " -M, --mime-type additional MIME mapping ext:type\n" 767 837 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n" 768 838 " -f, --follow follow symbolic links\n" 839 " -i, --ignore-time ignore timestamp comparison\n" 769 840 " --overwrite create the database anew (the default is to update\n" 770 841 " if the database already exists)" << endl; 771 842 print_stemmer_help(" "); … … 795 866 case 'p': // don't delete unupdated documents 796 867 preserve_unupdated = true; 797 868 break; 869 case 'i': // --ignore-time: on updates parse the file again 870 ignore_time = true; 871 break; 798 872 case 'l': { // Set recursion limit 799 873 int arg = atoi(optarg); 800 874 if (arg < 0) arg = 0;