Ticket #282: omindex-assorted-enhancements.patch
File omindex-assorted-enhancements.patch, 14.0 KB (added 16 years ago)
-
omindex.cc
4 4 * Copyright 2001,2005 James Aylett 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts 7 * Copyright 2006 AVL List GesmbH 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 42 43 #include <xapian.h> 43 44 44 45 #include "commonhelp.h" 46 #include "configfile.h" 45 47 #include "diritor.h" 46 48 #include "hashterm.h" 47 49 #include "loadfile.h" … … 72 74 73 75 static bool skip_duplicates = false; 74 76 static bool follow_symlinks = false; 77 static bool ignore_time = false; 78 static bool nocleanup = false; 75 79 static string dbpath; 76 80 static string root; 77 81 static string indexroot; 78 82 static string baseurl; 83 static string error_log; 79 84 static Xapian::WritableDatabase db; 80 85 static Xapian::Stem stemmer("english"); 81 86 static Xapian::TermGenerator indexer; … … 111 116 return safefile; 112 117 } 113 118 119 inline string 120 run_cmd(const string & cmd) 121 { 122 return stdout_to_string(cmd + error_log); 123 } 124 114 125 static bool ensure_tmpdir() { 115 126 if (!tmpdir.empty()) return true; 116 127 … … 140 151 get_pdf_metainfo(const string & safefile, string &title, string &keywords) 141 152 { 142 153 try { 143 string pdfinfo = stdout_to_string("pdfinfo -enc UTF-8 " + safefile);154 string pdfinfo = run_cmd("pdfinfo -enc UTF-8 " + safefile); 144 155 145 156 string::size_type idx; 146 157 … … 195 206 if (urlterm.length() > MAX_SAFE_TERM_LENGTH) 196 207 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH); 197 208 198 if (skip_duplicates && db.term_exists(urlterm)) { 199 cout << "duplicate. Ignored." << endl; 200 return; 209 { 210 // First find the docid with the urlterm. 211 Xapian::docid docid = 0; 212 Xapian::PostingIterator p = db.postlist_begin(urlterm); 213 if (p != db.postlist_end(urlterm)) { 214 docid = *p; 215 if (skip_duplicates) { 216 cout << "duplicate. Ignored." 
<< endl; 217 return; 218 } 219 } 220 if (docid && !ignore_time) { 221 // Check the timestamp. 222 Xapian::Document doc = db.get_document(docid); 223 string value = doc.get_value(VALUE_LASTMOD); 224 time_t old_last_mod = binary_string_to_int(value); 225 if (old_last_mod >= last_mod) { 226 cout << "not newer. Ignored." << endl; 227 return; 228 } 229 } 201 230 } 202 231 203 232 string md5; … … 253 282 string safefile = shell_protect(file); 254 283 string cmd = "pdftotext -enc UTF-8 " + safefile + " -"; 255 284 try { 256 dump = stdout_to_string(cmd);285 dump = run_cmd(cmd); 257 286 } catch (ReadError) { 258 287 cout << "\"" << cmd << "\" failed - skipping\n"; 259 288 return; … … 276 305 string safetmp = shell_protect(tmpfile); 277 306 string cmd = "ps2pdf " + shell_protect(file) + " " + safetmp; 278 307 try { 279 (void) stdout_to_string(cmd);308 (void)run_cmd(cmd); 280 309 cmd = "pdftotext -enc UTF-8 " + safetmp + " -"; 281 dump = stdout_to_string(cmd);310 dump = run_cmd(cmd); 282 311 } catch (ReadError) { 283 312 cout << "\"" << cmd << "\" failed - skipping" << endl; 284 313 unlink(tmpfile.c_str()); … … 302 331 string cmd = "unzip -p " + safefile + " content.xml"; 303 332 try { 304 333 XmlParser xmlparser; 305 xmlparser.parse_html( stdout_to_string(cmd));334 xmlparser.parse_html(run_cmd(cmd)); 306 335 dump = xmlparser.dump; 307 336 } catch (ReadError) { 308 337 cout << "\"" << cmd << "\" failed - skipping\n"; … … 312 341 cmd = "unzip -p " + safefile + " meta.xml"; 313 342 try { 314 343 MetaXmlParser metaxmlparser; 315 metaxmlparser.parse_html( stdout_to_string(cmd));344 metaxmlparser.parse_html(run_cmd(cmd)); 316 345 title = metaxmlparser.title; 317 346 keywords = metaxmlparser.keywords; 318 347 sample = metaxmlparser.sample; … … 322 351 } else if (mimetype == "application/msword") { 323 352 string cmd = "antiword -mUTF-8.txt " + shell_protect(file); 324 353 try { 325 dump = stdout_to_string(cmd);354 dump = run_cmd(cmd); 326 355 } catch (ReadError) { 327 356 cout << 
"\"" << cmd << "\" failed - skipping\n"; 328 357 return; … … 330 359 } else if (mimetype == "application/vnd.ms-excel") { 331 360 string cmd = "xls2csv -q0 -dutf-8 " + shell_protect(file); 332 361 try { 333 dump = stdout_to_string(cmd);362 dump = run_cmd(cmd); 334 363 } catch (ReadError) { 335 364 cout << "\"" << cmd << "\" failed - skipping\n"; 336 365 return; … … 338 367 } else if (mimetype == "application/vnd.ms-powerpoint") { 339 368 string cmd = "catppt -dutf-8 " + shell_protect(file); 340 369 try { 341 dump = stdout_to_string(cmd);370 dump = run_cmd(cmd); 342 371 } catch (ReadError) { 343 372 cout << "\"" << cmd << "\" failed - skipping\n"; 344 373 return; … … 349 378 // as they don't seem to be at all well documented. 350 379 string cmd = "wpd2text " + shell_protect(file); 351 380 try { 352 dump = stdout_to_string(cmd);381 dump = run_cmd(cmd); 353 382 } catch (ReadError) { 354 383 cout << "\"" << cmd << "\" failed - skipping\n"; 355 384 return; … … 358 387 // wps2text produces UTF-8 output from the sample files I've tested. 359 388 string cmd = "wps2text " + shell_protect(file); 360 389 try { 361 dump = stdout_to_string(cmd);390 dump = run_cmd(cmd); 362 391 } catch (ReadError) { 363 392 cout << "\"" << cmd << "\" failed - skipping\n"; 364 393 return; … … 380 409 string cmd = "gzip -dc " + shell_protect(file); 381 410 try { 382 411 XmlParser xmlparser; 383 xmlparser.parse_html( stdout_to_string(cmd));412 xmlparser.parse_html(run_cmd(cmd)); 384 413 dump = xmlparser.dump; 385 414 } catch (ReadError) { 386 415 cout << "\"" << cmd << "\" failed - skipping\n"; … … 389 418 } else if (mimetype == "text/rtf") { 390 419 // The --text option unhelpfully converts all non-ASCII characters to 391 420 // "?" so we use --html instead, which produces HTML entities. 
392 string cmd = "unrtf --nopict --html 2>/dev/null" + shell_protect(file);421 string cmd = "unrtf --nopict --html " + shell_protect(file); 393 422 MyHtmlParser p; 394 423 try { 395 p.parse_html( stdout_to_string(cmd));424 p.parse_html(run_cmd(cmd)); 396 425 } catch (ReadError) { 397 426 cout << "\"" << cmd << "\" failed - skipping\n"; 398 427 return; … … 413 442 // from inspecting the source it looks like it's probably iso-8859-1. 414 443 string cmd = "pod2text " + shell_protect(file); 415 444 try { 416 dump = stdout_to_string(cmd);445 dump = run_cmd(cmd); 417 446 convert_to_utf8(dump, "ISO-8859-1"); 418 447 } catch (ReadError) { 419 448 cout << "\"" << cmd << "\" failed - skipping\n"; … … 427 456 // decompositions". 428 457 string cmd = "catdvi -e2 -s " + shell_protect(file); 429 458 try { 430 dump = stdout_to_string(cmd);459 dump = run_cmd(cmd); 431 460 convert_to_utf8(dump, "ISO-8859-1"); 432 461 } catch (ReadError) { 433 462 cout << "\"" << cmd << "\" failed - skipping\n"; … … 440 469 // (as it is in CP1250). 441 470 string cmd = "djvutxt " + shell_protect(file); 442 471 try { 443 dump = stdout_to_string(cmd);472 dump = run_cmd(cmd); 444 473 } catch (ReadError) { 445 474 cout << "\"" << cmd << "\" failed - skipping\n"; 446 475 return; 447 476 } 477 #if 0 // FIXME: this won't work as omindex will have the database locked... 
478 } else if (mimetype == "message/rfc822") { 479 // => mbox2script 480 //for stemmer lang, parse stemmer.get_description => Xapian::Stem(bla) 481 string cmd = "(mbox2omega " + shell_protect(file) + "|" 482 "scriptindex " + shell_protect(dbpath) + " /usr/share/omega/mbox2script.script)"; 483 try { 484 dump = run_cmd(cmd); 485 } catch (ReadError) { 486 cout << "\"" << cmd << "\" failed - skipping\n"; 487 return; 488 } 489 #endif 490 } else if (mimetype == "application/vnd.ms-outlook") { // msg 491 string cmd = "outlook2text " + shell_protect(file); 492 try { 493 dump = run_cmd(cmd); 494 } catch (ReadError) { 495 cout << "\"" << cmd << "\" failed - skipping\n"; 496 return; 497 } 448 498 } else { 449 499 // Don't know how to index this type. 450 500 cout << "unknown MIME type - skipping\n"; … … 581 631 } 582 632 continue; 583 633 case DirectoryIterator::REGULAR_FILE: { 634 if (strcmp(d.leafname(), "mbox") == 0) { 635 // Special filename. 636 off_t size = d.get_size(); 637 time_t mtime = d.get_mtime(); 638 index_file(indexroot + url, "message/rfc822", mtime, 639 size); 640 continue; 641 } 642 584 643 string ext; 585 644 string::size_type dot = url.find_last_of('.'); 586 645 if (dot != string::npos) ext = url.substr(dot + 1); … … 613 672 614 673 // It's in our MIME map so we know how to index it. 615 674 const string & mimetype = mt->second; 675 676 // NOTE: unpacking does not work on MSWin32 this way! 677 // We'd really have to pull in utils.cc:rmdir from 678 // xapian-core. 
679 #ifndef _MSC_VER 680 if (ext == "rar") { 681 // TODO: Check timestamp 682 string x = root+indexroot+"/.rar"; 683 cout << "[UNRAR into " << shell_protect(x+url) << "]" << endl; 684 run_cmd("mkdir -p "+shell_protect(x+url)); 685 string cmd = "unrar x -o+ " +shell_protect(file) + " " + shell_protect(x+url+"/"); 686 run_cmd(cmd); 687 index_directory(5, "/.rar"+url, mime_map); 688 if (!nocleanup) { 689 cout << "[CLEANUP " << "rm -rf " << shell_protect(x) << "]" << endl; 690 run_cmd("rm -rf "+shell_protect(x)); 691 } 692 } else if (ext == "zip") { 693 // TODO: Check timestamp 694 string x = root+indexroot+"/.zip"; 695 cout << "[UNZIP into " << shell_protect(x+url) << "]" << endl; 696 run_cmd("mkdir -p "+shell_protect(x+url)); 697 string cmd = "unzip -o " +shell_protect(file) + " -d " +shell_protect(x+url+"/"); 698 run_cmd(cmd); 699 index_directory(5, "/.zip"+url, mime_map); 700 if (!nocleanup) { 701 cout << "[CLEANUP " << "rm -rf " << shell_protect(x) << "]" << endl; 702 run_cmd("rm -rf "+shell_protect(x)); 703 } 704 } else if (ext == "pst") { 705 // TODO: Check timestamp 706 string x = root+indexroot+"/.pst"; 707 cout << "[READPST into " << shell_protect(x+url) << "]" << endl; 708 run_cmd("mkdir -p "+shell_protect(x+url)); 709 // unpack attachments also, together with mbox files 710 string cmd = "readpst -r -cv -w -o "+shell_protect(x+url)+" "+shell_protect(file); 711 run_cmd(cmd); 712 cout << "[UNPACK mbox attachments in " << shell_protect(x+url) << "]" << endl; 713 cmd = "/usr/bin/find "+shell_protect(x+url)+" -name mbox -execdir uudeview -a -o -i '{}' ';'"; 714 run_cmd(cmd); 715 index_directory(5, "/.pst"+url, mime_map); // mbox handling 716 if (!nocleanup) { 717 cout << "[CLEANUP " << "rm -rf " << shell_protect(x) << "]" << endl; 718 run_cmd("rm -rf "+shell_protect(x)); 719 } 720 } else 721 #endif 616 722 try { 617 723 time_t mtime = d.get_mtime(); 618 724 index_file(indexroot + url, mimetype, mtime, size); … … 653 759 // If preserve_unupdated is false, delete 
any documents we don't 654 760 // replace (if in replace duplicates mode) 655 761 bool preserve_unupdated = false; 762 // If ignore_time is true, the existing timestamps are not checked on 763 // updates and every file will be parsed. 656 764 size_t depth_limit = 0; 657 765 658 766 static const struct option longopts[] = { … … 667 775 { "depth-limit",required_argument, NULL, 'l' }, 668 776 { "follow", no_argument, NULL, 'f' }, 669 777 { "stemmer", required_argument, NULL, 's' }, 778 { "ignore-time",no_argument, NULL, 'i' }, 779 { "nocleanup", no_argument, NULL, 'c' }, 670 780 { 0, 0, NULL, 0 } 671 781 }; 672 782 … … 742 852 mime_map["djv"] = "image/vnd.djvu"; 743 853 mime_map["djvu"] = "image/vnd.djvu"; 744 854 745 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpf", longopts, NULL)) != -1) { 855 mime_map["msg"] = "application/vnd.ms-outlook"; // outlook2text - single message 856 mime_map["pst"] = "application/vnd.ms-outlook-pst"; // readpst | mimeexpand (libpst, Mime-tools) Outlook messager folder 857 mime_map["mbox"] = "message/rfc822"; // => mbox2script 858 859 #ifndef _MSC_VER 860 mime_map["zip"] = "application/x-zip"; // recursive scanning 861 mime_map["rar"] = "application/x-rar"; // recursive scanning 862 #endif 863 864 read_config_file(); 865 866 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:lpfi", longopts, NULL)) != -1) { 746 867 switch (getopt_ret) { 747 868 case 'h': { 748 869 cout << PROG_NAME" - "PROG_DESC"\n\n" … … 756 877 " -M, --mime-type additional MIME mapping ext:type\n" 757 878 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n" 758 879 " -f, --follow follow symbolic links\n" 880 " -i, --ignore-time ignore timestamp comparison\n" 881 " --nocleanup don't delete temporary created from zip/rar/pst\n" 759 882 " --overwrite create the database anew (the default is to update\n" 760 883 " if the database already exists)" << endl; 761 884 print_stemmer_help(" "); … … 769 892 << "Copyright (c) 2001,2005 James 
Aylett\n" 770 893 << "Copyright (c) 2001,2002 Ananova Ltd\n" 771 894 << "Copyright (c) 2002,2003,2004,2005,2006 Olly Betts\n\n" 895 << "Copyright (c) 2006 AVL List GesmbH\n\n" 772 896 << "This is free software, and may be redistributed under\n" 773 897 << "the terms of the GNU Public License." << endl; 774 898 return 0; … … 785 909 case 'p': // don't delete unupdated documents 786 910 preserve_unupdated = true; 787 911 break; 912 case 'i': // --ignore-time: on updates parse the file again 913 ignore_time = true; 914 break; 915 case 'c': 916 nocleanup = true; 917 break; 788 918 case 'l': { // Set recursion limit 789 919 int arg = atoi(optarg); 790 920 if (arg < 0) arg = 0; … … 844 974 if (baseurl.empty()) { 845 975 cerr << PROG_NAME": --url not specified, assuming `/'.\n"; 846 976 } 977 error_log = " 2>>"+log_dir+"omindex-error.log"; 847 978 // baseurl mustn't end '/' or you end up with the wrong URL 848 979 // (//thing is different to /thing). We could probably make this 849 980 // safe a different way, by ensuring that we don't put a leading '/' -
Makefile.am
103 103 omindex_SOURCES = omindex.cc myhtmlparse.cc htmlparse.cc\ 104 104 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc md5.cc\ 105 105 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\ 106 runfilter.cc freemem.cc common/msvc_dirent.cc 106 runfilter.cc freemem.cc common/msvc_dirent.cc configfile.cc 107 107 if NEED_MKDTEMP 108 108 omindex_SOURCES += portability/mkdtemp.cc 109 109 endif