Ticket #114: libextractor.patch
File libextractor.patch, 4.9 KB (added 18 years ago)

- omega/omindex.cc
26 26 #include <algorithm> 27 27 #include <fstream> 28 28 #include <iostream> 29 #include <sstream> 29 30 #include <string> 30 31 #include <map> 31 32 #include <vector> … … 41 42 42 43 #include <xapian.h> 43 44 45 #include <magic.h> 46 #include <extractor.h> 47 44 48 #include "commonhelp.h" 45 49 #include "hashterm.h" 46 50 #include "indextext.h" … … 208 212 } 209 213 210 214 static void 211 index_file(const string &url, const string &mimetype, time_t last_mod, off_t size)215 index_file(const string &url, EXTRACTOR_ExtractorList* extractors, time_t last_mod, off_t size, magic_t cookie) 212 216 { 213 217 string file = root + url; 214 218 string title, sample, keywords, dump; 215 219 216 cout << "Indexing \"" << url << " \" as " << mimetype << "... " << flush;220 cout << "Indexing \"" << url << " ... " << flush; 217 221 218 222 string urlterm("U"); 219 223 urlterm += baseurl; … … 228 232 } 229 233 230 234 string md5; 235 #if 0 231 236 if (mimetype == "text/html") { 232 237 string text; 233 238 try { … … 429 434 cout << "unknown MIME type - skipping\n"; 430 435 return; 431 436 } 437 #endif 432 438 433 // Compute the MD5 of the file if we haven't already. 434 if (md5.empty() && md5_file(file, md5) == 0) { 435 cout << "failed to read file to calculate MD5 checksum - skipping\n"; 436 return; 439 // Find mime type of file using libmagic 440 const char* mime = magic_file(cookie, file.c_str()); 441 if (mime == NULL) { 442 std::cerr << magic_error(cookie) << std::endl; 443 throw; 437 444 } 445 string mimetype(mime); 446 cout << "type " << mimetype << "..."; 447 // If file is plain text, just index it as-is 448 // This ignores the charset determined by libmagic... i.e. us-ascii 449 if (strncmp(mime, "text/plain", strlen("text/plain")) == 0) { 450 try { 451 // Currently we assume that text files are UTF-8. 452 // FIXME: What charset is the file? Look for BOM? Look at contents? 
453 dump = file_to_string(file); 454 } catch (ReadError) { 455 cout << "can't read \"" << file << "\" - skipping\n"; 456 return; 457 } 458 } 459 else { 460 stringstream ss; 461 EXTRACTOR_KeywordList* exkeywords = EXTRACTOR_getKeywords(extractors, file.c_str()); 462 //exkeywords = EXTRACTOR_removeDuplicateKeywords(exkeywords, EXTRACTOR_DUPLICATES_TYPELESS); 438 463 464 if (exkeywords == NULL) { 465 cout << "ignored by libextractor.\n"; 466 return; 467 } 468 469 while (exkeywords != NULL) { 470 if (exkeywords->keywordType == EXTRACTOR_TITLE) 471 title = exkeywords->keyword; 472 else if (exkeywords->keywordType == EXTRACTOR_KEYWORDS) 473 keywords = exkeywords->keyword; 474 else if (exkeywords->keywordType == EXTRACTOR_DESCRIPTION) 475 sample = exkeywords->keyword; 476 477 ss << exkeywords->keyword << " "; 478 exkeywords = exkeywords->next; 479 } 480 dump = ss.str(); 481 EXTRACTOR_freeKeywords(exkeywords); 482 483 // Compute the MD5 of the file if we haven't already. 484 if (md5.empty() && md5_file(file, md5) == 0) { 485 cout << "failed to read file to calculate MD5 checksum - skipping\n"; 486 return; 487 } 488 } 489 439 490 // Produce a sample 440 491 if (sample.empty()) { 441 492 sample = truncate_to_word(dump, SAMPLE_SIZE); … … 535 586 { 536 587 struct dirent *ent; 537 588 string path = root + indexroot + dir; 589 EXTRACTOR_ExtractorList* extractors = EXTRACTOR_loadDefaultLibraries(); 538 590 591 #if 1 592 magic_t cookie = magic_open(MAGIC_MIME); 593 int result = magic_load(cookie, NULL); 594 if (result == -1) { 595 cout << "Can't open MIME database: " << magic_error(cookie) << endl; 596 throw; 597 } 598 #endif 599 539 600 cout << "[Entering directory " << dir << "]" << endl; 540 601 541 602 DIR *d = opendir(path.c_str()); … … 583 644 cout << "Skipping empty file: \"" << file << "\"" << endl; 584 645 continue; 585 646 } 586 string ext;587 string::size_type dot = url.find_last_of('.');588 if (dot != string::npos) ext = url.substr(dot + 1);589 647 590 
map<string,string>::iterator mt = mime_map.find(ext); 591 if (mt != mime_map.end()) { 648 //if (mimetype != "data") { 592 649 // It's in our MIME map so we know how to index it. 593 const string & mimetype = mt->second;594 650 try { 595 index_file(indexroot + url, mimetype, statbuf.st_mtime,596 statbuf.st_size );651 index_file(indexroot + url, extractors, statbuf.st_mtime, 652 statbuf.st_size, cookie); 597 653 } catch (NoSuchFilter) { 598 654 // FIXME: we ought to ignore by mime-type not extension. 599 cout << "Filter for \"" << mimetype << "\" not installed - ignoring extension \"" << ext << "\"" << endl; 600 mime_map.erase(mt); 655 // cout << "Filter for \"" << mimetype << "\" not installed - ignoring file \"" << file << "\"" << endl; 601 656 } 602 }657 //} 603 658 continue; 604 659 } 605 660 cout << "Not a regular file \"" << file << "\" - skipping\n"; 606 661 } 607 662 closedir(d); 663 EXTRACTOR_removeAll(extractors); 664 magic_close(cookie); 608 665 } 609 666 610 667 int