Ticket #114: libextractor.patch

File libextractor.patch, 4.9 KB (added by Ryan Underwood, 18 years ago)

Patch to make omindex use libmagic (to detect each file's MIME type) and libextractor (to extract keywords and metadata)

  • omega/omindex.cc

     
    2626#include <algorithm>
    2727#include <fstream>
    2828#include <iostream>
     29#include <sstream>
    2930#include <string>
    3031#include <map>
    3132#include <vector>
     
    4142
    4243#include <xapian.h>
    4344
     45#include <magic.h>
     46#include <extractor.h>
     47
    4448#include "commonhelp.h"
    4549#include "hashterm.h"
    4650#include "indextext.h"
     
    208212}
    209213
    210214static void
    211 index_file(const string &url, const string &mimetype, time_t last_mod, off_t size)
     215index_file(const string &url, EXTRACTOR_ExtractorList* extractors, time_t last_mod, off_t size, magic_t cookie)
    212216{
    213217    string file = root + url;
    214218    string title, sample, keywords, dump;
    215219
    216     cout << "Indexing \"" << url << "\" as " << mimetype << " ... " << flush;
     220    cout << "Indexing \"" << url << " ... " << flush;
    217221
    218222    string urlterm("U");
    219223    urlterm += baseurl;
     
    228232    }
    229233
    230234    string md5;
     235#if 0
    231236    if (mimetype == "text/html") {
    232237        string text;
    233238        try {
     
    429434        cout << "unknown MIME type - skipping\n";
    430435        return;
    431436    }
     437#endif
    432438
    433     // Compute the MD5 of the file if we haven't already.
    434     if (md5.empty() && md5_file(file, md5) == 0) {
    435         cout << "failed to read file to calculate MD5 checksum - skipping\n";
    436         return;
     439    // Find mime type of file using libmagic
     440    const char* mime = magic_file(cookie, file.c_str());
     441    if (mime == NULL) {
     442        std::cerr << magic_error(cookie) << std::endl;
     443        throw;
    437444    }
     445    string mimetype(mime);
     446    cout << "type " << mimetype << "...";
     447    // If file is plain text, just index it as-is
     448    // This ignores the charset determined by libmagic... i.e. us-ascii
     449    if (strncmp(mime, "text/plain", strlen("text/plain")) == 0) {
     450        try {
     451            // Currently we assume that text files are UTF-8.
     452            // FIXME: What charset is the file?  Look for BOM?  Look at contents?
     453            dump = file_to_string(file);
     454        } catch (ReadError) {
     455            cout << "can't read \"" << file << "\" - skipping\n";
     456            return;
     457        }
     458    }
     459    else {
     460        stringstream ss;
     461        EXTRACTOR_KeywordList* exkeywords = EXTRACTOR_getKeywords(extractors, file.c_str());
     462        //exkeywords = EXTRACTOR_removeDuplicateKeywords(exkeywords, EXTRACTOR_DUPLICATES_TYPELESS);
    438463
     464        if (exkeywords == NULL) {
     465            cout << "ignored by libextractor.\n";
     466            return;
     467        }
     468
     469        while (exkeywords != NULL) {
     470            if (exkeywords->keywordType == EXTRACTOR_TITLE)
     471                title = exkeywords->keyword;
     472            else if (exkeywords->keywordType == EXTRACTOR_KEYWORDS)
     473                keywords = exkeywords->keyword;
     474            else if (exkeywords->keywordType == EXTRACTOR_DESCRIPTION)
     475                sample = exkeywords->keyword;
     476
     477            ss << exkeywords->keyword << " ";
     478            exkeywords = exkeywords->next;
     479        }
     480        dump = ss.str();
     481        EXTRACTOR_freeKeywords(exkeywords);
     482
     483        // Compute the MD5 of the file if we haven't already.
     484        if (md5.empty() && md5_file(file, md5) == 0) {
     485            cout << "failed to read file to calculate MD5 checksum - skipping\n";
     486            return;
     487        }
     488    }
     489
    439490    // Produce a sample
    440491    if (sample.empty()) {
    441492        sample = truncate_to_word(dump, SAMPLE_SIZE);
     
    535586{
    536587    struct dirent *ent;
    537588    string path = root + indexroot + dir;
     589    EXTRACTOR_ExtractorList* extractors = EXTRACTOR_loadDefaultLibraries();
    538590
     591#if 1
     592    magic_t cookie = magic_open(MAGIC_MIME);
     593    int result = magic_load(cookie, NULL);
     594    if (result == -1) {
     595        cout << "Can't open MIME database: " << magic_error(cookie) << endl;
     596        throw;
     597    }
     598#endif
     599
    539600    cout << "[Entering directory " << dir << "]" << endl;
    540601
    541602    DIR *d = opendir(path.c_str());
     
    583644                cout << "Skipping empty file: \"" << file << "\"" << endl;
    584645                continue;
    585646            }
    586             string ext;
    587             string::size_type dot = url.find_last_of('.');
    588             if (dot != string::npos) ext = url.substr(dot + 1);
    589647
    590             map<string,string>::iterator mt = mime_map.find(ext);
    591             if (mt != mime_map.end()) {
     648            //if (mimetype != "data") {
    592649                // It's in our MIME map so we know how to index it.
    593                 const string & mimetype = mt->second;
    594650                try {
    595                     index_file(indexroot + url, mimetype, statbuf.st_mtime,
    596                                statbuf.st_size);
     651                    index_file(indexroot + url, extractors, statbuf.st_mtime,
     652                               statbuf.st_size, cookie);
    597653                } catch (NoSuchFilter) {
    598654                    // FIXME: we ought to ignore by mime-type not extension.
    599                     cout << "Filter for \"" << mimetype << "\" not installed - ignoring extension \"" << ext << "\"" << endl;
    600                     mime_map.erase(mt);
     655//                  cout << "Filter for \"" << mimetype << "\" not installed - ignoring file \"" << file << "\"" << endl;
    601656                }
    602             }
     657            //}
    603658            continue;
    604659        }
    605660        cout << "Not a regular file \"" << file << "\" - skipping\n";
    606661    }
    607662    closedir(d);
     663    EXTRACTOR_removeAll(extractors);
     664    magic_close(cookie);
    608665}
    609666
    610667int