Ticket #595: atomparse.diff

File atomparse.diff, 6.0 KB (added by Mihai Bivol, 12 years ago)

Adds an Atom parser and allows Omega to index Atom files.

  • xapian-applications/omega/Makefile.am

    diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
    index 48aaf97..741c83d 100644
    a b noinst_HEADERS = omega.h query.h cgiparam.h\  
    9797 md5.h md5wrap.h xmlparse.h metaxmlparse.h values.h utf8convert.h\
    9898 namedentities.h pkglibbindir.h datematchdecider.h sample.h strcasecmp.h\
    9999 utf8truncate.h diritor.h runfilter.h freemem.h xpsxmlparse.h transform.h\
    100  weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h
     100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h atomparse.h
    101101
    102102# headers maintained in xapian-core
    103103noinst_HEADERS +=\
    omindex_SOURCES = omindex.cc myhtmlparse.cc htmlparse.cc\  
    138138 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc md5.cc\
    139139 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\
    140140 runfilter.cc freemem.cc common/msvc_dirent.cc xpsxmlparse.cc common/str.cc\
    141  pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc
     141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc atomparse.cc
    142142if NEED_MKDTEMP
    143143omindex_SOURCES += portability/mkdtemp.cc
    144144endif
  • new file xapian-applications/omega/atomparse.cc

    diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc
    new file mode 100644
    index 0000000..6194016
    - +  
     1/** @file atomparse.cc
     2 * @brief Extract text from an Atom feed file.
     3 */
     4/* Copyright (C) 2010,2011 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21#include <config.h>
     22
     23#include "atomparse.h"
     24#include "stringutils.h"
     25
     26using namespace std;
     27
     28void
     29AtomParser::process_text(const string &text)  // Append text to the member string selected by the current state.
     30{
     31    string * target = NULL;  // Set by every case below except OTHER, which returns early.
     32    switch (state) {
     33        case TEXT:
     34            target = &dump;
     35            break;
     36        case TITLE:
     37            target = &title;
     38            break;
     39        case KEYWORDS:
     40            target = &keywords;
     41            break;
     42        case AUTHOR:
     43            target = &author;
     44            break;
     45        case OTHER:
     46            // Ignore content in other places.
     47            return;
     48    }
     49    if (!target->empty())
     50        *target += ' ';  // Separate successive chunks of text with a single space.
     51    *target += text;
     52}
     53
     54bool
     55AtomParser::opening_tag(const string &tag)  // Update state / in_entry for a recognised opening tag; always returns true.
     56{
     57        if (state == OTHER) {  // Tags are only examined while in the OTHER state.
     58                if (tag == "title")
     59                        state = in_entry ? KEYWORDS : TITLE;  // Inside an entry, "title" selects KEYWORDS rather than TITLE.
     60                else if (tag == "summary" || tag == "subtitle")
     61                        state = TEXT;
     62                else if (tag == "author")
     63                        state = AUTHOR;
     64                else if (tag == "category")
     65                        state = KEYWORDS;
     66                else if (tag == "entry")
     67                        in_entry = true;  // Leaves state unchanged; only records that an entry is open.
     68        }
     69    return true;
     70}
     71
     72bool
     73AtomParser::closing_tag(const string &tag)  // Clear in_entry or reset state for a recognised closing tag; always returns true.
     74{
     75        if (tag == "entry")
     76                in_entry = false;
     77        else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
     78                 tag == "author" || tag == "category")
     79                state = OTHER;  // Reset regardless of which of these tags set the current state.
     80    return true;
     81}
  • new file xapian-applications/omega/atomparse.h

    diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h
    new file mode 100644
    index 0000000..342cb33
    - +  
     1/** @file atomparse.h
     2 * @brief Extract text from an Atom feed file.
     3 */
     4/* Copyright (C) 2010,2011 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21#ifndef OMEGA_INCLUDED_ATOMPARSE_H
     22#define OMEGA_INCLUDED_ATOMPARSE_H
     23
     24#include "htmlparse.h"
     25
     26class AtomParser : public HtmlParser {  // HtmlParser subclass that gathers title, keywords, author and text from Atom markup.
     27    enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state;  // Selects which member string process_text() appends to; OTHER means text is ignored.
     28        bool in_entry;  // Set by an opening "entry" tag and cleared by a closing one.
     29  public:
     30    AtomParser() : state(OTHER), in_entry(false) { }  // Start in the OTHER state and not inside an entry.
     31    void process_text(const string &text);
     32    bool opening_tag(const string &tag);
     33    bool closing_tag(const string &tag);
     34    string title, keywords, dump, author;  // Accumulated output strings, filled in by process_text().
     35};
     36
     37#endif // OMEGA_INCLUDED_ATOMPARSE_H
  • xapian-applications/omega/omindex.cc

    diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
    index 7760941..bab3961 100644
    a b  
    6262#include "values.h"
    6363#include "xmlparse.h"
    6464#include "xpsxmlparse.h"
     65#include "atomparse.h"
    6566
    6667#include "gnu_getopt.h"
    6768
    index_mimetype(const string & file, const string & url, const string & ext,  
    765766            if (idx != string::npos) {
    766767                dump.assign(desc, idx + 1, string::npos);
    767768            }
     769        } else if (mimetype == "application/atom+xml") {
     770                AtomParser atomparser;
     771                atomparser.parse_html(d.file_to_string());
     772                dump = atomparser.dump;
     773                title = atomparser.title;
     774                keywords = atomparser.keywords;
     775                author = atomparser.author;
    768776        } else {
    769777            // Don't know how to index this type.
    770778            skip_unknown_mimetype(file, mimetype);
    main(int argc, char **argv)  
    11531161    // RPM packages:
    11541162    mime_map["rpm"] = "application/x-redhat-package-manager";
    11551163
     1164        // Atom feeds:
     1165        mime_map["atom"] = "application/atom+xml";
     1166
    11561167    // Extensions to quietly ignore:
    11571168    mime_map["a"] = "ignore";
    11581169    mime_map["bin"] = "ignore";