Ticket #595: atomparse.2.diff

File atomparse.2.diff, 7.2 KB (added by Mihai Bivol, 12 years ago)
  • xapian-applications/omega/Makefile.am

    diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
    index 48aaf97..741c83d 100644
    a b noinst_HEADERS = omega.h query.h cgiparam.h\  
    9797 md5.h md5wrap.h xmlparse.h metaxmlparse.h values.h utf8convert.h\
    9898 namedentities.h pkglibbindir.h datematchdecider.h sample.h strcasecmp.h\
    9999 utf8truncate.h diritor.h runfilter.h freemem.h xpsxmlparse.h transform.h\
    100  weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h
     100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h atomparse.h
    101101
    102102# headers maintained in xapian-core
    103103noinst_HEADERS +=\
    omindex_SOURCES = omindex.cc myhtmlparse.cc htmlparse.cc\  
    138138 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc md5.cc\
    139139 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\
    140140 runfilter.cc freemem.cc common/msvc_dirent.cc xpsxmlparse.cc common/str.cc\
    141  pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc
     141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc atomparse.cc
    142142if NEED_MKDTEMP
    143143omindex_SOURCES += portability/mkdtemp.cc
    144144endif
  • new file xapian-applications/omega/atomparse.cc

    diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc
    new file mode 100644
    index 0000000..d10c6f8
    - +  
     1/** @file atomparse.cc
     2 * @brief Extract text from an Atom feed.
     3 */
     4/* Copyright (C) 2010,2011 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21#include <config.h>
     22#include <iostream>
     23
     24#include "atomparse.h"
     25#include "stringutils.h"
     26#include "myhtmlparse.h"
     27
     28using namespace std;
     29
     30void
     31AtomParser::process_text(const string &text)
     32{
     33        if (is_escaped)
     34                return;
     35
     36    string * target = NULL;
     37        string text_copy = text;
     38
     39    switch (state) {
     40        case TEXT:
     41            target = &dump;
     42            break;
     43        case TITLE:
     44            target = &title;
     45            break;
     46        case KEYWORDS:
     47            target = &keywords;
     48            break;
     49        case AUTHOR:
     50            target = &author;
     51            break;
     52        case OTHER:
     53            // Ignore context in other places.
     54            return;
     55    }
     56
     57        if (type == "html") {
     58                MyHtmlParser p;
     59                p.parse_html(text_copy, "iso-8859-1", false);
     60                text_copy = p.dump;
     61        }
     62
     63    if (!target->empty())
     64                *target += ' ';
     65
     66        *target += text_copy;
     67}
     68
     69bool
     70AtomParser::opening_tag(const string &tag)
     71{
     72        if (state == OTHER) {
     73                if (tag == "title")
     74                        state = in_entry ? KEYWORDS : TITLE;
     75                else if (tag == "summary" || tag == "subtitle" || tag == "content")
     76                        state = TEXT;
     77                else if (tag == "author")
     78                        state = AUTHOR;
     79                else if (tag == "entry")
     80                        in_entry = true;
     81                else if (tag == "category") {
     82                        //handle category term sepparatley
     83                        string new_keyword;
     84                        get_parameter("term", new_keyword);
     85                        keywords = keywords + ' ' + new_keyword;
     86                }
     87        } else if (state == AUTHOR) {
     88                if (tag == "uri")
     89                        is_escaped = true;
     90        }
     91
     92        get_parameter("type", type);
     93    return true;
     94}
     95
     96bool
     97AtomParser::closing_tag(const string &tag)
     98{
     99        if (tag == "entry")
     100                in_entry = false;
     101        if (tag == "uri")
     102                is_escaped = false;
     103
     104        else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
     105                 tag == "author" || tag == "content")
     106                state = OTHER;
     107    return true;
     108}
  • new file xapian-applications/omega/atomparse.h

    diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h
    new file mode 100644
    index 0000000..690d8d1
    - +  
     1/** @file atomparse.h
     2 * @brief Extract text from an Atom feed.
     3 */
     4/* Copyright (C) 2010,2011 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21#ifndef OMEGA_INCLUDED_ATOMPARSE_H
     22#define OMEGA_INCLUDED_ATOMPARSE_H
     23
     24#include "htmlparse.h"
     25
     26class AtomParser : public HtmlParser {
     27    enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state;
     28        bool in_entry, is_escaped;
     29        string type;
     30  public:
     31    AtomParser() : state(OTHER), in_entry(false), is_escaped(false) { }
     32    void process_text(const string &text);
     33    bool opening_tag(const string &tag);
     34    bool closing_tag(const string &tag);
     35    string title, keywords, dump, author;
     36};
     37
     38#endif // OMEGA_INCLUDED_ATOMPARSE_H
  • xapian-applications/omega/omega.conf

    diff --git a/xapian-applications/omega/omega.conf b/xapian-applications/omega/omega.conf
    index 1dff85d..796d50c 100644
    a b  
    11# Directory containing Xapian databases:
    2 database_dir /var/lib/omega/data
     2database_dir /home/mihai/sandbox/xapiandb/
    33
    44# Directory containing OmegaScript templates:
    5 template_dir /var/lib/omega/templates
     5template_dir /home/mihai/repos/xapian/xapian-applications/omega/templates
    66
    77# Directory to write Omega logs to:
    88log_dir /var/log/omega
  • xapian-applications/omega/omindex.cc

    diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
    index 7760941..bab3961 100644
    a b  
    6262#include "values.h"
    6363#include "xmlparse.h"
    6464#include "xpsxmlparse.h"
     65#include "atomparse.h"
    6566
    6667#include "gnu_getopt.h"
    6768
    index_mimetype(const string & file, const string & url, const string & ext,  
    765766            if (idx != string::npos) {
    766767                dump.assign(desc, idx + 1, string::npos);
    767768            }
     769        } else if (mimetype == "application/atom+xml") {
     770                AtomParser atomparser;
     771                atomparser.parse_html(d.file_to_string());
     772                dump = atomparser.dump;
     773                title = atomparser.title;
     774                keywords = atomparser.keywords;
     775                author = atomparser.author;
    768776        } else {
    769777            // Don't know how to index this type.
    770778            skip_unknown_mimetype(file, mimetype);
    main(int argc, char **argv)  
    11531161    // RPM packages:
    11541162    mime_map["rpm"] = "application/x-redhat-package-manager";
    11551163
     1164        // Atom feeds:
     1165        mime_map["atom"] = "application/atom+xml";
     1166
    11561167    // Extensions to quietly ignore:
    11571168    mime_map["a"] = "ignore";
    11581169    mime_map["bin"] = "ignore";