Ticket #595: atomparse.additional.patch

File atomparse.additional.patch, 12.0 KB (added by Olly Betts, 12 years ago)

Additional changes applied.

  • xapian-applications/omega/Makefile.am

    diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
    index 741c83d..01742ac 100644
    a b dist_pkglibbin_SCRIPTS = outlookmsg2html  
    8080bin_PROGRAMS = omindex scriptindex
    8181dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
    8282
    83 check_PROGRAMS = htmlparsetest md5test urlenctest utf8converttest
    84 TESTS = htmlparsetest$(EXEEXT)\
     83check_PROGRAMS = atomparsetest htmlparsetest md5test urlenctest utf8converttest
     84TESTS = atomparsetest$(EXEEXT)\
     85        htmlparsetest$(EXEEXT)\
    8586        md5test$(EXEEXT)\
    8687        urlenctest$(EXEEXT)\
    8788        utf8converttest$(EXEEXT)
    scriptindex_SOURCES = scriptindex.cc myhtmlparse.cc htmlparse.cc\  
    149150 common/safe.cc common/stringutils.cc utf8convert.cc utf8truncate.cc
    150151scriptindex_LDADD = $(XAPIAN_LIBS)
    151152
     153atomparsetest_SOURCES = atomparsetest.cc atomparse.cc htmlparse.cc\
     154 myhtmlparse.cc utf8convert.cc
     155atomparsetest_LDADD = $(XAPIAN_LIBS)
     156
    152157htmlparsetest_SOURCES = htmlparsetest.cc myhtmlparse.cc htmlparse.cc\
    153158 utf8convert.cc
    154159htmlparsetest_LDADD = $(XAPIAN_LIBS)
  • xapian-applications/omega/atomparse.cc

    diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc
    index d10c6f8..2c286e5 100644
    a b  
    11/** @file atomparse.cc
    22 * @brief Extract text from an RSS atom file.
    33 */
    4 /* Copyright (C) 2010,2011 Olly Betts
     4/* Copyright (C) 2010,2011,2012 Olly Betts
     5 * Copyright (C) 2012 Mihai Bivol
    56 *
    67 * This program is free software; you can redistribute it and/or modify
    78 * it under the terms of the GNU General Public License as published by
     
    1920 */
    2021
    2122#include <config.h>
    22 #include <iostream>
    2323
    2424#include "atomparse.h"
    25 #include "stringutils.h"
    2625#include "myhtmlparse.h"
     26#include "stringutils.h"
    2727
    2828using namespace std;
    2929
    3030void
    3131AtomParser::process_text(const string &text)
    3232{
    33         if (is_escaped)
    34                 return;
     33    if (is_ignored)
     34        return;
    3535
    3636    string * target = NULL;
    37         string text_copy = text;
    3837
    3938    switch (state) {
    4039        case TEXT:
    AtomParser::process_text(const string &text)  
    5453            return;
    5554    }
    5655
    57         if (type == "html") {
    58                 MyHtmlParser p;
    59                 p.parse_html(text_copy, "iso-8859-1", false);
    60                 text_copy = p.dump;
    61         }
    62 
    6356    if (!target->empty())
    64                 *target += ' ';
     57        *target += ' ';
    6558
    66         *target += text_copy;
     59    if (type == "html") {
     60        MyHtmlParser p;
     61        p.parse_html(text, "utf-8", true);
     62        *target += p.dump;
     63    } else {
     64        *target += text;
     65    }
    6766}
    6867
    6968bool
    7069AtomParser::opening_tag(const string &tag)
    7170{
    72         if (state == OTHER) {
    73                 if (tag == "title")
    74                         state = in_entry ? KEYWORDS : TITLE;
    75                 else if (tag == "summary" || tag == "subtitle" || tag == "content")
    76                         state = TEXT;
    77                 else if (tag == "author")
    78                         state = AUTHOR;
    79                 else if (tag == "entry")
    80                         in_entry = true;
    81                 else if (tag == "category") {
    82                         //handle category term sepparatley
    83                         string new_keyword;
    84                         get_parameter("term", new_keyword);
    85                         keywords = keywords + ' ' + new_keyword;
    86                 }
    87         } else if (state == AUTHOR) {
    88                 if (tag == "uri")
    89                         is_escaped = true;
     71    if (state == OTHER) {
     72        if (tag == "title")
     73            state = in_entry ? KEYWORDS : TITLE;
     74        else if (tag == "summary" || tag == "subtitle" || tag == "content")
     75            state = TEXT;
     76        else if (tag == "author")
     77            state = AUTHOR;
     78        else if (tag == "entry")
     79            in_entry = true;
     80        else if (tag == "category") {
     81            // Handle category term separately.
     82            string new_keyword;
     83            get_parameter("term", new_keyword);
     84            if (!keywords.empty())
     85                keywords += ' ';
     86            keywords += new_keyword;
    9087        }
     88    } else if (state == AUTHOR) {
     89        if (tag == "uri")
     90            is_ignored = true;
     91    }
    9192
    92         get_parameter("type", type);
     93    if (!get_parameter("type", type))
     94        type = "text";
    9395    return true;
    9496}
    9597
    9698bool
    9799AtomParser::closing_tag(const string &tag)
    98100{
    99         if (tag == "entry")
    100                 in_entry = false;
    101         if (tag == "uri")
    102                 is_escaped = false;
    103 
    104         else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
    105                  tag == "author" || tag == "content")
    106                 state = OTHER;
     101    if (tag == "entry")
     102        in_entry = false;
     103    else if (tag == "uri")
     104        is_ignored = false;
     105    else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
     106             tag == "author" || tag == "content")
     107        state = OTHER;
    107108    return true;
    108109}
  • xapian-applications/omega/atomparse.h

    diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h
    index 690d8d1..32a8f1c 100644
    a b  
    11/** @file atomparse.h
    22 * @brief Extract text from an RSS atom file.
    33 */
    4 /* Copyright (C) 2010,2011 Olly Betts
     4/* Copyright (C) 2010,2011,2012 Olly Betts
     5 * Copyright (C) 2012 Mihai Bivol
    56 *
    67 * This program is free software; you can redistribute it and/or modify
    78 * it under the terms of the GNU General Public License as published by
     
    2526
    2627class AtomParser : public HtmlParser {
    2728    enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state;
    28         bool in_entry, is_escaped;
    29         string type;
     29    bool in_entry, is_ignored;
     30    string type;
    3031  public:
    31     AtomParser() : state(OTHER), in_entry(false), is_escaped(false) { }
     32    AtomParser() : state(OTHER), in_entry(false), is_ignored(false) { }
    3233    void process_text(const string &text);
    3334    bool opening_tag(const string &tag);
    3435    bool closing_tag(const string &tag);
  • new file xapian-applications/omega/atomparsetest.cc

    diff --git a/xapian-applications/omega/atomparsetest.cc b/xapian-applications/omega/atomparsetest.cc
    new file mode 100644
    index 0000000..cf8e07d
    - +  
     1/* atomparsetest.cc: test the AtomParser class
     2 *
     3 * Copyright (C) 2006,2008,2011,2012 Olly Betts
     4 *
     5 * This program is free software; you can redistribute it and/or
     6 * modify it under the terms of the GNU General Public License as
     7 * published by the Free Software Foundation; either version 2 of the
     8 * License, or (at your option) any later version.
     9 *
     10 * This program is distributed in the hope that it will be useful,
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 * GNU General Public License for more details.
     14 *
     15 * You should have received a copy of the GNU General Public License
     16 * along with this program; if not, write to the Free Software
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
     18 * USA
     19 */
     20
     21#include <config.h>
     22
     23#include <cstdlib>
     24#include <iostream>
     25#include <string>
     26
     27#include "atomparse.h"
     28
     29using namespace std;
     30
     31struct testcase {
     32    const char * html;
     33    const char * dump;
     34    const char * title;
     35    const char * keywords;
     36    const char * author;
     37};
     38
     39static const testcase tests[] = {
     40    { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
     41      "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n"
     42      "<title type=\"text\">Hydrogen</title>\n"
     43      "<subtitle type=\"html\">&lt;b&gt;Subtitle&lt;b&gt;</subtitle>\n"
     44      "<author><name>Mr X</name><uri>http://example.org/x.atom</uri><email>x@example.org</email></author>\n"
     45      "<entry>\n"
     46      "<title>&lt;Post&gt;</title><category term=\"a\" /><category term=\"b\" /></entry>\n"
     47      "<content type=\"text\">Lorem ipsum</content>\n"
     48      "</entry>\n"
     49      "</feed>\n",
     50      "Subtitle Lorem ipsum",
     51      "Hydrogen",
     52      "<Post> a b",
     53      "Mr X x@example.org" },
     54    { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
     55      "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n"
     56      "<title type=\"html\">&lt;meta charset=\"iso-8859-1\"&gt;Helium</title>\n"
     57      "<entry>\n"
     58      "<category term=\"x\" />\n"
     59      "</entry>\n"
     60      "</feed>\n",
     61      "",
     62      "Helium",
     63      "x",
     64      "" },
     65    { 0, 0, 0, 0, 0 }
     66};
     67
     68int
     69main()
     70{
     71    for (size_t i = 0; tests[i].html; ++i) {
     72        AtomParser p;
     73        p.parse_html(tests[i].html);
     74        if (tests[i].dump != p.dump) {
     75            cout << "DUMP " << i << ": [" << p.dump << "] != [" << tests[i].dump << "]" << endl;
     76            exit(1);
     77        }
     78        if (tests[i].title != p.title) {
     79            cout << "TITLE " << i << ": [" << p.title << "] != [" << tests[i].title << "]" << endl;
     80            exit(1);
     81        }
     82        if (tests[i].keywords != p.keywords) {
     83            cout << "KEYWORDS " << i << ": [" << p.keywords << "] != [" << tests[i].keywords << "]" << endl;
     84            exit(1);
     85        }
     86        if (tests[i].author != p.author) {
     87            cout << "AUTHOR " << i << ": [" << p.author << "] != [" << tests[i].author << "]" << endl;
     88            exit(1);
     89        }
     90    }
     91}
  • xapian-applications/omega/docs/overview.rst

    diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst
    index e5e04ef..2a4db4a 100644
    a b site. (Note that the ``--depth-limit`` option may come in handy if you have  
    175175sites '/products' and '/products/large', or similar.)
    176176
    177177omindex has built-in support for indexing HTML, PHP, text files, CSV
    178 (Comma-Separated Values) files, and AbiWord documents.  It can also index a
    179 number of other formats using external programs.  Filter programs are run with
    180 CPU, time and memory limits to prevent a runaway filter from blocking indexing
    181 of other files.
     178(Comma-Separated Values) files, Atom feeds, and AbiWord documents.  It can also
     179index a number of other formats using external programs.  Filter programs are
     180run with CPU, time and memory limits to prevent a runaway filter from blocking
     181indexing of other files.
    182182
    183183The way omindex decides how to index a file is based around MIME content-types.
    184184First of all omindex will look up a file's extension in its extension to MIME
    other filters too - see below):  
    221221* XPS files (.xps) if unzip is available
    222222* Debian packages (.deb, .udeb) if dpkg-deb is available
    223223* RPM packages (.rpm) if rpm is available
     224* Atom feeds (.atom)
    224225
    225226If you have additional extensions that represent one of these types, you can
    226227add an additional MIME mapping using the ``--mime-type`` option.  For
    string, but to be useful there either needs to be a filter set for that type  
    238239   - text/plain
    239240   - text/rtf
    240241   - text/x-perl
     242   - application/atom+xml
    241243   - application/msword
    242244   - application/pdf
    243245   - application/postscript
  • xapian-applications/omega/omindex.cc

    diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
    index bab3961..aa92ae6 100644
    a b  
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012 Olly Betts
    77 * Copyright 2009 Frank J Bruzzaniti
     8 * Copyright 2012 Mihai Bivol
    89 *
    910 * This program is free software; you can redistribute it and/or
    1011 * modify it under the terms of the GNU General Public License as
     
    4344
    4445#include <xapian.h>
    4546
     47#include "atomparse.h"
    4648#include "commonhelp.h"
    4749#include "diritor.h"
    4850#include "hashterm.h"
     
    6264#include "values.h"
    6365#include "xmlparse.h"
    6466#include "xpsxmlparse.h"
    65 #include "atomparse.h"
    6667
    6768#include "gnu_getopt.h"
    6869
    index_mimetype(const string & file, const string & url, const string & ext,  
    767768                dump.assign(desc, idx + 1, string::npos);
    768769            }
    769770        } else if (mimetype == "application/atom+xml") {
    770                 AtomParser atomparser;
    771                 atomparser.parse_html(d.file_to_string());
    772                 dump = atomparser.dump;
    773                 title = atomparser.title;
    774                 keywords = atomparser.keywords;
    775                 author = atomparser.author;
     771            AtomParser atomparser;
     772            atomparser.parse_html(d.file_to_string());
     773            dump = atomparser.dump;
     774            title = atomparser.title;
     775            keywords = atomparser.keywords;
     776            author = atomparser.author;
    776777        } else {
    777778            // Don't know how to index this type.
    778779            skip_unknown_mimetype(file, mimetype);
    main(int argc, char **argv)  
    11611162    // RPM packages:
    11621163    mime_map["rpm"] = "application/x-redhat-package-manager";
    11631164
    1164         // Atom feeds:
    1165         mime_map["atom"] = "application/atom+xml";
     1165    // Atom feeds:
     1166    mime_map["atom"] = "application/atom+xml";
    11661167
    11671168    // Extensions to quietly ignore:
    11681169    mime_map["a"] = "ignore";