atomparse.additional.patch on Ticket #595 – Attachment – Xapian

xapian-applications/omega/Makefile.am

diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am
index 741c83d..01742ac 100644

                dist_pkglibbin_SCRIPTS = outlookmsg2html
 bin_PROGRAMS = omindex scriptindex
 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega
-check_PROGRAMS = htmlparsetest md5test urlenctest utf8converttest
-TESTS = htmlparsetest$(EXEEXT)\
+check_PROGRAMS = atomparsetest htmlparsetest md5test urlenctest utf8converttest
+TESTS = atomparsetest$(EXEEXT)\
+        htmlparsetest$(EXEEXT)\
         md5test$(EXEEXT)\
         urlenctest$(EXEEXT)\
         utf8converttest$(EXEEXT)
-…
+               scriptindex_SOURCES = scriptindex.cc myhtmlparse.cc htmlparse.cc\
  common/safe.cc common/stringutils.cc utf8convert.cc utf8truncate.cc
 scriptindex_LDADD = $(XAPIAN_LIBS)
+atomparsetest_SOURCES = atomparsetest.cc atomparse.cc htmlparse.cc\
+ myhtmlparse.cc utf8convert.cc
+atomparsetest_LDADD = $(XAPIAN_LIBS)
 htmlparsetest_SOURCES = htmlparsetest.cc myhtmlparse.cc htmlparse.cc\
  utf8convert.cc
 htmlparsetest_LDADD = $(XAPIAN_LIBS)

xapian-applications/omega/atomparse.cc

diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc
index d10c6f8..2c286e5 100644

-              a
 /** @file atomparse.cc
  * @brief Extract text from an RSS atom file.
  */
-/* Copyright (C) 2010,2011 Olly Betts
+/* Copyright (C) 2010,2011,2012 Olly Betts
+ * Copyright (C) 2012 Mihai Bivol
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
 …
  */
 #include <config.h>
-#include <iostream>
 #include "atomparse.h"
-#include "stringutils.h"
 #include "myhtmlparse.h"
+#include "stringutils.h"
 using namespace std;
 void
 AtomParser::process_text(const string &text)
+{
-        if (is_escaped)
-                return;
+    if (is_ignored)
+        return;
     string * target = NULL;
-        string text_copy = text;
     switch (state) {
         case TEXT:
-…
+               AtomParser::process_text(const string &text)
             return;
+    }
-        if (type == "html") {
-                MyHtmlParser p;
-                p.parse_html(text_copy, "iso-8859-1", false);
-                text_copy = p.dump;
+        }
     if (!target->empty())
-                *target += ' ';
+        *target += ' ';
-        *target += text_copy;
+    if (type == "html") {
+        MyHtmlParser p;
+        p.parse_html(text, "utf-8", true);
+        *target += p.dump;
+    } else {
+        *target += text;
+    }
+}
 bool
 AtomParser::opening_tag(const string &tag)
+{
-        if (state == OTHER) {
-                if (tag == "title")
-                        state = in_entry ? KEYWORDS : TITLE;
-                else if (tag == "summary" || tag == "subtitle" || tag == "content")
-                        state = TEXT;
-                else if (tag == "author")
-                        state = AUTHOR;
-                else if (tag == "entry")
-                        in_entry = true;
-                else if (tag == "category") {
-                        //handle category term sepparatley
-                        string new_keyword;
-                        get_parameter("term", new_keyword);
-                        keywords = keywords + ' ' + new_keyword;
-                }
-        } else if (state == AUTHOR) {
-                if (tag == "uri")
-                        is_escaped = true;
+    if (state == OTHER) {
+        if (tag == "title")
+            state = in_entry ? KEYWORDS : TITLE;
+        else if (tag == "summary" || tag == "subtitle" || tag == "content")
+            state = TEXT;
+        else if (tag == "author")
+            state = AUTHOR;
+        else if (tag == "entry")
+            in_entry = true;
+        else if (tag == "category") {
+            // Handle category term separately.
+            string new_keyword;
+            get_parameter("term", new_keyword);
+            if (!keywords.empty())
+                keywords += ' ';
+            keywords += new_keyword;
+        }
+    } else if (state == AUTHOR) {
+        if (tag == "uri")
+            is_ignored = true;
+    }
-        get_parameter("type", type);
+    if (!get_parameter("type", type))
+        type = "text";
     return true;
+}
 bool
 AtomParser::closing_tag(const string &tag)
+{
-        if (tag == "entry")
-                in_entry = false;
-        if (tag == "uri")
-                is_escaped = false;
-        else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
-                 tag == "author" || tag == "content")
-                state = OTHER;
+    if (tag == "entry")
+        in_entry = false;
+    else if (tag == "uri")
+        is_ignored = false;
+    else if (tag == "title" || tag == "summary" || tag == "subtitle" ||
+             tag == "author" || tag == "content")
+        state = OTHER;
     return true;
+}

xapian-applications/omega/atomparse.h

diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h
index 690d8d1..32a8f1c 100644

-              a
 /** @file atomparse.h
  * @brief Extract text from an RSS atom file.
  */
-/* Copyright (C) 2010,2011 Olly Betts
+/* Copyright (C) 2010,2011,2012 Olly Betts
+ * Copyright (C) 2012 Mihai Bivol
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
 …
 class AtomParser : public HtmlParser {
     enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state;
-        bool in_entry, is_escaped;
-        string type;
+    bool in_entry, is_ignored;
+    string type;
   public:
-    AtomParser() : state(OTHER), in_entry(false), is_escaped(false) { }
+    AtomParser() : state(OTHER), in_entry(false), is_ignored(false) { }
     void process_text(const string &text);
     bool opening_tag(const string &tag);
     bool closing_tag(const string &tag);

new file xapian-applications/omega/atomparsetest.cc

diff --git a/xapian-applications/omega/atomparsetest.cc b/xapian-applications/omega/atomparsetest.cc
new file mode 100644
index 0000000..cf8e07d

-              -
+/* atomparsetest.cc: test the AtomParser class
+ *
+ * Copyright (C) 2006,2008,2011,2012 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+#include <config.h>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include "atomparse.h"
+using namespace std;
+struct testcase {
+    const char * html;
+    const char * dump;
+    const char * title;
+    const char * keywords;
+    const char * author;
+};
+static const testcase tests[] = {
+    { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+      "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n"
+      "<title type=\"text\">Hydrogen</title>\n"
+      "<subtitle type=\"html\">&lt;b&gt;Subtitle&lt;b&gt;</subtitle>\n"
+      "<author><name>Mr X</name><uri>http://example.org/x.atom</uri><email>x@example.org</email></author>\n"
+      "<entry>\n"
+      "<title>&lt;Post&gt;</title><category term=\"a\" /><category term=\"b\" /></entry>\n"
+      "<content type=\"text\">Lorem ipsum</content>\n"
+      "</entry>\n"
+      "</feed>\n",
+      "Subtitle Lorem ipsum",
+      "Hydrogen",
+      "<Post> a b",
+      "Mr X x@example.org" },
+    { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+      "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n"
+      "<title type=\"html\">&lt;meta charset=\"iso-8859-1\"&gt;Helium</title>\n"
+      "<entry>\n"
+      "<category term=\"x\" />\n"
+      "</entry>\n"
+      "</feed>\n",
+      "",
+      "Helium",
+      "x",
+      "" },
+    { 0, 0, 0, 0, 0 }
+};
+int
+main()
+{
+    for (size_t i = 0; tests[i].html; ++i) {
+        AtomParser p;
+        p.parse_html(tests[i].html);
+        if (tests[i].dump != p.dump) {
+            cout << "DUMP " << i << ": [" << p.dump << "] != [" << tests[i].dump << "]" << endl;
+            exit(1);
+        }
+        if (tests[i].title != p.title) {
+            cout << "TITLE " << i << ": [" << p.title << "] != [" << tests[i].title << "]" << endl;
+            exit(1);
+        }
+        if (tests[i].keywords != p.keywords) {
+            cout << "KEYWORDS " << i << ": [" << p.keywords << "] != [" << tests[i].keywords << "]" << endl;
+            exit(1);
+        }
+        if (tests[i].author != p.author) {
+            cout << "AUTHOR " << i << ": [" << p.author << "] != [" << tests[i].author << "]" << endl;
+            exit(1);
+        }
+    }
+}

xapian-applications/omega/docs/overview.rst

diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst
index e5e04ef..2a4db4a 100644

                site. (Note that the ``--depth-limit`` option may come in handy if you have
 sites '/products' and '/products/large', or similar.)
 omindex has built-in support for indexing HTML, PHP, text files, CSV
-(Comma-Separated Values) files, and AbiWord documents.  It can also index a
-number of other formats using external programs.  Filter programs are run with
-CPU, time and memory limits to prevent a runaway filter from blocking indexing
-of other files.
+(Comma-Separated Values) files, Atom feeds, and AbiWord documents.  It can also
+index a number of other formats using external programs.  Filter programs are
+run with CPU, time and memory limits to prevent a runaway filter from blocking
+indexing of other files.
 The way omindex decides how to index a file is based around MIME content-types.
 First of all omindex will look up a file's extension in its extension to MIME
-…
+               other filters too - see below):
 * XPS files (.xps) if unzip is available
 * Debian packages (.deb, .udeb) if dpkg-deb is available
 * RPM packages (.rpm) if rpm is available
+* Atom feeds (.atom)
 If you have additional extensions that represent one of these types, you can
 add an additional MIME mapping using the ``--mime-type`` option.  For
-…
+               string, but to be useful there either needs to be a filter set for that type
    - text/plain
    - text/rtf
    - text/x-perl
+   - application/atom+xml
    - application/msword
    - application/pdf
    - application/postscript

xapian-applications/omega/omindex.cc

diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc
index bab3961..aa92ae6 100644

-              a
  * Copyright 2001,2002 Ananova Ltd
  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012 Olly Betts
  * Copyright 2009 Frank J Bruzzaniti
+ * Copyright 2012 Mihai Bivol
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
 …
 #include <xapian.h>
+#include "atomparse.h"
 #include "commonhelp.h"
 #include "diritor.h"
 #include "hashterm.h"
 …
 #include "values.h"
 #include "xmlparse.h"
 #include "xpsxmlparse.h"
-#include "atomparse.h"
 #include "gnu_getopt.h"
-…
+               index_mimetype(const string & file, const string & url, const string & ext,
                 dump.assign(desc, idx + 1, string::npos);
+            }
         } else if (mimetype == "application/atom+xml") {
-                AtomParser atomparser;
-                atomparser.parse_html(d.file_to_string());
-                dump = atomparser.dump;
-                title = atomparser.title;
-                keywords = atomparser.keywords;
-                author = atomparser.author;
+            AtomParser atomparser;
+            atomparser.parse_html(d.file_to_string());
+            dump = atomparser.dump;
+            title = atomparser.title;
+            keywords = atomparser.keywords;
+            author = atomparser.author;
         } else {
             // Don't know how to index this type.
             skip_unknown_mimetype(file, mimetype);
-…
+               main(int argc, char **argv)
     // RPM packages:
     mime_map["rpm"] = "application/x-redhat-package-manager";
-        // Atom feeds:
-        mime_map["atom"] = "application/atom+xml";
+    // Atom feeds:
+    mime_map["atom"] = "application/atom+xml";
     // Extensions to quietly ignore:
     mime_map["a"] = "ignore";

Context Navigation

Ticket #595: atomparse.additional.patch

xapian-applications/omega/Makefile.am

xapian-applications/omega/atomparse.cc

xapian-applications/omega/atomparse.h

new file xapian-applications/omega/atomparsetest.cc

xapian-applications/omega/docs/overview.rst

xapian-applications/omega/omindex.cc

Download in other formats: