Ticket #595: atomparse.additional.patch
File atomparse.additional.patch, 12.0 KB (added by , 13 years ago) |
---|
-
xapian-applications/omega/Makefile.am
diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am index 741c83d..01742ac 100644
a b dist_pkglibbin_SCRIPTS = outlookmsg2html 80 80 bin_PROGRAMS = omindex scriptindex 81 81 dist_bin_SCRIPTS = dbi2omega htdig2omega mbox2omega 82 82 83 check_PROGRAMS = htmlparsetest md5test urlenctest utf8converttest 84 TESTS = htmlparsetest$(EXEEXT)\ 83 check_PROGRAMS = atomparsetest htmlparsetest md5test urlenctest utf8converttest 84 TESTS = atomparsetest$(EXEEXT)\ 85 htmlparsetest$(EXEEXT)\ 85 86 md5test$(EXEEXT)\ 86 87 urlenctest$(EXEEXT)\ 87 88 utf8converttest$(EXEEXT) … … scriptindex_SOURCES = scriptindex.cc myhtmlparse.cc htmlparse.cc\ 149 150 common/safe.cc common/stringutils.cc utf8convert.cc utf8truncate.cc 150 151 scriptindex_LDADD = $(XAPIAN_LIBS) 151 152 153 atomparsetest_SOURCES = atomparsetest.cc atomparse.cc htmlparse.cc\ 154 myhtmlparse.cc utf8convert.cc 155 atomparsetest_LDADD = $(XAPIAN_LIBS) 156 152 157 htmlparsetest_SOURCES = htmlparsetest.cc myhtmlparse.cc htmlparse.cc\ 153 158 utf8convert.cc 154 159 htmlparsetest_LDADD = $(XAPIAN_LIBS) -
xapian-applications/omega/atomparse.cc
diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc index d10c6f8..2c286e5 100644
a b 1 1 /** @file atomparse.cc 2 2 * @brief Extract text from an RSS atom file. 3 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 4 /* Copyright (C) 2010,2011,2012 Olly Betts 5 * Copyright (C) 2012 Mihai Bivol 5 6 * 6 7 * This program is free software; you can redistribute it and/or modify 7 8 * it under the terms of the GNU General Public License as published by … … 19 20 */ 20 21 21 22 #include <config.h> 22 #include <iostream>23 23 24 24 #include "atomparse.h" 25 #include "stringutils.h"26 25 #include "myhtmlparse.h" 26 #include "stringutils.h" 27 27 28 28 using namespace std; 29 29 30 30 void 31 31 AtomParser::process_text(const string &text) 32 32 { 33 if (is_escaped)34 33 if (is_ignored) 34 return; 35 35 36 36 string * target = NULL; 37 string text_copy = text;38 37 39 38 switch (state) { 40 39 case TEXT: … … AtomParser::process_text(const string &text) 54 53 return; 55 54 } 56 55 57 if (type == "html") {58 MyHtmlParser p;59 p.parse_html(text_copy, "iso-8859-1", false);60 text_copy = p.dump;61 }62 63 56 if (!target->empty()) 64 57 *target += ' '; 65 58 66 *target += text_copy; 59 if (type == "html") { 60 MyHtmlParser p; 61 p.parse_html(text, "utf-8", true); 62 *target += p.dump; 63 } else { 64 *target += text; 65 } 67 66 } 68 67 69 68 bool 70 69 AtomParser::opening_tag(const string &tag) 71 70 { 72 if (state == OTHER) { 73 if (tag == "title") 74 state = in_entry ? KEYWORDS : TITLE; 75 else if (tag == "summary" || tag == "subtitle" || tag == "content") 76 state = TEXT; 77 else if (tag == "author") 78 state = AUTHOR; 79 else if (tag == "entry") 80 in_entry = true; 81 else if (tag == "category") { 82 //handle category term sepparatley 83 string new_keyword; 84 get_parameter("term", new_keyword); 85 keywords = keywords + ' ' + new_keyword; 86 } 87 } else if (state == AUTHOR) { 88 if (tag == "uri") 89 is_escaped = true; 71 if (state == OTHER) { 72 if (tag == "title") 73 state = in_entry ? KEYWORDS : TITLE; 74 else if (tag == "summary" || tag == "subtitle" || tag == "content") 75 state = TEXT; 76 else if (tag == "author") 77 state = AUTHOR; 78 else if (tag == "entry") 79 in_entry = true; 80 else if (tag == "category") { 81 // Handle category term separately. 82 string new_keyword; 83 get_parameter("term", new_keyword); 84 if (!keywords.empty()) 85 keywords += ' '; 86 keywords += new_keyword; 90 87 } 88 } else if (state == AUTHOR) { 89 if (tag == "uri") 90 is_ignored = true; 91 } 91 92 92 get_parameter("type", type); 93 if (!get_parameter("type", type)) 94 type = "text"; 93 95 return true; 94 96 } 95 97 96 98 bool 97 99 AtomParser::closing_tag(const string &tag) 98 100 { 99 if (tag == "entry") 100 in_entry = false; 101 if (tag == "uri") 102 is_escaped = false; 103 104 else if (tag == "title" || tag == "summary" || tag == "subtitle" || 105 tag == "author" || tag == "content") 106 state = OTHER; 101 if (tag == "entry") 102 in_entry = false; 103 else if (tag == "uri") 104 is_ignored = false; 105 else if (tag == "title" || tag == "summary" || tag == "subtitle" || 106 tag == "author" || tag == "content") 107 state = OTHER; 107 108 return true; 108 109 } -
xapian-applications/omega/atomparse.h
diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h index 690d8d1..32a8f1c 100644
a b 1 1 /** @file atomparse.h 2 2 * @brief Extract text from an RSS atom file. 3 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 4 /* Copyright (C) 2010,2011,2012 Olly Betts 5 * Copyright (C) 2012 Mihai Bivol 5 6 * 6 7 * This program is free software; you can redistribute it and/or modify 7 8 * it under the terms of the GNU General Public License as published by … … 25 26 26 27 class AtomParser : public HtmlParser { 27 28 enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state; 28 bool in_entry, is_escaped;29 29 bool in_entry, is_ignored; 30 string type; 30 31 public: 31 AtomParser() : state(OTHER), in_entry(false), is_ escaped(false) { }32 AtomParser() : state(OTHER), in_entry(false), is_ignored(false) { } 32 33 void process_text(const string &text); 33 34 bool opening_tag(const string &tag); 34 35 bool closing_tag(const string &tag); -
new file xapian-applications/omega/atomparsetest.cc
diff --git a/xapian-applications/omega/atomparsetest.cc b/xapian-applications/omega/atomparsetest.cc new file mode 100644 index 0000000..cf8e07d
- + 1 /* atomparsetest.cc: test the AtomParser class 2 * 3 * Copyright (C) 2006,2008,2011,2012 Olly Betts 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation; either version 2 of the 8 * License, or (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 18 * USA 19 */ 20 21 #include <config.h> 22 23 #include <cstdlib> 24 #include <iostream> 25 #include <string> 26 27 #include "atomparse.h" 28 29 using namespace std; 30 31 struct testcase { 32 const char * html; 33 const char * dump; 34 const char * title; 35 const char * keywords; 36 const char * author; 37 }; 38 39 static const testcase tests[] = { 40 { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" 41 "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n" 42 "<title type=\"text\">Hydrogen</title>\n" 43 "<subtitle type=\"html\"><b>Subtitle<b></subtitle>\n" 44 "<author><name>Mr X</name><uri>http://example.org/x.atom</uri><email>x@example.org</email></author>\n" 45 "<entry>\n" 46 "<title><Post></title><category term=\"a\" /><category term=\"b\" /></entry>\n" 47 "<content type=\"text\">Lorem ipsum</content>\n" 48 "</entry>\n" 49 "</feed>\n", 50 "Subtitle Lorem ipsum", 51 "Hydrogen", 52 "<Post> a b", 53 "Mr X x@example.org" }, 54 { "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" 55 "<feed xmlns=\"http://www.w3.org/2005/Atom\" xml:lang=\"en\">\n" 56 "<title type=\"html\"><meta charset=\"iso-8859-1\">Helium</title>\n" 57 "<entry>\n" 58 "<category term=\"x\" />\n" 59 "</entry>\n" 60 "</feed>\n", 61 "", 62 "Helium", 63 "x", 64 "" }, 65 { 0, 0, 0, 0, 0 } 66 }; 67 68 int 69 main() 70 { 71 for (size_t i = 0; tests[i].html; ++i) { 72 AtomParser p; 73 p.parse_html(tests[i].html); 74 if (tests[i].dump != p.dump) { 75 cout << "DUMP " << i << ": [" << p.dump << "] != [" << tests[i].dump << "]" << endl; 76 exit(1); 77 } 78 if (tests[i].title != p.title) { 79 cout << "TITLE " << i << ": [" << p.title << "] != [" << tests[i].title << "]" << endl; 80 exit(1); 81 } 82 if (tests[i].keywords != p.keywords) { 83 cout << "KEYWORDS " << i << ": [" << p.keywords << "] != [" << tests[i].keywords << "]" << endl; 84 exit(1); 85 } 86 if (tests[i].author != p.author) { 87 cout << "AUTHOR " << i << ": [" << p.author << "] != [" << tests[i].author << "]" << endl; 88 exit(1); 89 } 90 } 91 } -
xapian-applications/omega/docs/overview.rst
diff --git a/xapian-applications/omega/docs/overview.rst b/xapian-applications/omega/docs/overview.rst index e5e04ef..2a4db4a 100644
a b site. (Note that the ``--depth-limit`` option may come in handy if you have 175 175 sites '/products' and '/products/large', or similar.) 176 176 177 177 omindex has built-in support for indexing HTML, PHP, text files, CSV 178 (Comma-Separated Values) files, and AbiWord documents. It can also index a179 number of other formats using external programs. Filter programs are run with 180 CPU, time and memory limits to prevent a runaway filter from blocking indexing181 of other files.178 (Comma-Separated Values) files, Atom feeds, and AbiWord documents. It can also 179 index a number of other formats using external programs. Filter programs are 180 run with CPU, time and memory limits to prevent a runaway filter from blocking 181 indexing of other files. 182 182 183 183 The way omindex decides how to index a file is based around MIME content-types. 184 184 First of all omindex will look up a file's extension in its extension to MIME … … other filters too - see below): 221 221 * XPS files (.xps) if unzip is available 222 222 * Debian packages (.deb, .udeb) if dpkg-deb is available 223 223 * RPM packages (.rpm) if rpm is available 224 * Atom feeds (.atom) 224 225 225 226 If you have additional extensions that represent one of these types, you can 226 227 add an additional MIME mapping using the ``--mime-type`` option. For … … string, but to be useful there either needs to be a filter set for that type 238 239 - text/plain 239 240 - text/rtf 240 241 - text/x-perl 242 - application/atom+xml 241 243 - application/msword 242 244 - application/pdf 243 245 - application/postscript -
xapian-applications/omega/omindex.cc
diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index bab3961..aa92ae6 100644
a b 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012 Olly Betts 7 7 * Copyright 2009 Frank J Bruzzaniti 8 * Copyright 2012 Mihai Bivol 8 9 * 9 10 * This program is free software; you can redistribute it and/or 10 11 * modify it under the terms of the GNU General Public License as … … 43 44 44 45 #include <xapian.h> 45 46 47 #include "atomparse.h" 46 48 #include "commonhelp.h" 47 49 #include "diritor.h" 48 50 #include "hashterm.h" … … 62 64 #include "values.h" 63 65 #include "xmlparse.h" 64 66 #include "xpsxmlparse.h" 65 #include "atomparse.h"66 67 67 68 #include "gnu_getopt.h" 68 69 … … index_mimetype(const string & file, const string & url, const string & ext, 767 768 dump.assign(desc, idx + 1, string::npos); 768 769 } 769 770 } else if (mimetype == "application/atom+xml") { 770 771 772 773 774 775 771 AtomParser atomparser; 772 atomparser.parse_html(d.file_to_string()); 773 dump = atomparser.dump; 774 title = atomparser.title; 775 keywords = atomparser.keywords; 776 author = atomparser.author; 776 777 } else { 777 778 // Don't know how to index this type. 778 779 skip_unknown_mimetype(file, mimetype); … … main(int argc, char **argv) 1161 1162 // RPM packages: 1162 1163 mime_map["rpm"] = "application/x-redhat-package-manager"; 1163 1164 1164 1165 1165 // Atom feeds: 1166 mime_map["atom"] = "application/atom+xml"; 1166 1167 1167 1168 // Extensions to quietly ignore: 1168 1169 mime_map["a"] = "ignore";