Ticket #595: atomparse.2.diff
File atomparse.2.diff, 7.2 KB (added by , 13 years ago) |
---|
-
xapian-applications/omega/Makefile.am
diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am index 48aaf97..741c83d 100644
a b noinst_HEADERS = omega.h query.h cgiparam.h\ 97 97 md5.h md5wrap.h xmlparse.h metaxmlparse.h values.h utf8convert.h\ 98 98 namedentities.h pkglibbindir.h datematchdecider.h sample.h strcasecmp.h\ 99 99 utf8truncate.h diritor.h runfilter.h freemem.h xpsxmlparse.h transform.h\ 100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h 100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h atomparse.h 101 101 102 102 # headers maintained in xapian-core 103 103 noinst_HEADERS +=\ … … omindex_SOURCES = omindex.cc myhtmlparse.cc htmlparse.cc\ 138 138 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc md5.cc\ 139 139 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\ 140 140 runfilter.cc freemem.cc common/msvc_dirent.cc xpsxmlparse.cc common/str.cc\ 141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc 141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc atomparse.cc 142 142 if NEED_MKDTEMP 143 143 omindex_SOURCES += portability/mkdtemp.cc 144 144 endif -
new file xapian-applications/omega/atomparse.cc
diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc new file mode 100644 index 0000000..d10c6f8
- + 1 /** @file atomparse.cc 2 * @brief Extract text from an RSS atom file. 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #include <config.h> 22 #include <iostream> 23 24 #include "atomparse.h" 25 #include "stringutils.h" 26 #include "myhtmlparse.h" 27 28 using namespace std; 29 30 void 31 AtomParser::process_text(const string &text) 32 { 33 if (is_escaped) 34 return; 35 36 string * target = NULL; 37 string text_copy = text; 38 39 switch (state) { 40 case TEXT: 41 target = &dump; 42 break; 43 case TITLE: 44 target = &title; 45 break; 46 case KEYWORDS: 47 target = &keywords; 48 break; 49 case AUTHOR: 50 target = &author; 51 break; 52 case OTHER: 53 // Ignore context in other places. 54 return; 55 } 56 57 if (type == "html") { 58 MyHtmlParser p; 59 p.parse_html(text_copy, "iso-8859-1", false); 60 text_copy = p.dump; 61 } 62 63 if (!target->empty()) 64 *target += ' '; 65 66 *target += text_copy; 67 } 68 69 bool 70 AtomParser::opening_tag(const string &tag) 71 { 72 if (state == OTHER) { 73 if (tag == "title") 74 state = in_entry ? KEYWORDS : TITLE; 75 else if (tag == "summary" || tag == "subtitle" || tag == "content") 76 state = TEXT; 77 else if (tag == "author") 78 state = AUTHOR; 79 else if (tag == "entry") 80 in_entry = true; 81 else if (tag == "category") { 82 //handle category term sepparatley 83 string new_keyword; 84 get_parameter("term", new_keyword); 85 keywords = keywords + ' ' + new_keyword; 86 } 87 } else if (state == AUTHOR) { 88 if (tag == "uri") 89 is_escaped = true; 90 } 91 92 get_parameter("type", type); 93 return true; 94 } 95 96 bool 97 AtomParser::closing_tag(const string &tag) 98 { 99 if (tag == "entry") 100 in_entry = false; 101 if (tag == "uri") 102 is_escaped = false; 103 104 else if (tag == "title" || tag == "summary" || tag == "subtitle" || 105 tag == "author" || tag == "content") 106 state = OTHER; 107 return true; 108 } -
new file xapian-applications/omega/atomparse.h
diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h new file mode 100644 index 0000000..690d8d1
- + 1 /** @file atomparse.h 2 * @brief Extract text from an RSS atom file. 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #ifndef OMEGA_INCLUDED_ATOMPARSE_H 22 #define OMEGA_INCLUDED_ATOMPARSE_H 23 24 #include "htmlparse.h" 25 26 class AtomParser : public HtmlParser { 27 enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state; 28 bool in_entry, is_escaped; 29 string type; 30 public: 31 AtomParser() : state(OTHER), in_entry(false), is_escaped(false) { } 32 void process_text(const string &text); 33 bool opening_tag(const string &tag); 34 bool closing_tag(const string &tag); 35 string title, keywords, dump, author; 36 }; 37 38 #endif // OMEGA_INCLUDED_ATOMPARSE_H -
xapian-applications/omega/omega.conf
diff --git a/xapian-applications/omega/omega.conf b/xapian-applications/omega/omega.conf index 1dff85d..796d50c 100644
a b 1 1 # Directory containing Xapian databases: 2 database_dir / var/lib/omega/data2 database_dir /home/mihai/sandbox/xapiandb/ 3 3 4 4 # Directory containing OmegaScript templates: 5 template_dir / var/lib/omega/templates5 template_dir /home/mihai/repos/xapian/xapian-applications/omega/templates 6 6 7 7 # Directory to write Omega logs to: 8 8 log_dir /var/log/omega -
xapian-applications/omega/omindex.cc
diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 7760941..bab3961 100644
a b 62 62 #include "values.h" 63 63 #include "xmlparse.h" 64 64 #include "xpsxmlparse.h" 65 #include "atomparse.h" 65 66 66 67 #include "gnu_getopt.h" 67 68 … … index_mimetype(const string & file, const string & url, const string & ext, 765 766 if (idx != string::npos) { 766 767 dump.assign(desc, idx + 1, string::npos); 767 768 } 769 } else if (mimetype == "application/atom+xml") { 770 AtomParser atomparser; 771 atomparser.parse_html(d.file_to_string()); 772 dump = atomparser.dump; 773 title = atomparser.title; 774 keywords = atomparser.keywords; 775 author = atomparser.author; 768 776 } else { 769 777 // Don't know how to index this type. 770 778 skip_unknown_mimetype(file, mimetype); … … main(int argc, char **argv) 1153 1161 // RPM packages: 1154 1162 mime_map["rpm"] = "application/x-redhat-package-manager"; 1155 1163 1164 // Atom feeds: 1165 mime_map["atom"] = "application/atom+xml"; 1166 1156 1167 // Extensions to quietly ignore: 1157 1168 mime_map["a"] = "ignore"; 1158 1169 mime_map["bin"] = "ignore";