Ticket #595: atomparse.diff
File atomparse.diff, 6.0 KB (added by , 13 years ago) |
---|
-
xapian-applications/omega/Makefile.am
diff --git a/xapian-applications/omega/Makefile.am b/xapian-applications/omega/Makefile.am index 48aaf97..741c83d 100644
a b noinst_HEADERS = omega.h query.h cgiparam.h\ 97 97 md5.h md5wrap.h xmlparse.h metaxmlparse.h values.h utf8convert.h\ 98 98 namedentities.h pkglibbindir.h datematchdecider.h sample.h strcasecmp.h\ 99 99 utf8truncate.h diritor.h runfilter.h freemem.h xpsxmlparse.h transform.h\ 100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h 100 weight.h svgparse.h tmpdir.h urldecode.h urlencode.h unixperm.h atomparse.h 101 101 102 102 # headers maintained in xapian-core 103 103 noinst_HEADERS +=\ … … omindex_SOURCES = omindex.cc myhtmlparse.cc htmlparse.cc\ 138 138 common/getopt.cc commonhelp.cc utils.cc hashterm.cc loadfile.cc md5.cc\ 139 139 md5wrap.cc xmlparse.cc metaxmlparse.cc utf8convert.cc sample.cc diritor.cc\ 140 140 runfilter.cc freemem.cc common/msvc_dirent.cc xpsxmlparse.cc common/str.cc\ 141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc 141 pkglibbindir.cc svgparse.cc tmpdir.cc urlencode.cc atomparse.cc 142 142 if NEED_MKDTEMP 143 143 omindex_SOURCES += portability/mkdtemp.cc 144 144 endif -
new file xapian-applications/omega/atomparse.cc
diff --git a/xapian-applications/omega/atomparse.cc b/xapian-applications/omega/atomparse.cc new file mode 100644 index 0000000..6194016
- + 1 /** @file atomparse.cc 2 * @brief Extract text from an RSS atom file. 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #include <config.h> 22 23 #include "atomparse.h" 24 #include "stringutils.h" 25 26 using namespace std; 27 28 void 29 AtomParser::process_text(const string &text) 30 { 31 string * target = NULL; 32 switch (state) { 33 case TEXT: 34 target = &dump; 35 break; 36 case TITLE: 37 target = &title; 38 break; 39 case KEYWORDS: 40 target = &keywords; 41 break; 42 case AUTHOR: 43 target = &author; 44 break; 45 case OTHER: 46 // Ignore context in other places. 47 return; 48 } 49 if (!target->empty()) 50 *target += ' '; 51 *target += text; 52 } 53 54 bool 55 AtomParser::opening_tag(const string &tag) 56 { 57 if (state == OTHER) { 58 if (tag == "title") 59 state = in_entry ? KEYWORDS : TITLE; 60 else if (tag == "summary" || tag == "subtitle") 61 state = TEXT; 62 else if (tag == "author") 63 state = AUTHOR; 64 else if (tag == "category") 65 state = KEYWORDS; 66 else if (tag == "entry") 67 in_entry = true; 68 } 69 return true; 70 } 71 72 bool 73 AtomParser::closing_tag(const string &tag) 74 { 75 if (tag == "entry") 76 in_entry = false; 77 else if (tag == "title" || tag == "summary" || tag == "subtitle" || 78 tag == "author" || tag == "category") 79 state = OTHER; 80 return true; 81 } -
new file xapian-applications/omega/atomparse.h
diff --git a/xapian-applications/omega/atomparse.h b/xapian-applications/omega/atomparse.h new file mode 100644 index 0000000..342cb33
- + 1 /** @file atomparse.h 2 * @brief Extract text from an RSS atom file. 3 */ 4 /* Copyright (C) 2010,2011 Olly Betts 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #ifndef OMEGA_INCLUDED_ATOMPARSE_H 22 #define OMEGA_INCLUDED_ATOMPARSE_H 23 24 #include "htmlparse.h" 25 26 class AtomParser : public HtmlParser { 27 enum { OTHER, TITLE, AUTHOR, KEYWORDS, TEXT } state; 28 bool in_entry; 29 public: 30 AtomParser() : state(OTHER), in_entry(false) { } 31 void process_text(const string &text); 32 bool opening_tag(const string &tag); 33 bool closing_tag(const string &tag); 34 string title, keywords, dump, author; 35 }; 36 37 #endif // OMEGA_INCLUDED_ATOMPARSE_H -
xapian-applications/omega/omindex.cc
diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 7760941..bab3961 100644
a b 62 62 #include "values.h" 63 63 #include "xmlparse.h" 64 64 #include "xpsxmlparse.h" 65 #include "atomparse.h" 65 66 66 67 #include "gnu_getopt.h" 67 68 … … index_mimetype(const string & file, const string & url, const string & ext, 765 766 if (idx != string::npos) { 766 767 dump.assign(desc, idx + 1, string::npos); 767 768 } 769 } else if (mimetype == "application/atom+xml") { 770 AtomParser atomparser; 771 atomparser.parse_html(d.file_to_string()); 772 dump = atomparser.dump; 773 title = atomparser.title; 774 keywords = atomparser.keywords; 775 author = atomparser.author; 768 776 } else { 769 777 // Don't know how to index this type. 770 778 skip_unknown_mimetype(file, mimetype); … … main(int argc, char **argv) 1153 1161 // RPM packages: 1154 1162 mime_map["rpm"] = "application/x-redhat-package-manager"; 1155 1163 1164 // Atom feeds: 1165 mime_map["atom"] = "application/atom+xml"; 1166 1156 1167 // Extensions to quietly ignore: 1157 1168 mime_map["a"] = "ignore"; 1158 1169 mime_map["bin"] = "ignore";