diff -udr xapian-omega-1.0.7/myhtmlparse.cc xapian-omega-1.0.7.patched/myhtmlparse.cc
|
old
|
new
|
|
| 22 | 22 | #include <config.h> |
| 23 | 23 | |
| 24 | 24 | #include "myhtmlparse.h" |
| | 25 | #include "utf8convert.h" |
| | 26 | |
| 25 | 27 | |
| 26 | 28 | #include <ctype.h> |
| 27 | 29 | #include <string.h> |
| 28 | 30 | |
| | 31 | #include <iostream> |
| | 32 | |
| 29 | 33 | inline void |
| 30 | 34 | lowercase_string(string &str) |
| 31 | 35 | { |
| … |
… |
|
| 37 | 41 | void |
| 38 | 42 | MyHtmlParser::parse_html(const string &text) |
| 39 | 43 | { |
| 40 | | // Default HTML character set is latin 1, though not specifying one is |
| | 44 | // Default HTML character set is utf-8, though not specifying one is |
| 41 | 45 | // deprecated these days. |
| 42 | | charset = "ISO-8859-1"; |
| | 46 | // (only the first 128 positions, i.e. ASCII, are encoded the same in UTF-8 as in "ISO-8859-1") |
| | 47 | charset = "UTF-8"; |
| 43 | 48 | HtmlParser::parse_html(text); |
| 44 | 49 | } |
| 45 | 50 | |
| … |
… |
|
| 172 | 177 | ++end; |
| 173 | 178 | } |
| 174 | 179 | } |
| | 180 | string prev_charset = charset; |
| 175 | 181 | charset = value.substr(start, end - start); |
| | 182 | if (prev_charset!=charset) { |
| | 183 | // title and sample may already have been filled in |
| | 184 | // before the charset declaration was seen |
| | 185 | if ( |
| | 186 | prev_charset.size()==0 |
| | 187 | || |
| | 188 | strcasecmp(prev_charset.c_str(),"utf-8")==0 |
| | 189 | || |
| | 190 | strcasecmp(prev_charset.c_str(),"utf8")==0 |
| | 191 | ) { |
| | 192 | if (title.size()!=0) { |
| | 193 | convert_to_utf8(title,charset); |
| | 194 | } |
| | 195 | if (sample.size()!=0) { |
| | 196 | convert_to_utf8(sample,charset); |
| | 197 | } |
| | 198 | } else { |
| | 199 | if (title.size()!=0) { |
| | 200 | string title1 = title; |
| | 201 | convert_to_utf8(title1,charset); |
| | 202 | if (title1!=title) { |
| | 203 | title=string("[incorrect conversion]"); |
| | 204 | } |
| | 205 | } |
| | 206 | if (sample.size()!=0) { |
| | 207 | string sample1 = sample; |
| | 208 | convert_to_utf8(sample1,charset); |
| | 209 | if (sample1!=sample) { |
| | 210 | sample=string("[incorrect conversion]"); |
| | 211 | } |
| | 212 | } |
| | 213 | } |
| | 214 | } |
| 176 | 215 | } |
| 177 | 216 | } |
| 178 | 217 | } |