diff -udr xapian-omega-1.0.7/myhtmlparse.cc xapian-omega-1.0.7.patched/myhtmlparse.cc
old
|
new
|
|
22 | 22 | #include <config.h> |
23 | 23 | |
24 | 24 | #include "myhtmlparse.h" |
| 25 | #include "utf8convert.h" |
| 26 | |
25 | 27 | |
26 | 28 | #include <ctype.h> |
27 | 29 | #include <string.h> |
28 | 30 | |
| 31 | #include <iostream> |
| 32 | |
29 | 33 | inline void |
30 | 34 | lowercase_string(string &str) |
31 | 35 | { |
… |
… |
|
37 | 41 | void |
38 | 42 | MyHtmlParser::parse_html(const string &text) |
39 | 43 | { |
40 | | // Default HTML character set is latin 1, though not specifying one is |
| 44 | // Default HTML character set is utf-8, though not specifying one is |
41 | 45 | // deprecated these days. |
42 | | charset = "ISO-8859-1"; |
| 46 | // (only the ASCII range, bytes 0-127, is encoded identically in UTF-8 and "ISO-8859-1") |
| 47 | charset = "UTF-8"; |
43 | 48 | HtmlParser::parse_html(text); |
44 | 49 | } |
45 | 50 | |
… |
… |
|
172 | 177 | ++end; |
173 | 178 | } |
174 | 179 | } |
| 180 | string prev_charset = charset; |
175 | 181 | charset = value.substr(start, end - start); |
| 182 | if (prev_charset!=charset) { |
| 183 | // title and sample may already have been filled |
| 184 | // before the charset was set |
| 185 | if ( |
| 186 | prev_charset.size()==0 |
| 187 | || |
| 188 | strcasecmp(prev_charset.c_str(),"utf-8")==0 |
| 189 | || |
| 190 | strcasecmp(prev_charset.c_str(),"utf8")==0 |
| 191 | ) { |
| 192 | if (title.size()!=0) { |
| 193 | convert_to_utf8(title,charset); |
| 194 | } |
| 195 | if (sample.size()!=0) { |
| 196 | convert_to_utf8(sample,charset); |
| 197 | } |
| 198 | } else { |
| 199 | if (title.size()!=0) { |
| 200 | string title1 = title; |
| 201 | convert_to_utf8(title1,charset); |
| 202 | if (title1!=title) { |
| 203 | title=string("[incorrect conversion]"); |
| 204 | } |
| 205 | } |
| 206 | if (sample.size()!=0) { |
| 207 | string sample1 = sample; |
| 208 | convert_to_utf8(sample1,charset); |
| 209 | if (sample1!=sample) { |
| 210 | sample=string("[incorrect conversion]"); |
| 211 | } |
| 212 | } |
| 213 | } |
| 214 | } |
176 | 215 | } |
177 | 216 | } |
178 | 217 | } |