diff options
| author | Aníbal Limón <anibal.limon@linux.intel.com> | 2014-11-05 12:10:27 -0600 |
|---|---|---|
| committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2014-11-06 16:45:23 +0000 |
| commit | 25e3e57c551297a9bcfe3b6a5d5c9d071774cce7 (patch) | |
| tree | 7b0d3d03e8eab4169012b97ff5eee60f77da8334 /bitbake/lib/bs4/tests/test_soup.py | |
| parent | bc6330cb7f288e76209410b0812aff1dbfa90950 (diff) | |
| download | poky-25e3e57c551297a9bcfe3b6a5d5c9d071774cce7.tar.gz | |
bitbake: bs4: Add beautifulsoup 4.3.2 to assist the fetcher
Added the Beautiful Soup module because the fetch/wget latest_versionstring
method depends on it.
This lets the fetch/wget.py module search for new package
versions on upstream sites.
(Bitbake rev: 4626c9b77e5eded97507b6f9ca0d891f9a54bb8a)
Signed-off-by: Aníbal Limón <anibal.limon@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake/lib/bs4/tests/test_soup.py')
| -rw-r--r-- | bitbake/lib/bs4/tests/test_soup.py | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/bitbake/lib/bs4/tests/test_soup.py b/bitbake/lib/bs4/tests/test_soup.py new file mode 100644 index 0000000000..47ac245f99 --- /dev/null +++ b/bitbake/lib/bs4/tests/test_soup.py | |||
| @@ -0,0 +1,434 @@ | |||
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | """Tests of Beautiful Soup as a whole.""" | ||
| 3 | |||
| 4 | import logging | ||
| 5 | import unittest | ||
| 6 | import sys | ||
| 7 | import tempfile | ||
| 8 | |||
| 9 | from bs4 import ( | ||
| 10 | BeautifulSoup, | ||
| 11 | BeautifulStoneSoup, | ||
| 12 | ) | ||
| 13 | from bs4.element import ( | ||
| 14 | CharsetMetaAttributeValue, | ||
| 15 | ContentMetaAttributeValue, | ||
| 16 | SoupStrainer, | ||
| 17 | NamespacedAttribute, | ||
| 18 | ) | ||
| 19 | import bs4.dammit | ||
| 20 | from bs4.dammit import ( | ||
| 21 | EntitySubstitution, | ||
| 22 | UnicodeDammit, | ||
| 23 | ) | ||
| 24 | from bs4.testing import ( | ||
| 25 | SoupTest, | ||
| 26 | skipIf, | ||
| 27 | ) | ||
| 28 | import warnings | ||
| 29 | |||
# The LXML tree builders may be unavailable; record whether importing them
# succeeded instead of letting the ImportError propagate.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    # The exception object is not needed. The "except ImportError, e" spelling
    # is Python 2-only syntax; this form parses on Python 2 and Python 3.
    LXML_PRESENT = False

# Interpreter-version flags consumed by the skipIf() guard on
# test_attribute_name_containing_unicode_characters.
PYTHON_2_PRE_2_7 = (sys.version_info < (2, 7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))
| 38 | |||
class TestConstructor(SoupTest):
    """Constructor checks for short and unusual Unicode markup."""

    def test_short_unicode_input(self):
        # A document holding only two non-ASCII characters keeps its text.
        heading = self.soup(u"<h1>éé</h1>").h1
        self.assertEqual(u"éé", heading.string)

    def test_embedded_null(self):
        # A NUL character in the middle of the text is expected to survive.
        heading = self.soup(u"<h1>foo\0bar</h1>").h1
        self.assertEqual(u"foo\0bar", heading.string)
| 50 | |||
| 51 | |||
class TestDeprecatedConstructorArguments(SoupTest):
    """Old camelCase constructor keywords are accepted but trigger a warning."""

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as caught:
            parsed = self.soup(
                "<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        # The first recorded warning has to mention both spellings.
        text = str(caught[0].message)
        for keyword in ("parseOnlyThese", "parse_only"):
            self.assertTrue(keyword in text)
        self.assertEqual(b"<b></b>", parsed.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as caught:
            encoded = b"\xc3\xa9"
            parsed = self.soup(encoded, fromEncoding="utf8")
        # The first recorded warning has to mention both spellings.
        text = str(caught[0].message)
        for keyword in ("fromEncoding", "from_encoding"):
            self.assertTrue(keyword in text)
        self.assertEqual("utf8", parsed.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # A keyword the constructor does not know must raise TypeError.
        unknown = {"no_such_argument": True}
        self.assertRaises(TypeError, self.soup, "<a>", **unknown)
| 74 | |||
class TestWarnings(SoupTest):
    """Warnings raised when the markup looks like a filename or a URL."""

    def test_disk_file_warning(self):
        # Leaving the with-block closes the temporary file, which is what the
        # second half of this test relies on.
        with tempfile.NamedTemporaryFile() as handle:
            path = handle.name
            with warnings.catch_warnings(record=True) as caught:
                self.soup(path)
            text = str(caught[0].message)
            self.assertTrue("looks like a filename" in text)

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as caught:
            self.soup(path)
        self.assertEqual(0, len(caught))

    def test_url_warning(self):
        # A bare URL passed as markup is reported...
        with warnings.catch_warnings(record=True) as caught:
            self.soup("http://www.crummy.com/")
        text = str(caught[0].message)
        self.assertTrue("looks like a URL" in text)

        # ...but a URL followed by other text is not.
        with warnings.catch_warnings(record=True) as caught:
            self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(caught))
| 102 | |||
class TestSelectiveParsing(SoupTest):
    """Parsing limited to part of a document through a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags, together with their contents, are expected in
        # the encoded result.
        only_b = SoupStrainer("b")
        tree = self.soup(
            "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>", parse_only=only_b)
        expected = b"<b>Yes</b><b>Yes <c>Yes</c></b>"
        self.assertEqual(tree.encode(), expected)
| 110 | |||
| 111 | |||
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class.

    Expected values are written with explicit entity references
    (``&amp;``, ``&lt;``, ``&quot;`` ...), because the point of each test is
    that the raw character has been replaced by its entity.
    """

    def setUp(self):
        # The substitution methods are called on the class itself.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         u"foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # With both quote styles present, the value is wrapped in double
        # quotes and the inner double quotes become &quot;.
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that already
        # starts an entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves the existing entity
        # alone and only escapes the bare ampersand.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
| 179 | |||
| 180 | |||
class TestEncodingConversion(SoupTest):
    """Decoding from, and encoding to, several encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        expected_bytes = b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        self.assertEqual(self.utf8_data, expected_bytes)

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        saved_detector = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def detect_nothing(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = detect_nothing
            ascii_markup = b"<foo>a</foo>"
            parsed = self.soup(ascii_markup)
            decoded = parsed.decode()
            self.assertTrue(isinstance(decoded, unicode))
            self.assertEqual(decoded, self.document_for(ascii_markup.decode()))
            self.assertEqual(parsed.original_encoding.lower(), "utf-8")
        finally:
            # Undo both global changes, whether or not an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_detector

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, u'Sacr\xe9 bleu!')
        self.assertEqual(parsed.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        parsed = self.soup(self.utf8_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, u'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        encoded = self.soup(self.unicode_data).encode('utf-8')
        self.assertEqual(encoded, self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # An attribute whose name is a non-ASCII character round-trips.
        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
        div = self.soup(markup).div
        self.assertEqual(div.encode("utf8"), markup.encode("utf8"))
| 240 | |||
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = u"I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # smart_quotes_to="xml" is expected to yield numeric character
        # references rather than the quote characters themselves.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        # smart_quotes_to="html" is expected to yield named HTML entities.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"\xc3\xa9"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.unicode_markup, u'\xe9')
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that does not fit the data is passed over.
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested encodings that are not real codec names are passed over.
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_detect_html5_style_meta_tag(self):
        # The same <meta charset> declaration in four quoting/spacing styles.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Undo both global changes, whether or not an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"

        # But if we run it through detwingle, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            # NOTE(review): as a Unicode literal this is four separate code
            # points (U+00F0, U+0090, U+0090, U+0093), not a single 4-byte
            # character -- confirm whether a real 4-byte case was intended.
            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            detwingled = UnicodeDammit.detwingle(encoded)
            self.assertEqual(detwingled, encoded)
| 393 | |||
class TestNamedspacedAttribute(SoupTest):
    """Equality checks for NamespacedAttribute."""

    def test_name_may_be_none(self):
        prefix_only = NamespacedAttribute("xmlns", None)
        self.assertEqual(prefix_only, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        qualified = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", qualified)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        twin = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, twin)

        # The actual namespace is not considered.
        without_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, without_namespace)

        # But name and prefix are important.
        for prefix, name in (("a", "z"), ("z", "b")):
            other = NamespacedAttribute(prefix, name, "c")
            self.assertNotEqual(reference, other)
| 419 | |||
| 420 | |||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that swap in a new charset on encode."""

    # This method used to share its name with the ContentMetaAttributeValue
    # test below. The later definition replaced it in the class body, so it
    # never ran. It now has a name of its own.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # Encoding is expected to give back the name of the target encoding.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Encoding is expected to replace only the charset part of the value.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
