From 79f368764295df109a37192f6182fb6f361d85b5 Mon Sep 17 00:00:00 2001
From: Adam Johnson <me@adamj.eu>
Date: Mon, 24 Jun 2024 15:30:59 +0200
Subject: [PATCH] [4.2.x] Fixed CVE-2024-38875 -- Mitigated potential DoS in
 urlize and urlizetrunc template filters.

Thank you to Elias Myllymäki for the report.

Co-authored-by: Sarah Boyce <42296566+sarahboyce@users.noreply.github.com>

CVE: CVE-2024-38875

Upstream-Status: Backport [https://github.com/django/django/commit/79f368764295df109a37192f6182fb6f361d85b5]

Signed-off-by: Soumya Sambu <soumya.sambu@windriver.com>
---
 django/utils/html.py           | 90 +++++++++++++++++++++++++---------
 tests/utils_tests/test_html.py |  7 +++
 2 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/django/utils/html.py b/django/utils/html.py
index 7a33d5f..f1b74ab 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -234,6 +234,15 @@ def smart_urlquote(url):

     return urlunsplit((scheme, netloc, path, query, fragment))

+class CountsDict(dict):  # dict that lazily memoizes word.count(key) per key.
+    def __init__(self, *args, word, **kwargs):
+        super().__init__(*args, **kwargs)  # Forward keywords as keywords.
+        self.word = word
+
+    def __missing__(self, key):
+        self[key] = self.word.count(key)  # Computed once, then cached.
+        return self[key]
+

 @keep_lazy_text
 def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
@@ -268,36 +277,69 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
         return text.replace('&amp;', '&').replace('&lt;', '<').replace(
             '&gt;', '>').replace('&quot;', '"').replace('&#39;', "'")

-    def trim_punctuation(lead, middle, trail):
+    def wrapping_punctuation_openings():
+        return "".join(dict(WRAPPING_PUNCTUATION).keys())
+
+    def trailing_punctuation_chars_no_semicolon():
+        return TRAILING_PUNCTUATION_CHARS.replace(";", "")
+
+    def trailing_punctuation_chars_has_semicolon():
+        return ";" in TRAILING_PUNCTUATION_CHARS
+
+    def trim_punctuation(word):
         """
         Trim trailing and wrapping punctuation from `middle`. Return the items
         of the new state.
         """
+        # Strip all opening wrapping punctuation.
+        middle = word.lstrip(wrapping_punctuation_openings())
+        lead = word[: len(word) - len(middle)]
+        trail = ""
+
         # Continue trimming until middle remains unchanged.
         trimmed_something = True
-        while trimmed_something:
+        counts = CountsDict(word=middle)  # Memoized str.count() lookups.
+        while trimmed_something and middle:
             trimmed_something = False
             # Trim wrapping punctuation.
             for opening, closing in WRAPPING_PUNCTUATION:
-                if middle.startswith(opening):
-                    middle = middle[len(opening):]
-                    lead += opening
-                    trimmed_something = True
-                # Keep parentheses at the end only if they're balanced.
-                if (middle.endswith(closing) and
-                        middle.count(closing) == middle.count(opening) + 1):
-                    middle = middle[:-len(closing)]
-                    trail = closing + trail
-                    trimmed_something = True
-            # Trim trailing punctuation (after trimming wrapping punctuation,
-            # as encoded entities contain ';'). Unescape entites to avoid
-            # breaking them by removing ';'.
-            middle_unescaped = unescape(middle)
-            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
-            if middle_unescaped != stripped:
-                trail = middle[len(stripped):] + trail
-                middle = middle[:len(stripped) - len(middle_unescaped)]
+                if counts[opening] < counts[closing]:
+                    rstripped = middle.rstrip(closing)
+                    if rstripped != middle:
+                        strip = counts[closing] - counts[opening]
+                        trail = middle[-strip:] + trail  # Prepend to earlier trail.
+                        middle = middle[:-strip]
+                        trimmed_something = True
+                        counts[closing] -= strip  # Keep cache in sync.
+
+            rstripped = middle.rstrip(trailing_punctuation_chars_no_semicolon())
+            if rstripped != middle:
+                trail = middle[len(rstripped) :] + trail
+                middle = rstripped
                 trimmed_something = True
+
+            if trailing_punctuation_chars_has_semicolon() and middle.endswith(";"):
+                # Only strip if not part of an HTML entity.
+                amp = middle.rfind("&")
+                if amp == -1:
+                    can_strip = True
+                else:
+                    potential_entity = middle[amp:]
+                    escaped = unescape(potential_entity)
+                    can_strip = (escaped == potential_entity) or escaped.endswith(";")
+
+                if can_strip:
+                    rstripped = middle.rstrip(";")
+                    amount_stripped = len(middle) - len(rstripped)
+                    if amp > -1 and amount_stripped > 1:
+                        # Leave a trailing semicolon as might be an entity.
+                        trail = middle[len(rstripped) + 1 :] + trail
+                        middle = rstripped + ";"
+                    else:
+                        trail = middle[len(rstripped) :] + trail
+                        middle = rstripped
+                    trimmed_something = True
+
         return lead, middle, trail

     def is_email_simple(value):
@@ -321,9 +363,7 @@ def urlize(text, trim_url_limit=None, no
             # lead: Current punctuation trimmed from the beginning of the word.
             # middle: Current state of the word.
             # trail: Current punctuation trimmed from the end of the word.
-            lead, middle, trail = '', word, ''
-            # Deal with punctuation.
-            lead, middle, trail = trim_punctuation(lead, middle, trail)
+            lead, middle, trail = trim_punctuation(word)

             # Make URL we want to point to.
             url = None
diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
index 5cc2d9b..715c1c6 100644
--- a/tests/utils_tests/test_html.py
+++ b/tests/utils_tests/test_html.py
@@ -267,6 +267,13 @@ class TestUtilsHtml(SimpleTestCase):
             'foo@.example.com',
             'foo@localhost',
             'foo@localhost.',
+            # trim_punctuation catastrophic tests
+            "(" * 100_000 + ":" + ")" * 100_000,
+            "(" * 100_000 + "&:" + ")" * 100_000,
+            "([" * 100_000 + ":" + "])" * 100_000,
+            "[(" * 100_000 + ":" + ")]" * 100_000,
+            "([[" * 100_000 + ":" + "]])" * 100_000,
+            "&:" + ";" * 100_000,
         )
         for value in tests:
             with self.subTest(value=value):
--
2.40.0