From 8f62e6fadbc0399d9c059c29f5eeb41ce6196979 Mon Sep 17 00:00:00 2001 From: krcb197 <34693973+krcb197@users.noreply.github.com> Date: Mon, 24 Feb 2025 19:49:08 +0000 Subject: [PATCH 1/3] Escaping name and desc properties before converting to HMTL to avoid some unwanted behaviour. --- src/systemrdl/__about__.py | 2 +- src/systemrdl/core/rdlformatcode.py | 11 +++++++++-- test/rdl_src/rdlformatcode.rdl | 8 ++++++++ test/test_rdlformatcode.py | 17 +++++++++++++---- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/systemrdl/__about__.py b/src/systemrdl/__about__.py index 994b88b..b31beec 100644 --- a/src/systemrdl/__about__.py +++ b/src/systemrdl/__about__.py @@ -1 +1 @@ -__version__ = "1.29.3" +__version__ = "1.29.4" diff --git a/src/systemrdl/core/rdlformatcode.py b/src/systemrdl/core/rdlformatcode.py index 49438f2..30fe34e 100755 --- a/src/systemrdl/core/rdlformatcode.py +++ b/src/systemrdl/core/rdlformatcode.py @@ -1,5 +1,6 @@ import re from typing import TYPE_CHECKING, Optional +import html from . import helpers from ..node import Node, AddressableNode @@ -12,11 +13,17 @@ def rdlfc_to_html(text: str, node: Optional[Node]=None, md: Optional['Markdown'] Convert an RDLFormatCode string to HTML """ + # -------------------------------------------------------------------------- + # Escape any characters which may cause problems when HTML is interpreted + # -------------------------------------------------------------------------- + text = html.escape(text, quote=True) + # -------------------------------------------------------------------------- # Remove any common indentation # -------------------------------------------------------------------------- text = helpers.dedent_text(text) + # -------------------------------------------------------------------------- # Parse and replace RDLFormatCode Tags # -------------------------------------------------------------------------- @@ -160,9 +167,9 @@ def rdlfc_to_html(text: str, node: Optional[Node]=None, md: Optional['Markdown'] list_end_tag.pop() elif m.lastgroup == 'quote': - text_segs.append('"') + text_segs.append('"') elif m.lastgroup == 'xquote': - text_segs.append('"') + text_segs.append('"') elif m.lastgroup == 'br': text_segs.append("
") elif m.lastgroup == 'lb': diff --git a/test/rdl_src/rdlformatcode.rdl b/test/rdl_src/rdlformatcode.rdl index 109112f..203aee1 100644 --- a/test/rdl_src/rdlformatcode.rdl +++ b/test/rdl_src/rdlformatcode.rdl @@ -36,6 +36,11 @@ addrmap rdlformatcode { r20->desc = "[index]"; r21->desc = "[index_parent]"; + reg_t r22,r23,r24; + r22->desc = "string with a \"quote\" in it"; + r23->desc = "tag to be escaped

h1"; + r24->desc = "signal <"; + r1->name = "[b]asdf[/b]"; r2->name = "[i]asdf[/i]"; @@ -52,4 +57,7 @@ addrmap rdlformatcode { r16->name = ""; r17->name = " "; + r22->name = "string with a \"quote\" in it"; + r23->name = "tag to be escaped

h1"; + r24->name = "signal <"; }; diff --git a/test/test_rdlformatcode.py b/test/test_rdlformatcode.py index efc5c37..b6633c0 100644 --- a/test/test_rdlformatcode.py +++ b/test/test_rdlformatcode.py @@ -1,5 +1,6 @@ from unittest_utils import RDLSourceTestCase + class TestRDLFormatCode(RDLSourceTestCase): def test_desc_tags(self): @@ -11,7 +12,7 @@ def test_desc_tags(self): self.assertIs(root.top.get_html_desc(), None) html = [] - for i in range(0,22): + for i in range(0,25): reg = root.find_by_path("rdlformatcode.r%d" % i) html.append(reg.get_html_desc()) @@ -29,7 +30,7 @@ def p(s): self.assertEqual(html[8], p('asdf@example.com')) self.assertEqual(html[9], p('')) self.assertEqual(html[10], p('asdf')) - self.assertEqual(html[11], p('"asdf"')) + self.assertEqual(html[11], p('"asdf"')) self.assertEqual(html[12], p('
[] ')) self.assertEqual(html[13], p("r13")) self.assertEqual(html[14], p("r14")) @@ -49,6 +50,10 @@ def p(s): self.assertEqual(html[20], p("[index]")) self.assertEqual(html[21], p("[index_parent]")) + self.assertEqual(html[22], p("string with a "quote" in it")) + self.assertEqual(html[23], p("tag to be escaped <h1> h1")) + self.assertEqual(html[24], p("signal &lt")) + def test_name_tags(self): root = self.compile( @@ -59,7 +64,7 @@ def test_name_tags(self): self.assertIs(root.top.get_html_name(), None) html = [] - for i in range(0,20): + for i in range(0,25): reg = root.find_by_path("rdlformatcode.r%d" % i) html.append(reg.get_html_name()) @@ -72,9 +77,13 @@ def test_name_tags(self): self.assertEqual(html[7], 'asdf') self.assertEqual(html[8], 'asdf@example.com') self.assertEqual(html[10], 'asdf') - self.assertEqual(html[11], '"asdf"') + self.assertEqual(html[11], '"asdf"') self.assertEqual(html[12], '[] ') self.assertEqual(html[14], "r14") self.assertEqual(html[16], "") self.assertEqual(html[17], "") + + self.assertEqual(html[22], "string with a "quote" in it") + self.assertEqual(html[23], "tag to be escaped <h1> h1") + self.assertEqual(html[24], "signal &lt") From 8942ca939f75cd554502265c1d17d958429e4599 Mon Sep 17 00:00:00 2001 From: krcb197 <34693973+krcb197@users.noreply.github.com> Date: Mon, 24 Feb 2025 20:01:49 +0000 Subject: [PATCH 2/3] Fix a the test_enums.py which needed to be updated not the test is escaped. --- test/test_enums.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_enums.py b/test/test_enums.py index eb31156..0f27b43 100644 --- a/test/test_enums.py +++ b/test/test_enums.py @@ -30,7 +30,7 @@ def test_enums(self): self.assertIsNone(f_default_enum['four'].get_html_name()) self.assertIsNone(f_default_enum['four'].get_html_desc()) - self.assertEqual(f_default_enum['five'].get_html_name(), "five's name") + self.assertEqual(f_default_enum['five'].get_html_name(), "five's name") self.assertEqual(f_default_enum['five'].get_html_desc(), "

this is five

") f0 = root.find_by_path("enum_test1.reg2.f0") From 4b4c74edac14883b8a0c449a5790d9fbe7c65d11 Mon Sep 17 00:00:00 2001 From: krcb197 <34693973+krcb197@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:05:06 +0000 Subject: [PATCH 3/3] Addressed review feedback, made the HTML escaping behaviour an optional behaviour (off by default) --- src/systemrdl/core/rdlformatcode.py | 6 +- src/systemrdl/node.py | 21 +++- src/systemrdl/rdltypes/user_enum.py | 21 +++- test/test_enums.py | 3 +- test/test_rdlformatcode.py | 174 +++++++++++++++++----------- 5 files changed, 144 insertions(+), 81 deletions(-) diff --git a/src/systemrdl/core/rdlformatcode.py b/src/systemrdl/core/rdlformatcode.py index 30fe34e..717b362 100755 --- a/src/systemrdl/core/rdlformatcode.py +++ b/src/systemrdl/core/rdlformatcode.py @@ -8,7 +8,8 @@ if TYPE_CHECKING: from markdown import Markdown -def rdlfc_to_html(text: str, node: Optional[Node]=None, md: Optional['Markdown']=None, is_desc: bool=True) -> str: +def rdlfc_to_html(text: str, node: Optional[Node]=None, md: Optional['Markdown']=None, + is_desc: bool=True, escape_html: bool=False) -> str: """ Convert an RDLFormatCode string to HTML """ @@ -16,7 +17,8 @@ def rdlfc_to_html(text: str, node: Optional[Node]=None, md: Optional['Markdown'] # -------------------------------------------------------------------------- # Escape any characters which may cause problems when HTML is interpreted # -------------------------------------------------------------------------- - text = html.escape(text, quote=True) + if escape_html: + text = html.escape(text, quote=True) # -------------------------------------------------------------------------- # Remove any common indentation diff --git a/src/systemrdl/node.py b/src/systemrdl/node.py index 85baac9..fbadd5f 100644 --- a/src/systemrdl/node.py +++ b/src/systemrdl/node.py @@ -648,7 +648,7 @@ def get_rel_path(self, ref: 'Node', uplevel: str="^", hier_separator: str=".", a return hier_separator.join(self_segs_fmt) - def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[str]: + def get_html_desc(self, markdown_inst: Optional['Markdown']=None, escape_html: bool=False) -> Optional[str]: """ Translates the node's 'desc' property into HTML. @@ -665,6 +665,11 @@ def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[st Override the class instance of the Markdown processor. See the `Markdown module `_ for more details. + escape_html: + The desc property from the SystemRDL is passed through the python `html.escape` + function before processing. This can help avoid cases where plain text inadvertently + contains syntax that will result in undesirable behaviour when rendered. This option + should not be turned on if html tags are intentionally included in the `desc` property Returns ------- @@ -679,15 +684,23 @@ def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[st desc_str = self.get_property('desc') if desc_str is None: return None - return rdlformatcode.rdlfc_to_html(desc_str, self, md=markdown_inst) + return rdlformatcode.rdlfc_to_html(desc_str, self, md=markdown_inst, escape_html=escape_html) - def get_html_name(self) -> Optional[str]: + def get_html_name(self, escape_html: bool=False) -> Optional[str]: """ Translates the node's 'name' property into HTML. Any RDLFormatCode tags used are converted to HTML. + Parameters + ---------- + escape_html: + The desc property from the SystemRDL is passed through the python `html.escape` + function before processing. This can help avoid cases where plain text inadvertently + contains syntax that will result in undesirable behaviour when rendered. This option + should not be turned on if html tags are intentionally included in the `name` property + Returns ------- str or None @@ -700,7 +713,7 @@ def get_html_name(self) -> Optional[str]: name_str = self.get_property('name', default=None) if name_str is None: return None - return rdlformatcode.rdlfc_to_html(name_str, self, is_desc=False) + return rdlformatcode.rdlfc_to_html(name_str, self, is_desc=False, escape_html=escape_html) @property diff --git a/src/systemrdl/rdltypes/user_enum.py b/src/systemrdl/rdltypes/user_enum.py index 028e983..aa64f07 100644 --- a/src/systemrdl/rdltypes/user_enum.py +++ b/src/systemrdl/rdltypes/user_enum.py @@ -217,7 +217,7 @@ def rdl_desc(self) -> Optional[str]: """ return self._rdl_desc - def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[str]: + def get_html_desc(self, markdown_inst: Optional['Markdown']=None, escape_html: bool=False) -> Optional[str]: """ Translates the enum's 'desc' property into HTML. @@ -234,6 +234,11 @@ def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[st Override the class instance of the Markdown processor. See the `Markdown module `_ for more details. + escape_html: + The desc property from the SystemRDL is passed through the python `html.escape` + function before processing. This can help avoid cases where plain text inadvertently + contains syntax that will result in undesirable behaviour when rendered. This option + should not be turned on if html tags are intentionally included in the `desc` property Returns ------- @@ -248,14 +253,22 @@ def get_html_desc(self, markdown_inst: Optional['Markdown']=None) -> Optional[st desc_str = self._rdl_desc if desc_str is None: return None - return rdlformatcode.rdlfc_to_html(desc_str, md=markdown_inst) + return rdlformatcode.rdlfc_to_html(desc_str, md=markdown_inst, escape_html=escape_html) - def get_html_name(self) -> Optional[str]: + def get_html_name(self, escape_html: bool=False) -> Optional[str]: """ Translates the enum's 'name' property into HTML. Any RDLFormatCode tags used are converted to HTML. + Parameters + ---------- + escape_html: + The desc property from the SystemRDL is passed through the python `html.escape` + function before processing. This can help avoid cases where plain text inadvertently + contains syntax that will result in undesirable behaviour when rendered. This option + should not be turned on if html tags are intentionally included in the `name` property + Returns ------- str or None @@ -268,7 +281,7 @@ def get_html_name(self) -> Optional[str]: name_str = self._rdl_name if name_str is None: return None - return rdlformatcode.rdlfc_to_html(name_str, is_desc=False) + return rdlformatcode.rdlfc_to_html(name_str, is_desc=False, escape_html=escape_html) # Tell pickle how to reduce dynamically generated UserEnum classes diff --git a/test/test_enums.py b/test/test_enums.py index 0f27b43..1003797 100644 --- a/test/test_enums.py +++ b/test/test_enums.py @@ -30,7 +30,8 @@ def test_enums(self): self.assertIsNone(f_default_enum['four'].get_html_name()) self.assertIsNone(f_default_enum['four'].get_html_desc()) - self.assertEqual(f_default_enum['five'].get_html_name(), "five's name") + self.assertEqual(f_default_enum['five'].get_html_name(escape_html=False), "five's name") + self.assertEqual(f_default_enum['five'].get_html_name(escape_html=True),"five's name") self.assertEqual(f_default_enum['five'].get_html_desc(), "

this is five

") f0 = root.find_by_path("enum_test1.reg2.f0") diff --git a/test/test_rdlformatcode.py b/test/test_rdlformatcode.py index b6633c0..bf0861d 100644 --- a/test/test_rdlformatcode.py +++ b/test/test_rdlformatcode.py @@ -9,50 +9,67 @@ def test_desc_tags(self): "rdlformatcode" ) - self.assertIs(root.top.get_html_desc(), None) - - html = [] - for i in range(0,25): - reg = root.find_by_path("rdlformatcode.r%d" % i) - html.append(reg.get_html_desc()) - - def p(s): - return "

%s

" % s - - self.assertEqual(html[0], "

asdf

") - self.assertEqual(html[1], p("asdf")) - self.assertEqual(html[2], p("asdf")) - self.assertEqual(html[3], p("asdf")) - self.assertEqual(html[4], p('asdf')) - self.assertEqual(html[5], p('asdf')) - self.assertEqual(html[6], p('github.com')) - self.assertEqual(html[7], p('asdf')) - self.assertEqual(html[8], p('asdf@example.com')) - self.assertEqual(html[9], p('')) - self.assertEqual(html[10], p('asdf')) - self.assertEqual(html[11], p('"asdf"')) - self.assertEqual(html[12], p('
[] ')) - self.assertEqual(html[13], p("r13")) - self.assertEqual(html[14], p("r14")) - - r15 = root.find_by_path("rdlformatcode.r15[1]") - self.assertEqual(r15.get_html_desc(), p("[1]")) - - f = root.find_by_path("rdlformatcode.r15[2].f") - self.assertEqual(f.get_html_desc(), p("[2]")) - f = root.find_by_path("rdlformatcode.r15.f") - self.assertEqual(f.get_html_desc(), p("[0:2]")) - - self.assertEqual(html[16], "") - self.assertEqual(html[17], "") - self.assertEqual(html[18], "
  • a
  • b
  • c
") - self.assertEqual(html[19], '
  1. a
  2. b
  3. c
') - self.assertEqual(html[20], p("[index]")) - self.assertEqual(html[21], p("[index_parent]")) - - self.assertEqual(html[22], p("string with a "quote" in it")) - self.assertEqual(html[23], p("tag to be escaped <h1> h1")) - self.assertEqual(html[24], p("signal &lt")) + for escape_html in [True, False, None]: + with self.subTest(escape_html=escape_html): + + if escape_html is not None: + self.assertIs(root.top.get_html_desc(escape_html=escape_html), None) + else: + # escaping html needs to be off for the default behaviour so it is not + self.assertIs(root.top.get_html_desc(), None) + + html = [] + for i in range(0,25): + reg = root.find_by_path("rdlformatcode.r%d" % i) + if escape_html is not None: + html.append(reg.get_html_desc(escape_html=escape_html)) + else: + # escaping html needs to be off for the default behaviour so it is not + html.append(reg.get_html_desc()) + + + def p(s): + return "

%s

" % s + + self.assertEqual(html[0], "

asdf

") + self.assertEqual(html[1], p("asdf")) + self.assertEqual(html[2], p("asdf")) + self.assertEqual(html[3], p("asdf")) + self.assertEqual(html[4], p('asdf')) + self.assertEqual(html[5], p('asdf')) + self.assertEqual(html[6], p('github.com')) + self.assertEqual(html[7], p('asdf')) + self.assertEqual(html[8], p('asdf@example.com')) + self.assertEqual(html[9], p('')) + self.assertEqual(html[10], p('asdf')) + self.assertEqual(html[11], p('"asdf"')) + self.assertEqual(html[12], p('
[] ')) + self.assertEqual(html[13], p("r13")) + self.assertEqual(html[14], p("r14")) + + r15 = root.find_by_path("rdlformatcode.r15[1]") + self.assertEqual(r15.get_html_desc(), p("[1]")) + + f = root.find_by_path("rdlformatcode.r15[2].f") + self.assertEqual(f.get_html_desc(), p("[2]")) + f = root.find_by_path("rdlformatcode.r15.f") + self.assertEqual(f.get_html_desc(), p("[0:2]")) + + self.assertEqual(html[16], "") + self.assertEqual(html[17], "") + self.assertEqual(html[18], "
  • a
  • b
  • c
") + self.assertEqual(html[19], '
  1. a
  2. b
  3. c
') + self.assertEqual(html[20], p("[index]")) + self.assertEqual(html[21], p("[index_parent]")) + + if escape_html is True: + self.assertEqual(html[22], p("string with a "quote" in it")) + self.assertEqual(html[23], p("tag to be escaped <h1> h1")) + else: + self.assertEqual(html[22], p("string with a \"quote\" in it")) + self.assertEqual(html[23], p("tag to be escaped

h1")) + # the & character is escaped using the default Markdown processing anyway + self.assertEqual(html[24], p("signal &lt")) def test_name_tags(self): @@ -61,29 +78,46 @@ def test_name_tags(self): "rdlformatcode" ) - self.assertIs(root.top.get_html_name(), None) - - html = [] - for i in range(0,25): - reg = root.find_by_path("rdlformatcode.r%d" % i) - html.append(reg.get_html_name()) - - self.assertEqual(html[1], "asdf") - self.assertEqual(html[2], "asdf") - self.assertEqual(html[3], "asdf") - self.assertEqual(html[4], 'asdf') - self.assertEqual(html[5], 'asdf') - self.assertEqual(html[6], 'github.com') - self.assertEqual(html[7], 'asdf') - self.assertEqual(html[8], 'asdf@example.com') - self.assertEqual(html[10], 'asdf') - self.assertEqual(html[11], '"asdf"') - self.assertEqual(html[12], '[] ') - self.assertEqual(html[14], "r14") - - self.assertEqual(html[16], "") - self.assertEqual(html[17], "") - - self.assertEqual(html[22], "string with a "quote" in it") - self.assertEqual(html[23], "tag to be escaped <h1> h1") - self.assertEqual(html[24], "signal &lt") + for escape_html in [True, False, None]: + with self.subTest(escape_html=escape_html): + + if escape_html is not None: + self.assertIs(root.top.get_html_name(escape_html=escape_html), None) + else: + # escaping html needs to be off for the default behaviour so it is not + self.assertIs(root.top.get_html_name(), None) + + html = [] + for i in range(0,25): + reg = root.find_by_path("rdlformatcode.r%d" % i) + if escape_html is not None: + html.append(reg.get_html_name(escape_html=escape_html)) + else: + # escaping html needs to be off for the default behaviour so it is not + html.append(reg.get_html_name()) + + self.assertEqual(html[1], "asdf") + self.assertEqual(html[2], "asdf") + self.assertEqual(html[3], "asdf") + self.assertEqual(html[4], 'asdf') + self.assertEqual(html[5], 'asdf') + self.assertEqual(html[6], 'github.com') + self.assertEqual(html[7], 'asdf') + self.assertEqual(html[8], 'asdf@example.com') + self.assertEqual(html[10], 'asdf') + self.assertEqual(html[11], '"asdf"') + self.assertEqual(html[12], '[] ') + self.assertEqual(html[14], "r14") + + self.assertEqual(html[16], "") + self.assertEqual(html[17], "") + + if escape_html is True: + self.assertEqual(html[22], "string with a "quote" in it") + self.assertEqual(html[23], "tag to be escaped <h1> h1") + self.assertEqual(html[24], "signal &lt") + else: + self.assertEqual(html[22], "string with a \"quote\" in it") + self.assertEqual(html[23], "tag to be escaped

h1") + self.assertEqual(html[24], "signal <") +