Andrew Geissler | 7e0e3c0 | 2022-02-25 20:34:39 +0000 | [diff] [blame^] | 1 | From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001 |
| 2 | From: Nick Wellnhofer <wellnhofer@aevum.de> |
| 3 | Date: Tue, 18 May 2021 20:08:28 +0200 |
| 4 | Subject: [PATCH] Work around lxml API abuse |
| 5 | |
| 6 | Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted |
| 7 | parent pointers. This used to work with the old recursive code but the |
| 8 | non-recursive rewrite required parent pointers to be set correctly. |
| 9 | |
| 10 | Unfortunately, lxml relies on the old behavior and passes subtrees with |
| 11 | a corrupted structure. Fall back to a recursive function call if an |
| 12 | invalid parent pointer is detected. |
| 13 | |
| 14 | Fixes #255. |
| 15 | |
| 16 | Upstream-Status: Backport [85b1792e37b131e7a51af98a37f92472e8de5f3f] |
| 17 | --- |
| 18 | HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------ |
| 19 | xmlsave.c | 31 +++++++++++++++++++++---------- |
| 20 | 2 files changed, 49 insertions(+), 28 deletions(-) |
| 21 | |
| 22 | diff --git a/HTMLtree.c b/HTMLtree.c |
| 23 | index 24434d45..bdd639c7 100644 |
| 24 | --- a/HTMLtree.c |
| 25 | +++ b/HTMLtree.c |
| 26 | @@ -744,7 +744,7 @@ void |
| 27 | htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 28 | xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, |
| 29 | int format) { |
| 30 | - xmlNodePtr root; |
| 31 | + xmlNodePtr root, parent; |
| 32 | xmlAttrPtr attr; |
| 33 | const htmlElemDesc * info; |
| 34 | |
| 35 | @@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 36 | } |
| 37 | |
| 38 | root = cur; |
| 39 | + parent = cur->parent; |
| 40 | while (1) { |
| 41 | switch (cur->type) { |
| 42 | case XML_HTML_DOCUMENT_NODE: |
| 43 | @@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 44 | if (((xmlDocPtr) cur)->intSubset != NULL) { |
| 45 | htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); |
| 46 | } |
| 47 | - if (cur->children != NULL) { |
| 48 | + /* Always validate cur->parent when descending. */ |
| 49 | + if ((cur->parent == parent) && (cur->children != NULL)) { |
| 50 | + parent = cur; |
| 51 | cur = cur->children; |
| 52 | continue; |
| 53 | } |
| 54 | break; |
| 55 | |
| 56 | case XML_ELEMENT_NODE: |
| 57 | + /* |
| 58 | + * Some users like lxml are known to pass nodes with a corrupted |
| 59 | + * tree structure. Fall back to a recursive call to handle this |
| 60 | + * case. |
| 61 | + */ |
| 62 | + if ((cur->parent != parent) && (cur->children != NULL)) { |
| 63 | + htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); |
| 64 | + break; |
| 65 | + } |
| 66 | + |
| 67 | /* |
| 68 | * Get specific HTML info for that node. |
| 69 | */ |
| 70 | @@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 71 | (cur->name != NULL) && |
| 72 | (cur->name[0] != 'p')) /* p, pre, param */ |
| 73 | xmlOutputBufferWriteString(buf, "\n"); |
| 74 | + parent = cur; |
| 75 | cur = cur->children; |
| 76 | continue; |
| 77 | } |
| 78 | @@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 79 | (info != NULL) && (!info->isinline)) { |
| 80 | if ((cur->next->type != HTML_TEXT_NODE) && |
| 81 | (cur->next->type != HTML_ENTITY_REF_NODE) && |
| 82 | - (cur->parent != NULL) && |
| 83 | - (cur->parent->name != NULL) && |
| 84 | - (cur->parent->name[0] != 'p')) /* p, pre, param */ |
| 85 | + (parent != NULL) && |
| 86 | + (parent->name != NULL) && |
| 87 | + (parent->name[0] != 'p')) /* p, pre, param */ |
| 88 | xmlOutputBufferWriteString(buf, "\n"); |
| 89 | } |
| 90 | |
| 91 | @@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 92 | break; |
| 93 | if (((cur->name == (const xmlChar *)xmlStringText) || |
| 94 | (cur->name != (const xmlChar *)xmlStringTextNoenc)) && |
| 95 | - ((cur->parent == NULL) || |
| 96 | - ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && |
| 97 | - (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { |
| 98 | + ((parent == NULL) || |
| 99 | + ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && |
| 100 | + (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { |
| 101 | xmlChar *buffer; |
| 102 | |
| 103 | buffer = xmlEncodeEntitiesReentrant(doc, cur->content); |
| 104 | @@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 105 | break; |
| 106 | } |
| 107 | |
| 108 | - /* |
| 109 | - * The parent should never be NULL here but we want to handle |
| 110 | - * corrupted documents gracefully. |
| 111 | - */ |
| 112 | - if (cur->parent == NULL) |
| 113 | - return; |
| 114 | - cur = cur->parent; |
| 115 | + cur = parent; |
| 116 | + /* cur->parent was validated when descending. */ |
| 117 | + parent = cur->parent; |
| 118 | |
| 119 | if ((cur->type == XML_HTML_DOCUMENT_NODE) || |
| 120 | (cur->type == XML_DOCUMENT_NODE)) { |
| 121 | @@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 122 | (cur->next != NULL)) { |
| 123 | if ((cur->next->type != HTML_TEXT_NODE) && |
| 124 | (cur->next->type != HTML_ENTITY_REF_NODE) && |
| 125 | - (cur->parent != NULL) && |
| 126 | - (cur->parent->name != NULL) && |
| 127 | - (cur->parent->name[0] != 'p')) /* p, pre, param */ |
| 128 | + (parent != NULL) && |
| 129 | + (parent->name != NULL) && |
| 130 | + (parent->name[0] != 'p')) /* p, pre, param */ |
| 131 | xmlOutputBufferWriteString(buf, "\n"); |
| 132 | } |
| 133 | } |
| 134 | diff --git a/xmlsave.c b/xmlsave.c |
| 135 | index 61a40459..aedbd5e7 100644 |
| 136 | --- a/xmlsave.c |
| 137 | +++ b/xmlsave.c |
| 138 | @@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 139 | static void |
| 140 | xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 141 | int format = ctxt->format; |
| 142 | - xmlNodePtr tmp, root, unformattedNode = NULL; |
| 143 | + xmlNodePtr tmp, root, unformattedNode = NULL, parent; |
| 144 | xmlAttrPtr attr; |
| 145 | xmlChar *start, *end; |
| 146 | xmlOutputBufferPtr buf; |
| 147 | @@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 148 | buf = ctxt->buf; |
| 149 | |
| 150 | root = cur; |
| 151 | + parent = cur->parent; |
| 152 | while (1) { |
| 153 | switch (cur->type) { |
| 154 | case XML_DOCUMENT_NODE: |
| 155 | @@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 156 | break; |
| 157 | |
| 158 | case XML_DOCUMENT_FRAG_NODE: |
| 159 | - if (cur->children != NULL) { |
| 160 | + /* Always validate cur->parent when descending. */ |
| 161 | + if ((cur->parent == parent) && (cur->children != NULL)) { |
| 162 | + parent = cur; |
| 163 | cur = cur->children; |
| 164 | continue; |
| 165 | } |
| 166 | @@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 167 | break; |
| 168 | |
| 169 | case XML_ELEMENT_NODE: |
| 170 | - if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) |
| 171 | + /* |
| 172 | + * Some users like lxml are known to pass nodes with a corrupted |
| 173 | + * tree structure. Fall back to a recursive call to handle this |
| 174 | + * case. |
| 175 | + */ |
| 176 | + if ((cur->parent != parent) && (cur->children != NULL)) { |
| 177 | + xmlNodeDumpOutputInternal(ctxt, cur); |
| 178 | + break; |
| 179 | + } |
| 180 | + |
| 181 | + if ((ctxt->level > 0) && (ctxt->format == 1) && |
| 182 | + (xmlIndentTreeOutput)) |
| 183 | xmlOutputBufferWrite(buf, ctxt->indent_size * |
| 184 | (ctxt->level > ctxt->indent_nr ? |
| 185 | ctxt->indent_nr : ctxt->level), |
| 186 | @@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 187 | xmlOutputBufferWrite(buf, 1, ">"); |
| 188 | if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); |
| 189 | if (ctxt->level >= 0) ctxt->level++; |
| 190 | + parent = cur; |
| 191 | cur = cur->children; |
| 192 | continue; |
| 193 | } |
| 194 | @@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 195 | break; |
| 196 | } |
| 197 | |
| 198 | - /* |
| 199 | - * The parent should never be NULL here but we want to handle |
| 200 | - * corrupted documents gracefully. |
| 201 | - */ |
| 202 | - if (cur->parent == NULL) |
| 203 | - return; |
| 204 | - cur = cur->parent; |
| 205 | + cur = parent; |
| 206 | + /* cur->parent was validated when descending. */ |
| 207 | + parent = cur->parent; |
| 208 | |
| 209 | if (cur->type == XML_ELEMENT_NODE) { |
| 210 | if (ctxt->level > 0) ctxt->level--; |
| 211 | -- |
| 212 | 2.32.0 |
| 213 | |