[Git][ghc/ghc][wip/andreask/escape_json] Profiling: Properly escape characters when using `-pj`.

Mon Sep 11 13:42:07 UTC 2023


Andreas Klebinger pushed to branch wip/andreask/escape_json at Glasgow Haskell Compiler / GHC


Commits:
c25a5b39 by Andreas Klebinger at 2023-09-11T15:41:36+02:00
Profiling: Properly escape characters when using `-pj`.

Unusual characters such as quotes or control characters can make it into
cost centre names, so escape them properly in the JSON output.

Fixes #23924

- - - - -


1 changed file:

- rts/ProfilerReportJson.c


Changes:

=====================================
rts/ProfilerReportJson.c
=====================================
@@ -17,36 +17,178 @@
 
 #include <string.h>
 
-// I don't think this code is all that perf critical.
-// So we just allocate a new buffer each time around.
+// Size of the escaped form of str, including the terminating zero byte.
+static size_t escaped_size(char const* str)
+{
+    size_t escaped_size = 0;
+    for (; *str != '\0'; str++) {
+        const unsigned char c = *str;
+        switch (c)
+            {
+                // quotation mark (0x22)
+                case '"':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                case '\\':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                // backspace (0x08)
+                case '\b':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                // formfeed (0x0c)
+                case '\f':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                // newline (0x0a)
+                case '\n':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                // carriage return (0x0d)
+                case '\r':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                // horizontal tab (0x09)
+                case '\t':
+                {
+                    escaped_size += 2;
+                    break;
+                }
+
+                default:
+                {
+                    if (c <= 0x1f)
+                    {
+                        // print character c as \uxxxx
+                        escaped_size += 6;
+                    }
+                    else
+                    {
+                        escaped_size ++;
+                    }
+                    break;
+                }
+            }
+    }
+    escaped_size++; // null byte
+
+    return escaped_size;
+}
+
 static void escapeString(char const* str, char **buf)
 {
     char *out;
-    size_t req_size; //Max required size for decoding.
-    size_t in_size;  //Input size, including zero.
-
-    in_size = strlen(str) + 1;
-    // The strings are generally small and short
-    // lived so should be ok to just double the size.
-    req_size = in_size * 2;
-    out = stgMallocBytes(req_size, "writeCCSReportJson");
-    *buf = out;
-    // We provide an outputbuffer twice the size of the input,
-    // and at worse double the output size. So we can skip
-    // length checks.
+    size_t out_size; // Exact size of the escaped output, including the zero byte.
+    size_t pos = 0;
+
+    out_size = escaped_size(str); //includes trailing zero byte
+    out = stgMallocBytes(out_size, "writeCCSReportJson");
     for (; *str != '\0'; str++) {
-        char c = *str;
-        if (c == '\\') {
-            *out = '\\'; out++;
-            *out = '\\'; out++;
-        } else if (c == '\n') {
-            *out = '\\'; out++;
-            *out = 'n';  out++;
-        } else {
-            *out = c; out++;
-        }
+        const unsigned char c = *str;
+        switch (c)
+            {
+                // quotation mark (0x22)
+                case '"':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = '"';
+                    pos += 2;
+                    break;
+                }
+
+                // reverse solidus (0x5c)
+                case '\\':
+                {
+                    out[pos] = '\\';
+                    out[pos+1] = '\\';
+                    pos += 2;
+                    break;
+                }
+
+                // backspace (0x08)
+                case '\b':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = 'b';
+                    pos += 2;
+                    break;
+                }
+
+                // formfeed (0x0c)
+                case '\f':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = 'f';
+                    pos += 2;
+                    break;
+                }
+
+                // newline (0x0a)
+                case '\n':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = 'n';
+                    pos += 2;
+                    break;
+                }
+
+                // carriage return (0x0d)
+                case '\r':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = 'r';
+                    pos += 2;
+                    break;
+                }
+
+                // horizontal tab (0x09)
+                case '\t':
+                {
+                    out[pos] = '\\';
+                    out[pos + 1] = 't';
+                    pos += 2;
+                    break;
+                }
+
+                default:
+                {
+                    if (c <= 0x1f)
+                    {
+                        // print character c as \uxxxx
+                        out[pos] = '\\';
+                        sprintf(&out[pos + 1], "u%04x", (int)c);
+                        pos += 6;
+                    }
+                    else
+                    {
+                        // all other characters are added as-is
+                        out[pos++] = c;
+                    }
+                    break;
+                }
+            }
     }
-    *out = '\0';
+    out[pos++] = '\0';
+    assert(pos == out_size);
+    *buf = out;
 }
 
 static void



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/c25a5b399f773789cefe2275581a24dfd977d36b

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/c25a5b399f773789cefe2275581a24dfd977d36b
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20230911/258ed308/attachment-0001.html>