Refactor to remove BREAK states in favor of lastchar memory - nihdoc

commit c75dd84b276e33b70613d251eb0a3b9782d8bc96 (patch)
parent 8c0cdf65f6a499d14ff254d64f7ce2c79b18e228
Author: Alex Karle <alex@alexkarle.com>
Date:   Sat, 18 Dec 2021 15:19:22 -0500

Refactor to remove BREAK states in favor of lastchar memory

I noticed all the block states had BREAK states, when in reality all we
really care about is whether both the last char and this char are '\n'
so we can determine if it's the end of the block.

Removing these break states also involved adding a "linestarted" bool
so that lists knew whether the separator in the list was a new item
or just a normal character.

The test case changed only in whitespace: some trailing spaces were trimmed
at the cost of adding newlines inside the <li> elements... win some, lose some!

Diffstat:
M blag.c  | 172 +++++++++++++++++++++++++++++--------------------------------------------------
M test/big.html  | 16 ++++++++++++----
A test/ulist.html  | 37 +++++++++++++++++++++++++++++++++++++
A test/ulist.txt  | 7 +++++++

4 files changed, 118 insertions(+), 114 deletions(-)
diff --git a/blag.c b/blag.c
@@ -6,6 +6,7 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
+#include <err.h>
 #include <unistd.h>
 
 enum Block {
@@ -13,45 +14,18 @@ enum Block {
     HEADER_PARSE,
     HEADER,
     PARAGRAPH,
-    PARAGRAPH_BREAK,
     CODE,
-    CODE_BREAK,
     QUOTE,
-    QUOTE_BREAK,
     ULIST,
     ULIST_START,
     ULIST_PARSE,
-    ULIST_BREAK,
     OLIST,
     OLIST_START,
     OLIST_PARSE,
-    OLIST_BREAK,
     LINK_URL_PARSE,
     LINK_DESC_PARSE,
 };
 
-void putesc(int c) {
-    switch (c) {
-        case '<':
-            printf("&lt;");
-            break;
-        case '>':
-            printf("&gt;");
-            break;
-        case '&':
-            printf("&amp;");
-            break;
-        case '"':
-            printf("&quot;");
-            break;
-        case '\'':
-            printf("&#39;");
-            break;
-        default:
-            putchar(c);
-    }
-}
-
 typedef struct State {
     enum Block in;
     enum Block in_link;
@@ -65,62 +39,56 @@ typedef struct State {
     int indent;
     int previndent;
     int listdepth;
+    int lastc;
+    int linestarted;
 } state;
 
-void closeblock(state *s) {
+void putesc(int c) {
+    switch (c) {
+        case '<': printf("&lt;"); break;
+        case '>': printf("&gt;"); break;
+        case '&': printf("&amp;"); break;
+        case '"': printf("&quot;"); break;
+        case '\'': printf("&#39;"); break;
+        default: putchar(c);
+    }
+}
+
+void handle_lf(state *s) {
     s->indent = 0;
+    s->linestarted = 0;
+
+    /* single line types (one lf to close) */
     if (s->in == HEADER) {
         s->in = NONE;
         printf("</h%d>", s->hlvl);
-    } else if (s->in == PARAGRAPH) {
-        s->in = PARAGRAPH_BREAK;
-    } else if (s->in == PARAGRAPH_BREAK) {
-        s->in = NONE;
-        printf("</p>\n");
-    } else if (s->in == CODE) {
-        s->in = CODE_BREAK;
-    } else if (s->in == CODE_BREAK) {
-        s->in = NONE;
-        printf("</pre></code>\n");
-    } else if (s->in == QUOTE) {
-        s->in = QUOTE_BREAK;
-    } else if (s->in == QUOTE_BREAK) {
-        s->in = NONE;
-        printf("</blockquote>\n");
-    } else if (s->in == ULIST) {
-        s->in = ULIST_BREAK;
-    } else if (s->in == ULIST_BREAK) {
-        s->in = NONE;
-        s->previndent = 0;
-        while (s->listdepth > 0) {
-            printf("\n</li>\n</ul>\n");
-            s->listdepth--;
+    }  else if (s->in == LINK_URL_PARSE || s->in == LINK_DESC_PARSE) {
+        errx(1, "newline detected while processing link");
+    }
+
+    /* multi-line types (two lf to close) */
+    if (s->lastc == '\n') {
+        switch (s->in) {
+            case PARAGRAPH: printf("</p>\n"); break;
+            case CODE:      printf("</pre></code>\n"); break;
+            case QUOTE:     printf("</blockquote>\n"); break;
+            case OLIST:
+            case ULIST:
+                s->previndent = 0;
+                while (s->listdepth > 0) {
+                    printf("\n</li>\n</%s>\n", s->in == OLIST ? "ol" : "ul");
+                    s->listdepth--;
+                }
+                break;
+            default:
+                break; /* no op */
         }
-    } else if (s->in == OLIST) {
-        s->in = OLIST_BREAK;
-    } else if (s->in == OLIST_BREAK) {
         s->in = NONE;
-        s->previndent = 0;
-        while (s->listdepth > 0) {
-            printf("\n</li>\n</ol>\n");
-            s->listdepth--;
-        }
-    } else {
-        /* keep in as is */
     }
 }
 
 int parse() {
-    /* Mini state machine (home grown spaghetti code)
-     *
-     * Key: global "line level" state in `in`, secondary mid-line states
-     * (inline code & links) use dedicated states. A newline triggers many of
-     * the line-level blocks to enter "BREAK" mode, where they can either
-     * continue or truly be broken on a second newline (PARAGRAPH, CODE,
-     * U/OLIST, etc). Several of the variable length tokens (like ordered
-     * list numbers, header level, etc) enter a "PARSE" mode where special
-     * action is taken until the parsing is done (usually on ' ')
-     */
+    /* Mini state machine (home grown spaghetti code) */
     int c;
     state s = {
         .in = NONE,
@@ -135,9 +103,13 @@ int parse() {
         .indent = 0,
         .previndent = 0,
         .listdepth = 0,
+        .lastc = 0,
+        .linestarted = 0,
     };
 
     while ((c = getchar()) != EOF) {
+        /* printf("\n>> c: %d lastc: %d started: %d indent: %d\n", c, s.lastc, s.linestarted, s.indent); */
+
         /* Handle Escapes before all else */
         if (s.escape) {
             if (s.in == NONE) {
@@ -154,14 +126,12 @@ int parse() {
             s.lnkbuf[s.lnkidx++] = c;
         }
 
-        /* first non-space char in a list continuation breaks break */
-        if (s.in == ULIST_BREAK && c != ' ' && c != '-' && c != '\n') {
-            s.in = ULIST;
+        /* Store the indentation */
+        if (!s.linestarted && c == ' ') {
+            s.indent++;
+            continue;  /* don't print leading indents */
         }
-        if (s.in == OLIST_BREAK && c != ' ' && (c < 48 || c > 58) && c != '\n') {
-            s.in = OLIST;
-        }
-        
+
         /* Handle unique state changes by char */
         switch (c) {
             case '\\':
@@ -198,14 +168,6 @@ int parse() {
                 } else if (s.in == OLIST_PARSE) {
                     printf("\n</li>\n<li>\n");
                     s.in = OLIST;
-                } else if (s.in == NONE) {
-                    s.indent++;
-                } else if (s.in == ULIST_BREAK) {
-                    s.indent++;
-                    putesc(c);
-                } else if (s.in == OLIST_BREAK) {
-                    s.indent++;
-                    putesc(c);
                 } else if (s.in_link == LINK_URL_PARSE) {
                     s.in_link = LINK_DESC_PARSE;
                     printf("\">");
@@ -251,21 +213,12 @@ int parse() {
                 }
                 break;
             case '\t':
-                if (s.in == NONE) {
-                    s.in = CODE;
-                    printf("<code><pre>\n");
-                } else if (s.in == CODE_BREAK) {
-                    s.in = CODE;
-                } else {
-                    putesc(c);
-                }
-                break;
             case '>':
                 if (s.in == NONE) {
-                    s.in = QUOTE;
-                    printf("<blockquote>\n");
-                } else if (s.in == QUOTE_BREAK) {
-                    s.in = QUOTE;
+                    s.in = c == '>' ? QUOTE : CODE;
+                    printf("%s\n", c == '>' ? "<blockquote>" : "<code><pre>");
+                } else if (s.lastc == '\n' && c == (s.in == CODE ? '\t' : '>')) {
+                    /* no op */
                 } else {
                     putesc(c);
                 }
@@ -302,8 +255,7 @@ int parse() {
                     s.in = ULIST_START;
                 } else if (s.in == ULIST_START || s.in == ULIST_PARSE) {
                     /* no op */
-                } else if (s.in == ULIST_BREAK) {
-                    /* printf("\n>> %d | %d \n", s.previndent, s.indent); */
+                } else if (!s.linestarted) {
                     if (s.previndent < s.indent) {
                         /* new sublist */
                         s.in = ULIST_START;
@@ -335,7 +287,7 @@ int parse() {
                     s.in = OLIST_START;
                 } else if (s.in == OLIST_START || s.in == OLIST_PARSE) {
                     /* no op */
-                } else if (s.in == OLIST_BREAK) {
+                } else if (!s.linestarted) {
                     if (s.previndent < s.indent) {
                         /* new sublist */
                         s.in = OLIST_START;
@@ -353,25 +305,25 @@ int parse() {
                 }
                 break;
             case '\n':
-                closeblock(&s);
-                if (s.in != ULIST_BREAK && s.in != OLIST_BREAK) {
-                    putesc(c);
-                }
+                handle_lf(&s);
+                putesc(c);
                 break;
             default:
                 if (s.in == NONE) {
                     /* nothing else was matched -> assume new <p> */
                     s.in = PARAGRAPH;
                     printf("<p>\n");
-                } else if (s.in == PARAGRAPH_BREAK) {
-                    /* We thought it might be the end, but it aint! */
-                    s.in = PARAGRAPH;
                 }
                 putesc(c);
                 break;
         }
+        s.lastc = c;
+        if (c != '\n') {
+            s.linestarted = 1;
+        }
     }
-    closeblock(&s);
+    /* pretend there's a final LF to close any blocks */
+    handle_lf(&s);
     return 0;
 }
 
diff --git a/test/big.html b/test/big.html
@@ -11,22 +11,27 @@ This is an example blag file!
 
 <ul>
 <li>
-We can have lists??  
+We can have lists??
+
 <ul>
 <li>
-And nested lists??  
+And nested lists??
+
 </li>
 <li>
-And <code>code</code> within lists?  
+And <code>code</code> within lists?
+
 </li>
 <li>
 but not <code>*bold*</code> within code
+
 </li>
 </ul>
 
 </li>
 <li>
 A tier1 item
+
 </li>
 </ul>
 
@@ -39,12 +44,15 @@ that wraps nicely
 <ol>
 <li>
 This is a numbered list
+
 </li>
 <li>
-Cool, huh?    
+Cool, huh?
+
 <ol>
 <li>
 yes!
+
 </li>
 </ol>
 
diff --git a/test/ulist.html b/test/ulist.html
@@ -0,0 +1,37 @@
+
+<ul>
+<li>
+item 1
+
+<ul>
+<li>
+subi
+
+</li>
+<li>
+subi 2
+
+</li>
+</ul>
+
+</li>
+<li>
+item 2
+
+</li>
+</ul>
+
+
+<ul>
+<li>
+it1
+
+<ul>
+<li>
+s1
+
+</li>
+</ul>
+
+</li>
+</ul>
diff --git a/test/ulist.txt b/test/ulist.txt
@@ -0,0 +1,7 @@
+- item 1
+  - subi
+  - subi 2
+- item 2
+
+- it1
+  - s1

	nihdoc WIP markup parser (txt -> html)
	git clone git://git.alexkarle.com/blag
	Log | Files | Refs | README | LICENSE

M blag.c  | 172 +++++++++++++++++++++++++++++--------------------------------------------------
M test/big.html  | 16 ++++++++++++----
A test/ulist.html  | 37 +++++++++++++++++++++++++++++++++++++
A test/ulist.txt  | 7 +++++++