DomFramework/src/Markdown.php

<?php

/** DomFramework
  * @package domframework
  * @author Dominique Fournier <dominique@fournier38.fr>
  * @license BSD
  */

namespace Domframework;

/** Convert the Markdown text to html format
  *
  * The conversion is done in two stages :
  * - html() escapes the text, applies the multi-line rules (SeText titles,
  *   separators) and splits the result in lines ;
  * - detectBlock() walks through the lines and delegates each group of lines
  *   to a type*() method (paragraph, list, code block, empty line), which
  *   uses searchReplace() for the inline syntax.
  */
class Markdown
{
    /** To debug the markdown analyzer, activate the option */
    public $debug = false;

    /** The list of the HTML elements used by block
      * (not read by any method of this class)
      */
    private $blockid = array("<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
                            "<hr/>");

    /** Convert the markdown text to html
      * @param string $markdown The markdown to convert
      * @return string The HTML code, without leading/trailing blanks
      */
    public function html($markdown)
    {
        $markdown = rtrim($markdown);
        // The analyzer works on "\n" separated lines : a remaining "\r" would
        // end in the titles and create paragraphs for the empty lines
        $markdown = str_replace(array("\r\n", "\r"), "\n", $markdown);
        // The flags are the PHP >= 8.1 defaults. They are forced because the
        // links are generated with single-quoted attributes : the single quote
        // must be escaped on the older PHP versions too
        $markdown = htmlentities(
            $markdown,
            ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401
        );

        // Here are the regexp on multilines
        $search = array();
        $replace = array();
        // Titles with underline (SeText)
        // Titre1
        // ======
        $search[] = "/(.+)\\n==+$/Um";
        $replace[] = "</p><h1>\\1</h1>\n<p>";
        // Titre2
        // ------
        $search[] = "/(.+)\\n--+$/Um";
        $replace[] = "</p><h2>\\1</h2>\n<p>";

        // SEPARATORS : *** --- ___ * * * - - - _ _ _
        // Must be placed before EMPHASIS
        $search[] = "/^[*_-] ?[*_-] ?[*_-]$/Um";
        $replace[] = "</p><hr/>\n<p>";

        $markdown = preg_replace($search, $replace, $markdown);

        $textArray = explode("\n", $markdown);
        $pos = 0;
        $html = $this->detectBlock($textArray, 0, $pos);
        // The titles and separators close and reopen the paragraph : remove
        // the empty paragraphs created by this way
        $html = str_replace("<p></p>", "", $html);
        $html = str_replace("<p> </p>", "", $html);
        return trim($html);
    }

    /** Search and replace in the paragraph on one line
      * Manage the separators, the emphasis/strong/code marks, the short and
      * ATX titles, the images and the links. A mark preceded by a backslash
      * is kept as literal text and the backslash is removed
      * @param string $line The line to analyze (already in HTML entities)
      * @return string The line converted in HTML
      */
    private function searchReplace($line)
    {
        if ($this->debug) {
            echo "CALL searchReplace ($line)\n";
        }
        // REMEMBER : THE $line is already in HTML ENTITIES !
        // Quotes : &quot;
        $res = $line;
        // Manage the <hr/> separators
        $search = array("***", "---", "___", "* * *", "- - -", "_ _ _");
        foreach ($search as $pattern) {
            $start = 0;
            while (1) {
                $start = strpos($res, $pattern, $start);
                if ($start === false) {
                    break;
                }
                // "> 0" : the backslash can be the first char of the line
                if ($start > 0 && $res[$start - 1] === "\\") {
                    // Escaped separator : remove the backslash. The pattern is
                    // now before $start and will not be found again
                    $res = substr($res, 0, $start - 1) . substr($res, $start);
                } else {
                    $res = substr($res, 0, $start) . "<hr/>" .
                        substr($res, $start + strlen($pattern));
                }
            }
        }

        // Manage the emphasis and code correctely with the backslash
        $search = array();
        $replace = array();
        $search[] = "__";
        $replace[] = "<strong>\\1</strong>";
        $search[] = "_";
        $replace[] = "<em>\\1</em>";
        $search[] = "**";
        $replace[] = "<strong>\\1</strong>";
        $search[] = "*";
        $replace[] = "<em>\\1</em>";
        $search[] = "`";
        $replace[] = "<code>\\1</code>";

        foreach ($search as $key => $pattern) {
            $len = strlen($pattern);
            // Split the template around the \1 placeholder
            $mark = strpos($replace[$key], "\\1");
            $first = substr($replace[$key], 0, $mark);
            $second = substr($replace[$key], $mark + 2);
            $start = 0;
            while (1) {
                $start = strpos($res, $pattern, $start);
                if ($start === false) {
                    break;
                }
                $end = strpos($res, $pattern, $start + $len);
                if ($end === false) {
                    // No closing mark : nothing more to do for this pattern
                    break;
                }
                // Can only be true for the one-char patterns : the mark is
                // doubled (by example a "**" without closing "**")
                if ($res[$start + 1] === $pattern) {
                    // Pattern too long, not this test : skip it
                    $start += $len + strspn($res, $pattern, $start + 1);
                    continue;
                }
                // "> 0" : the backslash can be the first char of the line
                if ($start > 0 && $res[$start - 1] === "\\") {
                    // Escaped mark : remove the backslash and skip the ending
                    // pattern too. The string is one byte shorter, so the
                    // ending pattern moved : without this shift, the next
                    // strpos offset can be after the end of the string
                    // (ValueError in PHP 8)
                    $res = substr($res, 0, $start - 1) . substr($res, $start);
                    $end--;
                } else {
                    // It is the real pattern found, without backslash. Replace by the
                    // $replace value
                    $content = substr($res, $start + $len, $end - $start - $len);
                    if (trim($content) !== "") {
                        $res = substr($res, 0, $start) . $first . $content . $second .
                            substr($res, $end + $len);
                    }
                }
                // After a replacement, this offset is inside the inserted HTML.
                // It is harmless : neither the tags nor the content contain
                // the pattern
                $start = $end + $len;
            }
        }

        // Manage the others cases
        $search = array();
        $replace = array();
        // Titles short
        // == TITRE1
        $search[] = '~^([^\\\\]|^)(==+ (.+)( ==+)?)$~Um';
        $replace[] = '</p>' . "\n" . '<h1>\3</h1>' . "\n" . '<p>';
        // -- TITRE2
        // The newlines must be in double quotes : in single quotes, the
        // backslash and the n are written as is in the HTML
        $search[] = '~^([^\\\\]|^)(--+ (.+)( --+)?)$~Um';
        $replace[] = '</p>' . "\n" . '<h2>\3</h2>' . "\n" . '<p>';

        // LINKS (can be relative)
        // images
        $search[] = '~([^\\\\]|^)(!\[(.+)\]\((.+)\))~';
        $replace[] = '\1<img src=\'\4\' alt=\'\3\'/>';
        // [Google Site](http://google.fr/ "With help bubble")
        $search[] = '~([^\\\\!]|^)(\[(.+)\]\((.+) &quot;(.+)&quot;\))~';
        $replace[] = '\1<a href=\'\4\' title=\'\5\'>\3</a>';
        // [Google Site](http://google.fr/)
        $search[] = '~([^\\\\!]|^)(\[(.+)\]\((.+)\))~U';
        $replace[] = '\1<a href=\'\4\'>\3</a>';

        // Automatics links :
        // <http://dominique.fournier38.fr>
        // <dominique@fournier38.fr>
        $search[] = '~([^\\\\]|^)(&lt;(https?://.+)&gt;)~U';
        $replace[] = '\1<a href=\'\3\'>\3</a>';
        $search[] = '~([^\\\\]|^)(&lt;(.+@.+)&gt;)~U';
        $replace[] = '\1<a href=\'mailto:\3\'>\3</a>';
        // The links must not allow the <em> : redo the conversion
        $search[] = '~(<a href=\'.*)<em>(.*)</em>(.*\'>.*)<em>(.*)</em>(.*</a>)~';
        $replace[] = '\1_\2_\3_\4_\5';
        // TODO : Links by reference :
        // Voici un petit texte écrit par [Michel Fortin][mf].
        // [mf]: http://michelf.ca/ "Mon site web"
        // NOTE(review) : the URL scheme is not checked (javascript: by example)

        // TITLES
        // Titles ATX (Optionnal sharp at the end)
        // ###### Title6
        $search[] = '~^([^\\\\]|^)?(###### (.+)( +#+)?)$~Um';
        $replace[] = '</p><h6>\3</h6><p>';
        // ##### Title5
        $search[] = '~^([^\\\\]|^)?(##### (.+)( +#+)?)$~Um';
        $replace[] = '</p><h5>\3</h5><p>';
        // #### Title4
        $search[] = '~^([^\\\\]|^)?(#### (.+)( +#+)?)$~Um';
        $replace[] = '</p><h4>\3</h4><p>';
        // ### Title3
        $search[] = '~^([^\\\\]|^)?(### (.+)( +#+)?)$~Um';
        $replace[] = '</p><h3>\3</h3><p>';
        // ## Title2
        $search[] = '~^([^\\\\]|^)?(## (.+)( +#+)?)$~Um';
        $replace[] = '</p><h2>\3</h2><p>';
        // # Title1
        $search[] = '~^([^\\\\]|^)?(# (.+)( +#+)?)$~Um';
        $replace[] = '</p><h1>\3</h1><p>';

        // Remove the backslashes on the existing regex : each pattern is
        // duplicated with a mandatory backslash in place of the "not a
        // backslash" prefix, and the match is replaced by the text without the
        // backslash (\2)
        $prefixes = array(
            '([^\\\\]|^)?',
            '([^\\\\]|^)',
            '([^\\\\!]|^)',
            '([^\\\\*]|^)',
            '([^\\\\_]|^)',
        );
        $patterns = $search;
        foreach ($patterns as $s) {
            $escaped = str_replace($prefixes, '([\\\\])', $s);
            if ($escaped === $s) {
                // No escape prefix in this pattern (the <em> in links rule) :
                // adding it again with \2 would replace the complete link by
                // a part of its URL
                continue;
            }
            $search[] = $escaped;
            $replace[] = '\2';
        }
        $res = preg_replace($search, $replace, $res);
        return $res;
    }

    /** Return HTML code corresponding to the code block
      * The block ends at the end of the text or on the first line with a
      * depth lighter than $depth
      * @param array $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line after the block
      * @return string The <pre><code> block
      */
    private function typeCode($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL typeCode (\$text, $depth, $pos)\n";
        }
        $lines = array();
        // The Code blocks can't be imbricated
        while (isset($text[$pos]) && $this->depth($text[$pos]) >= $depth) {
            // Remove the indentation of the block
            $lines[] = substr($text[$pos], $depth);
            $pos++;
        }
        $content = implode("\n", $lines);
        // Insert Geshi on $content
        if ($this->debug) {
            echo "RETURN typeCode : <pre><code>$content</code></pre>\n";
        }
        return "<pre><code>$content</code></pre>\n";
    }

    /** Return HTML code corresponding to the OL block
      * @param array $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line after the block
      * @return string The <ol> block
      */
    private function typeOL($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL typeOL (\$text, $depth, $pos)\n";
        }
        $content = $this->typeOLUL($text, $depth, $pos, "ol");
        if ($this->debug) {
            echo "RETURN typeOL : $content\n";
        }
        return $content;
    }

    /** Return HTML code corresponding to the UL block
      * @param array $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line after the block
      * @return string The <ul> block
      */
    private function typeUL($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL typeUL (\$text, $depth, $pos)\n";
        }
        $content = $this->typeOLUL($text, $depth, $pos, "ul");
        if ($this->debug) {
            echo "RETURN typeUL : $content\n";
        }
        return $content;
    }

    /** Return the HTML code corresponding to the OL/UL block
      * The block ends at the end of the text, on a line with a depth lighter
      * than $depth or when the type of the line changes
      * @param array  $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line after the block
      * @param string $type The block type : "ul" or "ol"
      * @return string The <ul> or <ol> block
      */
    private function typeOLUL($text, $depth, &$pos, $type)
    {
        if ($this->debug) {
            echo "CALL typeOLUL (\$text, $depth, $pos, $type)\n";
        }
        $content = "";
        while (
            isset($text[$pos]) &&
            $this->depth($text[$pos]) >= $depth &&
            $this->lineType($text[$pos]) === $type
        ) {
            if ($this->debug) {
                echo "Start while $pos\n";
            }
            $content .= str_repeat(" ", ($depth + 2)) . "<li>";
            $blockContent = $text[$pos];
            $pos++;
            // Look at continuous lines : the non empty lines of another type
            // at the same depth are joined to the item
            while (
                isset($text[$pos]) &&
                $this->lineType($text[$pos]) !== "NONE" &&
                $this->lineType($text[$pos]) !== $type &&
                $this->depth($text[$pos]) === $depth
            ) {
                if ($this->debug) {
                    echo "Continuous line : " . $pos . "\n";
                }
                $blockContent .= " " . $text[$pos];
                $pos++;
            }
            // Indent the li and remove the number and dot and space at start
            if ($type === "ol") {
                preg_match("/^( *)[0-9]+\. +(.*)/", $blockContent, $matches);
            } else {
                preg_match("/^( *)[-+*] +(.*)/", $blockContent, $matches);
            }
            $lineTxt = isset($matches[2]) ? $matches[2] : $blockContent;
            $content .= $this->searchReplace($lineTxt);

            if (isset($text[$pos]) && $this->depth($text[$pos]) > $depth) {
                // The next line is deeper : it is a block inside the item
                if ($this->debug) {
                    echo "Detect Block\n";
                }
                $content .= "\n" .
                    $this->detectBlock($text, $this->depth($text[$pos]), $pos) .
                    str_repeat(" ", ($depth + 2)) . "</li>\n";
            } else {
                $content .= "</li>\n";
            }
        }
        if ($this->debug) {
            echo "RETURN typeOLUL : <$type>\n$content</$type>\n";
        }
        return "<$type>\n$content" . str_repeat(" ", $depth) . "</$type>\n";
    }

    /** Return HTML code corresponding to the NONE block
      * The NONE type exists only on empty strings. Just skip the current and
      * empty line, and return an empty string
      * @param string $text The Markdown text to translate split by \n
      * @param integer $depth The depth of the current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Incremented
      * @return string Always an empty string
      */
    private function typeNONE($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL typeNONE (\$text, $depth, $pos)\n";
        }
        $pos++;
        return "";
    }

    /** Return HTML code corresponding to the P block
      * The block ends at the end of the text, when the depth is not $depth
      * anymore or when the type of the line changes
      * @param array $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line after the block
      * @return string The <p> block
      */
    private function typeP($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL typeP (\$text, $depth, $pos)\n";
        }
        $content = "";
        $firstLine = $pos;
        while (
            isset($text[$pos]) &&
            $this->depth($text[$pos]) === $depth &&
            $this->lineType($text[$pos]) === "p"
        ) {
            if (substr($text[$pos], -2) === "  ") {
                // Two spaces at end of line : add <br/>
                $content .= $this->searchReplace(substr($text[$pos], 0, -2)) . "<br/>";
            } elseif ($pos > $firstLine && substr($content, -5) !== "<br/>") {
                // Add a space between two lines from the same block, if this is not
                // the continuity of the block
                $content .= " " . $this->searchReplace($text[$pos]);
            } else {
                $content .= $this->searchReplace($text[$pos]);
            }
            $pos++;
        }
        if ($this->debug) {
            echo "RETURN typeP : <p>$content</p>\n";
        }
        return "<p>$content</p>\n";
    }

    /** Detect the type of the text and call the appropriate function
      * The lines deeper than $depth are a code block when $depth is 0, or are
      * analyzed by a recursive call in the other cases. A line lighter than
      * $depth ends the analyze
      * @param array $text The Markdown text to translate split by \n
      * @param integer $depth The depth of current bloc (in number of space)
      * @param integer &$pos The start line number of the bloc. Updated to the
      * first line not analyzed
      * @return string The HTML code
      */
    private function detectBlock($text, $depth, &$pos)
    {
        if ($this->debug) {
            echo "CALL detectBlock (\$text, $depth, $pos)\n";
        }
        $content = "";
        // detect the type and call the right type function
        while (isset($text[$pos])) {
            $lineDepth = $this->depth($text[$pos]);
            if ($lineDepth > $depth && $depth === 0) {
                // New block code
                if ($this->debug) {
                    echo "New block code\n";
                }
                $content .= $this->typeCode($text, $lineDepth, $pos);
                continue;
            }
            if ($lineDepth > $depth) {
                if ($this->debug) {
                    echo "CALL DEPTH > MINDEPTH (" . $lineDepth .
                        " > $depth)\n";
                }
                $content .= $this->detectBlock($text, $lineDepth, $pos);
                continue;
            }
            if ($lineDepth < $depth) {
                if ($this->debug) {
                    echo "CALL DEPTH < MINDEPTH (" . $lineDepth .
                        " < $depth)\n";
                }
                return $content;
            }

            // The method names are case insensitive in PHP : "typep" calls
            // typeP, "typeul" calls typeUL...
            $type = $this->lineType($text[$pos]);
            $func = "type$type";
            if ($this->debug) {
                echo "FROM DETECT : CALL $func (line=" . $text[$pos] . ")\n";
            }
            $content .= str_repeat(" ", $depth) . $this->$func($text, $depth, $pos);
        }
        return $content;
    }

    /** Return the Type of object in the provided line
      * @param string $line The line to get the type
      * @return string "NONE" for an empty line, "ul", "ol", "code" or "p"
      */
    private function lineType($line)
    {
        if (! isset($line[0])) {
            return "NONE";
        }
        // The order is important : an indented list item is a list, not code
        if (preg_match("/^[ \t]*[+*-] /", $line) === 1) {
            return "ul";
        }
        if (preg_match("/^[ \t]*[0-9]+\. /", $line) === 1) {
            return "ol";
        }
        if (preg_match("/^(    |\t)+/", $line) === 1) {
            return "code";
        }
        return "p";
    }

    /** Return the depth of the provided line
      * @param string $line Line to analyze
      * @return integer The number of spaces at the beginning of the line (the
      * tabs are not counted)
      */
    private function depth($line)
    {
        return strspn($line, " ");
    }
}