Updated README, added auto HTML entities, list bugfix, performance improvements.

This commit is contained in:
Rene' Jeschke 2011-04-18 14:12:08 +02:00
parent 2315269a79
commit bb920c8f6e
8 changed files with 300 additions and 67 deletions

View File

@ -109,8 +109,16 @@ This seemed to me as the easiest and safest way to enable different behaviours.
<li>and this is not a list</li>
</ul>
* More to come ...
* Auto HTML entities:
* `(C)` becomes `&copy;` - &copy;
* `(R)` becomes `&reg;` - &reg;
* `(TM)` becomes `&trade;` - &trade;
* `--` becomes `&mdash;` - &mdash;
* `...` becomes `&hellip;` - &hellip;
* `<<` becomes `&laquo;` - &laquo;
* `>>` becomes `&raquo;` - &raquo;
* `"Hello"` becomes `&ldquo;Hello&rdquo;` - &ldquo;Hello&rdquo;
### Markdown conformity
@ -150,42 +158,43 @@ except of two:
Based on [this benchmark suite](http://henkelmann.eu/2011/01/10/performance_comparison_of_markdown_processor_for_the_jvm).
Excerpt from the original post concerning this benchmark suite:
> Most of these tests are of course unrealistic: Who would write a
> text where each word is a link? Yet they serve an important use:
> It makes it possible for the developer to pinpoint the parts of
> the parser where there is most room for improvement. Also, it
> explains why certain texts might render much faster in one
> Processor than in another.
Benchmark system:
* Ubuntu Linux 10.04 32 Bit
* Intel(R) Core(TM) 2 Duo T7500 @ 2.2GHz
* Java(TM) SE Runtime Environment (build 1.6.0_24-b07)
* Java HotSpot(TM) Server VM (build 19.1-b02, mixed mode)
<table>
<tr><th>Test</th><th colspan="2">Actuarius</th><th colspan="2">PegDown</th><th colspan="2">Knockoff</th><th colspan="2">Txtmark</th></tr>
<tr><td></td><td>1st Run (ms)</td><td>2nd Run (ms)</td><td>1st Run (ms)</td><td>2nd Run (ms)</td><td>1st Run (ms)</td><td>2nd Run (ms)</td><td>1st Run (ms)</td><td>2nd Run (ms)</td></tr>
<tr><td>Plain Paragraphs</td><td>887</td><td>461</td><td>2455</td><td>2236</td><td>764</td><td>568</td><td>89</td><td>47</td></tr>
<tr><td>Every Word Emphasized</td><td>2220</td><td>2077</td><td>3411</td><td>3406</td><td>30503</td><td>30514</td><td>72</td><td>66</td></tr>
<tr><td>Every Word Strong</td><td>2384</td><td>2270</td><td>2456</td><td>2466</td><td>23639</td><td>23577</td><td>62</td><td>57</td></tr>
<tr><td>Every Word Inline Code</td><td>824</td><td>804</td><td>2337</td><td>2237</td><td>23506</td><td>23622</td><td>54</td><td>55</td></tr>
<tr><td>Every Word a Fast Link</td><td>3942</td><td>3738</td><td>1164</td><td>1159</td><td>8621</td><td>8595</td><td>89</td><td>68</td></tr>
<tr><td>Every Word Consisting of Special XML Chars</td><td>9393</td><td>9312</td><td>7544</td><td>7314</td><td>801</td><td>608</td><td>3587</td><td>3614</td></tr>
<tr><td>Every Word wrapped in manual HTML tags</td><td>6843</td><td>6828</td><td>1850</td><td>1859</td><td>8699</td><td>8692</td><td>1169</td><td>1154</td></tr>
<tr><td>Every Line with a manual line break</td><td>859</td><td>724</td><td>2968</td><td>2946</td><td>2171</td><td>1990</td><td>58</td><td>56</td></tr>
<tr><td>Every word with a full link</td><td>528</td><td>501</td><td>2252</td><td>2280</td><td>3513</td><td>3512</td><td>66</td><td>60</td></tr>
<tr><td>Every word with a full image</td><td>395</td><td>374</td><td>2463</td><td>2569</td><td>3757</td><td>3726</td><td>56</td><td>55</td></tr>
<tr><td>Every word with a reference link</td><td>19208</td><td>19035</td><td>39183</td><td>38710</td><td>243450</td><td>244943</td><td>1826</td><td>1798</td></tr>
<tr><td>Every block a quote</td><td>465</td><td>449</td><td>2687</td><td>2684</td><td>978</td><td>977</td><td>48</td><td>48</td></tr>
<tr><td>Every block a codeblock</td><td>151</td><td>134</td><td>597</td><td>601</td><td>270</td><td>262</td><td>36</td><td>27</td></tr>
<tr><td>Every block a list</td><td>1209</td><td>1106</td><td>3448</td><td>3432</td><td>1411</td><td>1368</td><td>52</td><td>60</td></tr>
<tr><td>All tests together</td><td>6062</td><td>6042</td><td>11556</td><td>11589</td><td>19827</td><td>19637</td><td>452</td><td>448</td></tr>
<tr><td>Plain Paragraphs</td><td>1127</td><td>577</td><td>1273</td><td>1037</td><td>740</td><td>400</td><td>157</td><td>64</td></tr>
<tr><td>Every Word Emphasized</td><td>1562</td><td>1001</td><td>1523</td><td>1513</td><td>13982</td><td>13221</td><td>54</td><td>46</td></tr>
<tr><td>Every Word Strong</td><td>1125</td><td>997</td><td>1115</td><td>1114</td><td>9543</td><td>9647</td><td>44</td><td>41</td></tr>
<tr><td>Every Word Inline Code</td><td>382</td><td>277</td><td>1058</td><td>1052</td><td>9116</td><td>9074</td><td>51</td><td>39</td></tr>
<tr><td>Every Word a Fast Link</td><td>2257</td><td>1600</td><td>537</td><td>531</td><td>3980</td><td>3410</td><td>109</td><td>55</td></tr>
<tr><td>Every Word Consisting of Special XML Chars</td><td>4045</td><td>4270</td><td>2985</td><td>3044</td><td>312</td><td>377</td><td>778</td><td>775</td></tr>
<tr><td>Every Word wrapped in manual HTML tags</td><td>3334</td><td>2919</td><td>901</td><td>896</td><td>3863</td><td>3736</td><td>73</td><td>62</td></tr>
<tr><td>Every Line with a manual line break</td><td>510</td><td>588</td><td>1445</td><td>1440</td><td>1527</td><td>1130</td><td>56</td><td>56</td></tr>
<tr><td>Every word with a full link</td><td>452</td><td>246</td><td>1045</td><td>996</td><td>1884</td><td>1819</td><td>86</td><td>55</td></tr>
<tr><td>Every word with a full image</td><td>268</td><td>150</td><td>1140</td><td>1132</td><td>1985</td><td>1908</td><td>38</td><td>36</td></tr>
<tr><td>Every word with a reference link</td><td>9847</td><td>9082</td><td>18956</td><td>18719</td><td>121136</td><td>115416</td><td>1525</td><td>1380</td></tr>
<tr><td>Every block a quote</td><td>445</td><td>206</td><td>1312</td><td>1301</td><td>478</td><td>457</td><td>50</td><td>45</td></tr>
<tr><td>Every block a codeblock</td><td>70</td><td>87</td><td>373</td><td>376</td><td>161</td><td>175</td><td>60</td><td>22</td></tr>
<tr><td>Every block a list</td><td>920</td><td>912</td><td>1720</td><td>1725</td><td>622</td><td>651</td><td>55</td><td>55</td></tr>
<tr><td>All tests together</td><td>3281</td><td>2885</td><td>5184</td><td>5196</td><td>10130</td><td>10460</td><td>206</td><td>196</td></tr>
</table>
* Q: Why is Txtmark so slow when it comes to XML entities?
* A: Because Txtmark does some sanity checks on XML entities to make sure
it outputs valid XML. For example:
&cutie;
will produce (when processed with Markdown and most other markdown processors):
&cutie;
and
&amp;cutie;
when processed with Txtmark.
Benchmarked versions:
[Actuarius] version: 0.2
[PegDown] version: 0.8.5.4
@ -193,10 +202,11 @@ Benchmarked versions:
---
[Markdown] is copyright (c) 2004 by John Gruber
[Actuarius] is copyright (c) 2010 by Christoph Henkelmann
[Knockoff] is copyright (c) 2009-2011 by Tristan Juricek
[PegDown] is copyright (c) 2010 by Mathias Doenitz
Mentioned/related projects:
[Markdown] is Copyright (C) 2004 by John Gruber
[Actuarius] is Copyright (C) 2010 by Christoph Henkelmann
[Knockoff] is Copyright (C) 2009-2011 by Tristan Juricek
[PegDown] is Copyright (C) 2010 by Mathias Doenitz
***
@ -206,5 +216,6 @@ Benchmarked versions:
[PegDown]: https://github.com/sirthias/pegdown
[tar]: https://github.com/rjeschke/txtmark/tarball/master "branch: master"
[zip]: https://github.com/rjeschke/txtmark/zipball/master "branch: master"
[$PROFILE$]: extended "Txtmark processing information."
Project link: <https://github.com/rjeschke/txtmark>

View File

@ -230,4 +230,54 @@ class Block
this.lineTail = line;
}
}
/**
 * Promotes untyped child blocks to paragraphs for "loose" lists.
 *
 * <p>Does nothing unless this Block is an ordered or unordered list. If at
 * least one child of any <code>LIST_ITEM</code> in this list is of type
 * <code>PARAGRAPH</code>, every child of every <code>LIST_ITEM</code> whose
 * type is <code>NONE</code> is changed to <code>PARAGRAPH</code>.</p>
 */
public void expandListParagraphs()
{
    final boolean isList = this.type == BlockType.ORDERED_LIST
            || this.type == BlockType.UNORDERED_LIST;
    if(!isList)
    {
        return;
    }

    // First pass: find out whether any list item holds a paragraph.
    boolean paragraphFound = false;
    for(Block item = this.blocks; item != null && !paragraphFound; item = item.next)
    {
        if(item.type != BlockType.LIST_ITEM)
        {
            continue;
        }
        for(Block child = item.blocks; child != null; child = child.next)
        {
            if(child.type == BlockType.PARAGRAPH)
            {
                paragraphFound = true;
                break;
            }
        }
    }

    if(!paragraphFound)
    {
        return;
    }

    // Second pass: turn every untyped child of a list item into a paragraph.
    for(Block item = this.blocks; item != null; item = item.next)
    {
        if(item.type != BlockType.LIST_ITEM)
        {
            continue;
        }
        for(Block child = item.blocks; child != null; child = child.next)
        {
            if(child.type == BlockType.NONE)
            {
                child.type = BlockType.PARAGRAPH;
            }
        }
    }
}
}

View File

@ -7,6 +7,17 @@ package txtmark;
/**
* Default Decorator implementation.
*
* <p>Example of a user Decorator that adds a class attribute to &lt;p> tags.</p>
* <pre><code>public class MyDecorator extends DefaultDecorator
*{
* &#64;Override
* public void openParagraph(StringBuilder out)
* {
* out.append("&lt;p class=\"myclass\">");
* }
*}
*</code></pre>
*
* @author René Jeschke <rene_jeschke@yahoo.de>
*/
public class DefaultDecorator implements Decorator

View File

@ -17,6 +17,8 @@ class Emitter
private final HashMap<String, LinkRef> linkRefs = new HashMap<String, LinkRef>();
/** The Decorator. */
private Decorator decorator;
/** Extension flag. */
public boolean useExtensions = false;
/** Constructor. */
public Emitter(final Decorator decorator)
@ -311,7 +313,6 @@ class Emitter
* @param start Starting position.
* @return The new position or -1 if nothing valid has been found.
*/
// TODO ... hm ... refactor this
private int checkHtml(final StringBuilder out, final String in, int start)
{
final StringBuilder temp = new StringBuilder();
@ -319,8 +320,8 @@ class Emitter
// Check for auto links
temp.setLength(0);
pos = Utils.readUntil(temp, in, start + 1, ':');
if(pos != -1 && HTML.isLinkPrefix(temp.toString()))
pos = Utils.readUntil(temp, in, start + 1, ':', ' ', '>', '\n');
if(pos != -1 && in.charAt(pos) == ':' && HTML.isLinkPrefix(temp.toString()))
{
pos = Utils.readUntil(temp, in, pos, '>');
if(pos != -1)
@ -338,8 +339,8 @@ class Emitter
// Check for mailto auto link
temp.setLength(0);
pos = Utils.readUntil(temp, in, start + 1, '@');
if(pos != -1)
pos = Utils.readUntil(temp, in, start + 1, '@', ' ', '>', '\n');
if(pos != -1 && in.charAt(pos) == '@')
{
pos = Utils.readUntil(temp, in, pos, '>');
if(pos != -1)
@ -534,6 +535,40 @@ class Emitter
out.append("&amp;");
}
break;
case X_COPY:
out.append("&copy;");
pos += 2;
break;
case X_REG:
out.append("&reg;");
pos += 2;
break;
case X_TRADE:
out.append("&trade;");
pos += 3;
break;
case X_MDASH:
out.append("&mdash;");
pos++;
break;
case X_HELLIP:
out.append("&hellip;");
pos += 2;
break;
case X_LAQUO:
out.append("&laquo;");
pos++;
break;
case X_RAQUO:
out.append("&raquo;");
pos++;
break;
case X_RDQUO:
out.append("&rdquo;");
break;
case X_LDQUO:
out.append("&ldquo;");
break;
case ESCAPE:
pos++;
//$FALL-THROUGH$
@ -559,6 +594,7 @@ class Emitter
final char c = in.charAt(pos);
final char c1 = pos + 1 < in.length() ? in.charAt(pos + 1) : ' ';
final char c2 = pos + 2 < in.length() ? in.charAt(pos + 2) : ' ';
final char c3 = pos + 3 < in.length() ? in.charAt(pos + 3) : ' ';
switch(c)
{
@ -608,10 +644,44 @@ class Emitter
return MarkToken.NONE;
}
case '<':
if(this.useExtensions && c1 == '<')
return MarkToken.X_LAQUO;
return MarkToken.HTML;
case '&':
return MarkToken.ENTITY;
default:
if(this.useExtensions)
{
switch(c)
{
case '-':
if(c1 == '-')
return MarkToken.X_MDASH;
break;
case '>':
if(c1 == '>')
return MarkToken.X_RAQUO;
break;
case '.':
if(c1 == '.' && c2 == '.')
return MarkToken.X_HELLIP;
break;
case '(':
if(c1 == 'C' && c2 == ')')
return MarkToken.X_COPY;
if(c1 == 'R' && c2 == ')')
return MarkToken.X_REG;
if(c1 == 'T' & c2 == 'M' & c3 == ')')
return MarkToken.X_TRADE;
break;
case '"':
if(!Character.isLetterOrDigit(c0) && c1 != ' ')
return MarkToken.X_LDQUO;
if(c0 != ' ' && !Character.isLetterOrDigit(c1))
return MarkToken.X_RDQUO;
break;
}
}
return MarkToken.NONE;
}
}

View File

@ -34,5 +34,23 @@ enum MarkToken
/** &amp; */
ENTITY, // &
/** \ */
ESCAPE // \x
ESCAPE, // \x
/** Extended: &copy; */
X_COPY, // (C)
/** Extended: &reg; */
X_REG, // (R)
/** Extended: &trade; */
X_TRADE, // (TM)
/** Extended: &laquo; */
X_LAQUO, // <<
/** Extended: &raquo; */
X_RAQUO, // >>
/** Extended: &mdash; */
X_MDASH, // --
/** Extended: &hellip; */
X_HELLIP, // ...
/** Extended: &rdquo; */
X_RDQUO, // "
/** Extended: &ldquo; */
X_LDQUO // "
}

View File

@ -16,6 +16,10 @@ import java.io.StringReader;
/**
* Markdown processor class.
*
* <p>Example usage:</p>
* <pre><code>String result = Processor.process("This is ***TXTMARK***");
* </code></pre>
*
* @author René Jeschke <rene_jeschke@yahoo.de>
*/
public class Processor
@ -326,7 +330,7 @@ public class Processor
{
if(id.toLowerCase().equals("$profile$"))
{
this.useExtensions = link.toLowerCase().equals("extended");
this.emitter.useExtensions = this.useExtensions = link.toLowerCase().equals("extended");
lastLinkRef = null;
}
else
@ -401,7 +405,7 @@ public class Processor
// TODO ... paragraphs and lists seems to be not working correctly
private void recurse(final Block root, boolean listMode)
{
Block block;
Block block, list;
Line line = root.lines;
while(line != null && line.isEmpty) line = line.next;
if(line == null)
@ -410,8 +414,6 @@ public class Processor
if(listMode)
root.removeListIndent();
boolean hasParagraph = false;
while(line != null)
{
final LineType type = line.getLineType();
@ -446,7 +448,6 @@ public class Processor
root.split(line == null ? root.lineTail : line).type = bt;
root.removeLeadingEmptyLines();
}
hasParagraph |= bt == BlockType.PARAGRAPH;
line = root.lines;
}
break;
@ -521,36 +522,26 @@ public class Processor
break;
line = line.next;
}
block = root.split(line != null ? line.previous : root.lineTail);
block.type = type == LineType.OLIST ? BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST;
block.lines.prevEmpty = false;
block.lineTail.nextEmpty = false;
block.removeSurroundingEmptyLines();
block.lines.prevEmpty = block.lineTail.nextEmpty = false;
this.initListBlock(block);
block = block.blocks;
list = root.split(line != null ? line.previous : root.lineTail);
list.type = type == LineType.OLIST ? BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST;
list.lines.prevEmpty = false;
list.lineTail.nextEmpty = false;
list.removeSurroundingEmptyLines();
list.lines.prevEmpty = list.lineTail.nextEmpty = false;
this.initListBlock(list);
block = list.blocks;
while(block != null)
{
this.recurse(block, true);
block = block.next;
}
list.expandListParagraphs();
break;
default:
line = line.next;
break;
}
}
if(listMode && hasParagraph)
{
block = root;
while(block != null)
{
if(block.type == BlockType.NONE)
block.type = BlockType.PARAGRAPH;
block = block.next;
}
}
}
/**

82
src/java/txtmark/Run.java Normal file
View File

@ -0,0 +1,82 @@
/*
* Copyright (C) 2011 René Jeschke <rene_jeschke@yahoo.de>
* See LICENSE.txt for licensing information.
*/
package txtmark;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Simple class for processing markdown files on the command line.
 *
 * <p>Usage:</p>
 * <pre><code>java -cp txtmark.jar txtmark.Run filename [header_footer_file]
 * </code></pre>
 *
 * <p>The <code>header_footer_file</code> is an optional UTF-8 encoded file containing
 * a header and a footer to output around the generated HTML code. The first line
 * starting with <code>&lt;!-- ###</code> separates the header from the footer; that
 * line itself is not written to the output.</p>
 *
 * <p>Example:</p>
 *
 * <pre><code>&lt;?xml version="1.0" encoding="UTF-8"?>
 *&lt;!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 *                      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 *&lt;html xmlns="http://www.w3.org/1999/xhtml">
 *&lt;head>
 *&lt;title>markdown&lt;/title>
 *&lt;link type="text/css" href="style.css" rel="stylesheet"/>
 *&lt;meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
 *&lt;/head>
 *&lt;body>
 *&lt;!-- the following line separates header from footer -->
 *&lt;!-- ### -->
 *&lt;/body>
 *&lt;/html>
 *</code></pre>
 *
 * @author René Jeschke <rene_jeschke@yahoo.de>
 */
public class Run
{
    /** Prefix of the line that separates header from footer in the header/footer file. */
    private static final String SEPARATOR_PREFIX = "<!-- ###";

    /**
     * Static main.
     *
     * <p>Writes the header (if a header/footer file is given), the processed
     * markdown of the input file and then the footer to standard output.
     * Prints an error message and exits with status -1 if no input file is
     * given.</p>
     *
     * @param args Program arguments: <code>filename [header_footer_file]</code>.
     * @throws IOException If an IO error occurred.
     */
    public static void main(String[] args) throws IOException
    {
        // This is just a _hack_ ...
        if(args.length == 0)
        {
            System.err.println("No input file specified.");
            System.exit(-1);
        }

        if(args.length < 2)
        {
            // No header/footer file: just emit the processed markdown.
            System.out.println(Processor.process(new File(args[0])));
            return;
        }

        final BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(args[1]), "UTF-8"));
        try
        {
            // Header: everything up to (but excluding) the separator line.
            String line = reader.readLine();
            while(line != null && !line.startsWith(SEPARATOR_PREFIX))
            {
                System.out.println(line);
                line = reader.readLine();
            }

            System.out.println(Processor.process(new File(args[0])));

            // Footer: everything after the separator line.
            line = reader.readLine();
            while(line != null)
            {
                System.out.println(line);
                line = reader.readLine();
            }
        }
        finally
        {
            // Close the reader even if processing or reading fails.
            reader.close();
        }
    }
}

View File

@ -512,7 +512,7 @@ class Utils
}
if(in.charAt(pos) == '/')
{
out.append('/');
out.append(" /");
pos++;
}
if(in.charAt(pos) == '>')