diff --git a/README.md b/README.md
index 150fac0..24e4427 100644
--- a/README.md
+++ b/README.md
@@ -109,8 +109,16 @@ This seemed to me as the easiest and safest way to enable different behaviours.
and this is not a list
-* More to come ...
+* Auto HTML entities:
+ * `(C)` becomes `&copy;` - ©
+ * `(R)` becomes `&reg;` - ®
+ * `(TM)` becomes `&trade;` - ™
+ * `--` becomes `&mdash;` - —
+ * `...` becomes `&hellip;` - …
+ * `<<` becomes `&laquo;` - «
+ * `>>` becomes `&raquo;` - »
+ * `"Hello"` becomes `&ldquo;Hello&rdquo;` - “Hello”
### Markdown conformity
@@ -150,42 +158,43 @@ except of two:
Based on [this benchmark suite](http://henkelmann.eu/2011/01/10/performance_comparison_of_markdown_processor_for_the_jvm).
+Excerpt from the original post concerning this benchmark suite:
+
+> Most of these tests are of course unrealistic: Who would write a
+> text where each word is a link? Yet they serve an important use:
+> It makes it possible for the developer to pinpoint the parts of
+> the parser where there is most room for improvement. Also, it
+> explains why certain texts might render much faster in one
+> Processor than in another.
+
+Benchmark system:
+
+* Ubuntu Linux 10.04 32 Bit
+* Intel(R) Core(TM) 2 Duo T7500 @ 2.2GHz
+* Java(TM) SE Runtime Environment (build 1.6.0_24-b07)
+* Java HotSpot(TM) Server VM (build 19.1-b02, mixed mode)
+
+
| Test | Actuarius | PegDown | Knockoff | Txtmark |
| 1st Run (ms) | 2nd Run (ms) | 1st Run (ms) | 2nd Run (ms) | 1st Run (ms) | 2nd Run (ms) | 1st Run (ms) | 2nd Run (ms) |
- | Plain Paragraphs | 887 | 461 | 2455 | 2236 | 764 | 568 | 89 | 47 |
- | Every Word Emphasized | 2220 | 2077 | 3411 | 3406 | 30503 | 30514 | 72 | 66 |
- | Every Word Strong | 2384 | 2270 | 2456 | 2466 | 23639 | 23577 | 62 | 57 |
- | Every Word Inline Code | 824 | 804 | 2337 | 2237 | 23506 | 23622 | 54 | 55 |
- | Every Word a Fast Link | 3942 | 3738 | 1164 | 1159 | 8621 | 8595 | 89 | 68 |
- | Every Word Consisting of Special XML Chars | 9393 | 9312 | 7544 | 7314 | 801 | 608 | 3587 | 3614 |
- | Every Word wrapped in manual HTML tags | 6843 | 6828 | 1850 | 1859 | 8699 | 8692 | 1169 | 1154 |
- | Every Line with a manual line break | 859 | 724 | 2968 | 2946 | 2171 | 1990 | 58 | 56 |
- | Every word with a full link | 528 | 501 | 2252 | 2280 | 3513 | 3512 | 66 | 60 |
- | Every word with a full image | 395 | 374 | 2463 | 2569 | 3757 | 3726 | 56 | 55 |
- | Every word with a reference link | 19208 | 19035 | 39183 | 38710 | 243450 | 244943 | 1826 | 1798 |
- | Every block a quote | 465 | 449 | 2687 | 2684 | 978 | 977 | 48 | 48 |
- | Every block a codeblock | 151 | 134 | 597 | 601 | 270 | 262 | 36 | 27 |
- | Every block a list | 1209 | 1106 | 3448 | 3432 | 1411 | 1368 | 52 | 60 |
- | All tests together | 6062 | 6042 | 11556 | 11589 | 19827 | 19637 | 452 | 448 |
+ | Plain Paragraphs | 1127 | 577 | 1273 | 1037 | 740 | 400 | 157 | 64 |
+ | Every Word Emphasized | 1562 | 1001 | 1523 | 1513 | 13982 | 13221 | 54 | 46 |
+ | Every Word Strong | 1125 | 997 | 1115 | 1114 | 9543 | 9647 | 44 | 41 |
+ | Every Word Inline Code | 382 | 277 | 1058 | 1052 | 9116 | 9074 | 51 | 39 |
+ | Every Word a Fast Link | 2257 | 1600 | 537 | 531 | 3980 | 3410 | 109 | 55 |
+ | Every Word Consisting of Special XML Chars | 4045 | 4270 | 2985 | 3044 | 312 | 377 | 778 | 775 |
+ | Every Word wrapped in manual HTML tags | 3334 | 2919 | 901 | 896 | 3863 | 3736 | 73 | 62 |
+ | Every Line with a manual line break | 510 | 588 | 1445 | 1440 | 1527 | 1130 | 56 | 56 |
+ | Every word with a full link | 452 | 246 | 1045 | 996 | 1884 | 1819 | 86 | 55 |
+ | Every word with a full image | 268 | 150 | 1140 | 1132 | 1985 | 1908 | 38 | 36 |
+ | Every word with a reference link | 9847 | 9082 | 18956 | 18719 | 121136 | 115416 | 1525 | 1380 |
+ | Every block a quote | 445 | 206 | 1312 | 1301 | 478 | 457 | 50 | 45 |
+ | Every block a codeblock | 70 | 87 | 373 | 376 | 161 | 175 | 60 | 22 |
+ | Every block a list | 920 | 912 | 1720 | 1725 | 622 | 651 | 55 | 55 |
+ | All tests together | 3281 | 2885 | 5184 | 5196 | 10130 | 10460 | 206 | 196 |
-* Q: Why is Txtmark so slow when it comes to XML entities?
-* A: Because Txtmark does some sanity checks on XML entities to make sure
- it outputs valid XML. For example:
-
- &cutie;
-
- will produce (when processed with Markdown and most other markdown processors):
-
- &cutie;
-
- and
-
- &cutie;
-
- when processed with Txtmark.
-
Benchmarked versions:
[Actuarius] version: 0.2
[PegDown] version: 0.8.5.4
@@ -193,10 +202,11 @@ Benchmarked versions:
---
-[Markdown] is copyright (c) 2004 by John Gruber
-[Actuarius] is copyright (c) 2010 by Christoph Henkelmann
-[Knockoff] is copyright (c) 2009-2011 by Tristan Juricek
-[PegDown] is copyright (c) 2010 by Mathias Doenitz
+Mentioned/related projects:
+[Markdown] is Copyright (C) 2004 by John Gruber
+[Actuarius] is Copyright (C) 2010 by Christoph Henkelmann
+[Knockoff] is Copyright (C) 2009-2011 by Tristan Juricek
+[PegDown] is Copyright (C) 2010 by Mathias Doenitz
***
@@ -206,5 +216,6 @@ Benchmarked versions:
[PegDown]: https://github.com/sirthias/pegdown
[tar]: https://github.com/rjeschke/txtmark/tarball/master "branch: master"
[zip]: https://github.com/rjeschke/txtmark/zipball/master "branch: master"
+[$PROFILE$]: extended "Txtmark processing information."
Project link:
diff --git a/src/java/txtmark/Block.java b/src/java/txtmark/Block.java
index c09fede..b9550d2 100644
--- a/src/java/txtmark/Block.java
+++ b/src/java/txtmark/Block.java
@@ -230,4 +230,54 @@ class Block
this.lineTail = line;
}
}
+
+ /**
+ * If this Block is an ordered or unordered list and at least one of its LIST_ITEM children directly
+ * contains a PARAGRAPH block, retypes every NONE block inside all of its LIST_ITEMs to PARAGRAPH.
+ */
+ public void expandListParagraphs()
+ {
+ if(this.type != BlockType.ORDERED_LIST && this.type != BlockType.UNORDERED_LIST)
+ {
+ return; // Not a list block: nothing to do.
+ }
+ Block outer = this.blocks, inner;
+ boolean hasParagraph = false;
+ while(outer != null && !hasParagraph) // Pass 1: search, stopping at the first PARAGRAPH found.
+ {
+ if(outer.type == BlockType.LIST_ITEM)
+ {
+ inner = outer.blocks;
+ while(inner != null && !hasParagraph)
+ {
+ if(inner.type == BlockType.PARAGRAPH)
+ {
+ hasParagraph = true;
+ }
+ inner = inner.next;
+ }
+ }
+ outer = outer.next;
+ }
+ if(hasParagraph) // Pass 2: only runs when pass 1 found a paragraph in some item.
+ {
+ outer = this.blocks; // Restart from the first child of the list.
+ while(outer != null)
+ {
+ if(outer.type == BlockType.LIST_ITEM)
+ {
+ inner = outer.blocks;
+ while(inner != null)
+ {
+ if(inner.type == BlockType.NONE)
+ {
+ inner.type = BlockType.PARAGRAPH; // Only NONE blocks are retyped; other types are left as they are.
+ }
+ inner = inner.next;
+ }
+ }
+ outer = outer.next;
+ }
+ }
+ }
}
diff --git a/src/java/txtmark/DefaultDecorator.java b/src/java/txtmark/DefaultDecorator.java
index ab1e680..cd8c60f 100644
--- a/src/java/txtmark/DefaultDecorator.java
+++ b/src/java/txtmark/DefaultDecorator.java
@@ -7,6 +7,17 @@ package txtmark;
/**
* Default Decorator implementation.
*
+ * Example for a user Decorator having a class attribute on &lt;p&gt; tags.
+ * public class MyDecorator extends DefaultDecorator
+ *{
+ * @Override
+ * public void openParagraph(StringBuilder out)
+ * {
+ * out.append("<p class=\"myclass\">");
+ * }
+ *}
+ *
+ *
* @author René Jeschke
*/
public class DefaultDecorator implements Decorator
diff --git a/src/java/txtmark/Emitter.java b/src/java/txtmark/Emitter.java
index fe8ea99..3d97493 100644
--- a/src/java/txtmark/Emitter.java
+++ b/src/java/txtmark/Emitter.java
@@ -17,6 +17,8 @@ class Emitter
private final HashMap linkRefs = new HashMap();
/** The Decorator. */
private Decorator decorator;
+ /** Extension flag. */
+ public boolean useExtensions = false;
/** Constructor. */
public Emitter(final Decorator decorator)
@@ -311,7 +313,6 @@ class Emitter
* @param start Starting position.
* @return The new position or -1 if nothing valid has been found.
*/
- // TODO ... hm ... refactor this
private int checkHtml(final StringBuilder out, final String in, int start)
{
final StringBuilder temp = new StringBuilder();
@@ -319,8 +320,8 @@ class Emitter
// Check for auto links
temp.setLength(0);
- pos = Utils.readUntil(temp, in, start + 1, ':');
- if(pos != -1 && HTML.isLinkPrefix(temp.toString()))
+ pos = Utils.readUntil(temp, in, start + 1, ':', ' ', '>', '\n');
+ if(pos != -1 && in.charAt(pos) == ':' && HTML.isLinkPrefix(temp.toString()))
{
pos = Utils.readUntil(temp, in, pos, '>');
if(pos != -1)
@@ -338,8 +339,8 @@ class Emitter
// Check for mailto auto link
temp.setLength(0);
- pos = Utils.readUntil(temp, in, start + 1, '@');
- if(pos != -1)
+ pos = Utils.readUntil(temp, in, start + 1, '@', ' ', '>', '\n');
+ if(pos != -1 && in.charAt(pos) == '@')
{
pos = Utils.readUntil(temp, in, pos, '>');
if(pos != -1)
@@ -534,6 +535,40 @@ class Emitter
out.append("&");
}
break;
+ case X_COPY:
+ out.append("©");
+ pos += 2;
+ break;
+ case X_REG:
+ out.append("®");
+ pos += 2;
+ break;
+ case X_TRADE:
+ out.append("™");
+ pos += 3;
+ break;
+ case X_MDASH:
+ out.append("—");
+ pos++;
+ break;
+ case X_HELLIP:
+ out.append("…");
+ pos += 2;
+ break;
+ case X_LAQUO:
+ out.append("«");
+ pos++;
+ break;
+ case X_RAQUO:
+ out.append("»");
+ pos++;
+ break;
+ case X_RDQUO:
+ out.append("”");
+ break;
+ case X_LDQUO:
+ out.append("“");
+ break;
case ESCAPE:
pos++;
//$FALL-THROUGH$
@@ -559,6 +594,7 @@ class Emitter
final char c = in.charAt(pos);
final char c1 = pos + 1 < in.length() ? in.charAt(pos + 1) : ' ';
final char c2 = pos + 2 < in.length() ? in.charAt(pos + 2) : ' ';
+ final char c3 = pos + 3 < in.length() ? in.charAt(pos + 3) : ' ';
switch(c)
{
@@ -608,10 +644,44 @@ class Emitter
return MarkToken.NONE;
}
case '<':
+ if(this.useExtensions && c1 == '<')
+ return MarkToken.X_LAQUO;
return MarkToken.HTML;
case '&':
return MarkToken.ENTITY;
default:
+ if(this.useExtensions)
+ {
+ switch(c)
+ {
+ case '-':
+ if(c1 == '-')
+ return MarkToken.X_MDASH;
+ break;
+ case '>':
+ if(c1 == '>')
+ return MarkToken.X_RAQUO;
+ break;
+ case '.':
+ if(c1 == '.' && c2 == '.')
+ return MarkToken.X_HELLIP;
+ break;
+ case '(': // "(C)", "(R)" and "(TM)" shorthands
+ if(c1 == 'C' && c2 == ')')
+ return MarkToken.X_COPY;
+ if(c1 == 'R' && c2 == ')')
+ return MarkToken.X_REG;
+ if(c1 == 'T' && c2 == 'M' && c3 == ')') // short-circuit &&, matching the two checks above
+ return MarkToken.X_TRADE;
+ break;
+ case '"':
+ if(!Character.isLetterOrDigit(c0) && c1 != ' ')
+ return MarkToken.X_LDQUO;
+ if(c0 != ' ' && !Character.isLetterOrDigit(c1))
+ return MarkToken.X_RDQUO;
+ break;
+ }
+ }
return MarkToken.NONE;
}
}
diff --git a/src/java/txtmark/MarkToken.java b/src/java/txtmark/MarkToken.java
index 968a3e8..d7673b7 100644
--- a/src/java/txtmark/MarkToken.java
+++ b/src/java/txtmark/MarkToken.java
@@ -34,5 +34,23 @@ enum MarkToken
/** & */
ENTITY, // &
/** \ */
- ESCAPE // \x
+ ESCAPE, // \x
+ /** Extended: © */
+ X_COPY, // (C)
+ /** Extended: ® */
+ X_REG, // (R)
+ /** Extended: ™ */
+ X_TRADE, // (TM)
+ /** Extended: « */
+ X_LAQUO, // <<
+ /** Extended: » */
+ X_RAQUO, // >>
+ /** Extended: — */
+ X_MDASH, // --
+ /** Extended: … */
+ X_HELLIP, // ...
+ /** Extended: ” */
+ X_RDQUO, // "
+ /** Extended: “ */
+ X_LDQUO // "
}
diff --git a/src/java/txtmark/Processor.java b/src/java/txtmark/Processor.java
index 53145fd..5752b7d 100644
--- a/src/java/txtmark/Processor.java
+++ b/src/java/txtmark/Processor.java
@@ -16,6 +16,10 @@ import java.io.StringReader;
/**
* Markdown processor class.
*
+ * Example usage:
+ * String result = Processor.process("This is ***TXTMARK***");
+ *
+ *
* @author René Jeschke
*/
public class Processor
@@ -326,7 +330,7 @@ public class Processor
{
if(id.toLowerCase().equals("$profile$"))
{
- this.useExtensions = link.toLowerCase().equals("extended");
+ this.emitter.useExtensions = this.useExtensions = link.toLowerCase().equals("extended");
lastLinkRef = null;
}
else
@@ -401,7 +405,7 @@ public class Processor
// TODO ... paragraphs and lists seems to be not working correctly
private void recurse(final Block root, boolean listMode)
{
- Block block;
+ Block block, list;
Line line = root.lines;
while(line != null && line.isEmpty) line = line.next;
if(line == null)
@@ -410,8 +414,6 @@ public class Processor
if(listMode)
root.removeListIndent();
- boolean hasParagraph = false;
-
while(line != null)
{
final LineType type = line.getLineType();
@@ -446,7 +448,6 @@ public class Processor
root.split(line == null ? root.lineTail : line).type = bt;
root.removeLeadingEmptyLines();
}
- hasParagraph |= bt == BlockType.PARAGRAPH;
line = root.lines;
}
break;
@@ -521,36 +522,26 @@ public class Processor
break;
line = line.next;
}
- block = root.split(line != null ? line.previous : root.lineTail);
- block.type = type == LineType.OLIST ? BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST;
- block.lines.prevEmpty = false;
- block.lineTail.nextEmpty = false;
- block.removeSurroundingEmptyLines();
- block.lines.prevEmpty = block.lineTail.nextEmpty = false;
- this.initListBlock(block);
- block = block.blocks;
+ list = root.split(line != null ? line.previous : root.lineTail);
+ list.type = type == LineType.OLIST ? BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST;
+ list.lines.prevEmpty = false;
+ list.lineTail.nextEmpty = false;
+ list.removeSurroundingEmptyLines();
+ list.lines.prevEmpty = list.lineTail.nextEmpty = false;
+ this.initListBlock(list);
+ block = list.blocks;
while(block != null)
{
this.recurse(block, true);
block = block.next;
}
+ list.expandListParagraphs();
break;
default:
line = line.next;
break;
}
}
-
- if(listMode && hasParagraph)
- {
- block = root;
- while(block != null)
- {
- if(block.type == BlockType.NONE)
- block.type = BlockType.PARAGRAPH;
- block = block.next;
- }
- }
}
/**
diff --git a/src/java/txtmark/Run.java b/src/java/txtmark/Run.java
new file mode 100644
index 0000000..333fdcb
--- /dev/null
+++ b/src/java/txtmark/Run.java
@@ -0,0 +1,82 @@
+/*
+* Copyright (C) 2011 René Jeschke
+* See LICENSE.txt for licensing information.
+*/
+package txtmark;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+/**
+ * Simple class for processing markdown files on the command line.
+ *
+ * Usage:
+ * java -cp txtmark.jar txtmark.Run filename [header_footer_file]
+ *
+ *
+ * The header_footer_file is an optional UTF-8 encoded file containing
+ * a header and a footer to output around the generated HTML code.
+ *
+ * Example:
+ *
+ * <?xml version="1.0" encoding="UTF-8"?>
+ *<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ * "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ *<html xmlns="http://www.w3.org/1999/xhtml">
+ *<head>
+ *<title>markdown</title>
+ *<link type="text/css" href="style.css" rel="stylesheet"/>
+ *<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+ *</head>
+ *<body>
+ *<!-- the following line separates header from footer -->
+ *<!-- ### -->
+ *</body>
+ *</html>
+ *
+ *
+ * @author René Jeschke
+ */
+public class Run
+{
+ /**
+ * Static main.
+ *
+ * @param args Program arguments.
+ * @throws IOException If an IO error occurred.
+ */
+ public static void main(String[] args) throws IOException
+ {
+ // This is just a _hack_ ...
+ BufferedReader reader = null;
+ if(args.length == 0)
+ {
+ System.err.println("No input file specified.");
+ System.exit(-1);
+ }
+ if(args.length > 1)
+ {
+ reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "UTF-8"));
+ String line = reader.readLine();
+ while(line != null && !line.startsWith("