From bb920c8f6e6cf41c6a760de5dab4639c43b50f9f Mon Sep 17 00:00:00 2001 From: Rene' Jeschke Date: Mon, 18 Apr 2011 14:12:08 +0200 Subject: [PATCH] Updated README, added auto HTML entities, list bugfix, performance improvements. --- README.md | 83 +++++++++++++++----------- src/java/txtmark/Block.java | 50 ++++++++++++++++ src/java/txtmark/DefaultDecorator.java | 11 ++++ src/java/txtmark/Emitter.java | 80 +++++++++++++++++++++++-- src/java/txtmark/MarkToken.java | 20 ++++++- src/java/txtmark/Processor.java | 39 +++++------- src/java/txtmark/Run.java | 82 +++++++++++++++++++++++++ src/java/txtmark/Utils.java | 2 +- 8 files changed, 300 insertions(+), 67 deletions(-) create mode 100644 src/java/txtmark/Run.java diff --git a/README.md b/README.md index 150fac0..24e4427 100644 --- a/README.md +++ b/README.md @@ -109,8 +109,16 @@ This seemed to me as the easiest and safest way to enable different behaviours.
  • and this is not a list
  • -* More to come ... +* Auto HTML entities: + * `(C)` becomes `©` - © + * `(R)` becomes `®` - ® + * `(TM)` becomes `™` - ™ + * `--` becomes `—` - — + * `...` becomes `…` - … + * `<<` becomes `«` - « + * `>>` becomes `»` - » + * `"Hello"` becomes `“Hello”` - “Hello” ### Markdown conformity @@ -150,42 +158,43 @@ except of two: Based on [this benchmark suite](http://henkelmann.eu/2011/01/10/performance_comparison_of_markdown_processor_for_the_jvm). +Excerpt from the original post concerning this benchmark suite: + +> Most of these tests are of course unrealistic: Who would write a +> text where each word is a link? Yet they serve an important use: +> It makes it possible for the developer to pinpoint the parts of +> the parser where there is most room for improvement. Also, it +> explains why certain texts might render much faster in one +> Processor than in another. + +Benchmark system: + +* Ubuntu Linux 10.04 32 Bit +* Intel(R) Core(TM) 2 Duo T7500 @ 2.2GHz +* Java(TM) SE Runtime Environment (build 1.6.0_24-b07) +* Java HotSpot(TM) Server VM (build 19.1-b02, mixed mode) + + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + +
    TestActuariusPegDownKnockoffTxtmark
    1st Run (ms)2nd Run (ms)1st Run (ms)2nd Run (ms)1st Run (ms)2nd Run (ms)1st Run (ms)2nd Run (ms)
    Plain Paragraphs887461245522367645688947
    Every Word Emphasized222020773411340630503305147266
    Every Word Strong238422702456246623639235776257
    Every Word Inline Code8248042337223723506236225455
    Every Word a Fast Link3942373811641159862185958968
    Every Word Consisting of Special XML Chars939393127544731480160835873614
    Every Word wrapped in manual HTML tags68436828185018598699869211691154
    Every Line with a manual line break85972429682946217119905856
    Every word with a full link52850122522280351335126660
    Every word with a full image39537424632569375737265655
    Every word with a reference link1920819035391833871024345024494318261798
    Every block a quote465449268726849789774848
    Every block a codeblock1511345976012702623627
    Every block a list1209110634483432141113685260
    All tests together6062604211556115891982719637452448
    Plain Paragraphs11275771273103774040015764
    Every Word Emphasized156210011523151313982132215446
    Every Word Strong112599711151114954396474441
    Every Word Inline Code38227710581052911690745139
    Every Word a Fast Link225716005375313980341010955
    Every Word Consisting of Special XML Chars4045427029853044312377778775
    Every Word wrapped in manual HTML tags33342919901896386337367362
    Every Line with a manual line break51058814451440152711305656
    Every word with a full link4522461045996188418198655
    Every word with a full image26815011401132198519083836
    Every word with a reference link98479082189561871912113611541615251380
    Every block a quote445206131213014784575045
    Every block a codeblock70873733761611756022
    Every block a list920912172017256226515555
    All tests together32812885518451961013010460206196
    -* Q: Why is Txtmark so slow when it comes to XML entities? -* A: Because Txtmark does some sanity checks on XML entities to make sure - it outputs valid XML. For example: - - &cutie; - - will produce (when processed with Markdown and most other markdown processors): - - &cutie; - - and - - &cutie; - - when processed with Txtmark. - Benchmarked versions: [Actuarius] version: 0.2 [PegDown] version: 0.8.5.4 @@ -193,10 +202,11 @@ Benchmarked versions: --- -[Markdown] is copyright (c) 2004 by John Gruber -[Actuarius] is copyright (c) 2010 by Christoph Henkelmann -[Knockoff] is copyright (c) 2009-2011 by Tristan Juricek -[PegDown] is copyright (c) 2010 by Mathias Doenitz +Mentioned/related projects: +[Markdown] is Copyright (C) 2004 by John Gruber +[Actuarius] is Copyright (C) 2010 by Christoph Henkelmann +[Knockoff] is Copyright (C) 2009-2011 by Tristan Juricek +[PegDown] is Copyright (C) 2010 by Mathias Doenitz *** @@ -206,5 +216,6 @@ Benchmarked versions: [PegDown]: https://github.com/sirthias/pegdown [tar]: https://github.com/rjeschke/txtmark/tarball/master "branch: master" [zip]: https://github.com/rjeschke/txtmark/zipball/master "branch: master" +[$PROFILE$]: extended "Txtmark processing information." Project link: diff --git a/src/java/txtmark/Block.java b/src/java/txtmark/Block.java index c09fede..b9550d2 100644 --- a/src/java/txtmark/Block.java +++ b/src/java/txtmark/Block.java @@ -230,4 +230,54 @@ class Block this.lineTail = line; } } + + /** + * Changes all Blocks of type NONE to PARAGRAPH if this Block + * is a List and any of the ListItems contains a paragraph. 
+ */ + public void expandListParagraphs() + { + if(this.type != BlockType.ORDERED_LIST && this.type != BlockType.UNORDERED_LIST) + { + return; + } + Block outer = this.blocks, inner; + boolean hasParagraph = false; + while(outer != null && !hasParagraph) + { + if(outer.type == BlockType.LIST_ITEM) + { + inner = outer.blocks; + while(inner != null && !hasParagraph) + { + if(inner.type == BlockType.PARAGRAPH) + { + hasParagraph = true; + } + inner = inner.next; + } + } + outer = outer.next; + } + if(hasParagraph) + { + outer = this.blocks; + while(outer != null) + { + if(outer.type == BlockType.LIST_ITEM) + { + inner = outer.blocks; + while(inner != null) + { + if(inner.type == BlockType.NONE) + { + inner.type = BlockType.PARAGRAPH; + } + inner = inner.next; + } + } + outer = outer.next; + } + } + } } diff --git a/src/java/txtmark/DefaultDecorator.java b/src/java/txtmark/DefaultDecorator.java index ab1e680..cd8c60f 100644 --- a/src/java/txtmark/DefaultDecorator.java +++ b/src/java/txtmark/DefaultDecorator.java @@ -7,6 +7,17 @@ package txtmark; /** * Default Decorator implementation. * + *

    Example of a user-defined Decorator that adds a class attribute to <p> tags.

    + *
    public class MyDecorator extends DefaultDecorator
    + *{
    + *    @Override
    + *    public void openParagraph(StringBuilder out)
    + *    {
    + *        out.append("<p class=\"myclass\">");
    + *    }
    + *}
    + *
    + * * @author René Jeschke */ public class DefaultDecorator implements Decorator diff --git a/src/java/txtmark/Emitter.java b/src/java/txtmark/Emitter.java index fe8ea99..3d97493 100644 --- a/src/java/txtmark/Emitter.java +++ b/src/java/txtmark/Emitter.java @@ -17,6 +17,8 @@ class Emitter private final HashMap linkRefs = new HashMap(); /** The Decorator. */ private Decorator decorator; + /** Extension flag. */ + public boolean useExtensions = false; /** Constructor. */ public Emitter(final Decorator decorator) @@ -311,7 +313,6 @@ class Emitter * @param start Starting position. * @return The new position or -1 if nothing valid has been found. */ - // TODO ... hm ... refactor this private int checkHtml(final StringBuilder out, final String in, int start) { final StringBuilder temp = new StringBuilder(); @@ -319,8 +320,8 @@ class Emitter // Check for auto links temp.setLength(0); - pos = Utils.readUntil(temp, in, start + 1, ':'); - if(pos != -1 && HTML.isLinkPrefix(temp.toString())) + pos = Utils.readUntil(temp, in, start + 1, ':', ' ', '>', '\n'); + if(pos != -1 && in.charAt(pos) == ':' && HTML.isLinkPrefix(temp.toString())) { pos = Utils.readUntil(temp, in, pos, '>'); if(pos != -1) @@ -338,8 +339,8 @@ class Emitter // Check for mailto auto link temp.setLength(0); - pos = Utils.readUntil(temp, in, start + 1, '@'); - if(pos != -1) + pos = Utils.readUntil(temp, in, start + 1, '@', ' ', '>', '\n'); + if(pos != -1 && in.charAt(pos) == '@') { pos = Utils.readUntil(temp, in, pos, '>'); if(pos != -1) @@ -534,6 +535,40 @@ class Emitter out.append("&"); } break; + case X_COPY: + out.append("©"); + pos += 2; + break; + case X_REG: + out.append("®"); + pos += 2; + break; + case X_TRADE: + out.append("™"); + pos += 3; + break; + case X_MDASH: + out.append("—"); + pos++; + break; + case X_HELLIP: + out.append("…"); + pos += 2; + break; + case X_LAQUO: + out.append("«"); + pos++; + break; + case X_RAQUO: + out.append("»"); + pos++; + break; + case X_RDQUO: + out.append("”"); + 
break; + case X_LDQUO: + out.append("“"); + break; case ESCAPE: pos++; //$FALL-THROUGH$ @@ -559,6 +594,7 @@ class Emitter final char c = in.charAt(pos); final char c1 = pos + 1 < in.length() ? in.charAt(pos + 1) : ' '; final char c2 = pos + 2 < in.length() ? in.charAt(pos + 2) : ' '; + final char c3 = pos + 3 < in.length() ? in.charAt(pos + 3) : ' '; switch(c) { @@ -608,10 +644,44 @@ class Emitter return MarkToken.NONE; } case '<': + if(this.useExtensions && c1 == '<') + return MarkToken.X_LAQUO; return MarkToken.HTML; case '&': return MarkToken.ENTITY; default: + if(this.useExtensions) + { + switch(c) + { + case '-': + if(c1 == '-') + return MarkToken.X_MDASH; + break; + case '>': + if(c1 == '>') + return MarkToken.X_RAQUO; + break; + case '.': + if(c1 == '.' && c2 == '.') + return MarkToken.X_HELLIP; + break; + case '(': + if(c1 == 'C' && c2 == ')') + return MarkToken.X_COPY; + if(c1 == 'R' && c2 == ')') + return MarkToken.X_REG; + if(c1 == 'T' & c2 == 'M' & c3 == ')') + return MarkToken.X_TRADE; + break; + case '"': + if(!Character.isLetterOrDigit(c0) && c1 != ' ') + return MarkToken.X_LDQUO; + if(c0 != ' ' && !Character.isLetterOrDigit(c1)) + return MarkToken.X_RDQUO; + break; + } + } return MarkToken.NONE; } } diff --git a/src/java/txtmark/MarkToken.java b/src/java/txtmark/MarkToken.java index 968a3e8..d7673b7 100644 --- a/src/java/txtmark/MarkToken.java +++ b/src/java/txtmark/MarkToken.java @@ -34,5 +34,23 @@ enum MarkToken /** & */ ENTITY, // & /** \ */ - ESCAPE // \x + ESCAPE, // \x + /** Extended: © */ + X_COPY, // (C) + /** Extended: ® */ + X_REG, // (R) + /** Extended: ™ */ + X_TRADE, // (TM) + /** Extended: « */ + X_LAQUO, // << + /** Extended: » */ + X_RAQUO, // >> + /** Extended: — */ + X_MDASH, // -- + /** Extended: … */ + X_HELLIP, // ... 
+ /** Extended: ” */ + X_RDQUO, // " + /** Extended: “ */ + X_LDQUO // " } diff --git a/src/java/txtmark/Processor.java b/src/java/txtmark/Processor.java index 53145fd..5752b7d 100644 --- a/src/java/txtmark/Processor.java +++ b/src/java/txtmark/Processor.java @@ -16,6 +16,10 @@ import java.io.StringReader; /** * Markdown processor class. * + *

    Example usage:

    + *
    String result = Processor.process("This is ***TXTMARK***");
    + * 
    + * * @author René Jeschke */ public class Processor @@ -326,7 +330,7 @@ public class Processor { if(id.toLowerCase().equals("$profile$")) { - this.useExtensions = link.toLowerCase().equals("extended"); + this.emitter.useExtensions = this.useExtensions = link.toLowerCase().equals("extended"); lastLinkRef = null; } else @@ -401,7 +405,7 @@ public class Processor // TODO ... paragraphs and lists seems to be not working correctly private void recurse(final Block root, boolean listMode) { - Block block; + Block block, list; Line line = root.lines; while(line != null && line.isEmpty) line = line.next; if(line == null) @@ -410,8 +414,6 @@ public class Processor if(listMode) root.removeListIndent(); - boolean hasParagraph = false; - while(line != null) { final LineType type = line.getLineType(); @@ -446,7 +448,6 @@ public class Processor root.split(line == null ? root.lineTail : line).type = bt; root.removeLeadingEmptyLines(); } - hasParagraph |= bt == BlockType.PARAGRAPH; line = root.lines; } break; @@ -521,36 +522,26 @@ public class Processor break; line = line.next; } - block = root.split(line != null ? line.previous : root.lineTail); - block.type = type == LineType.OLIST ? BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST; - block.lines.prevEmpty = false; - block.lineTail.nextEmpty = false; - block.removeSurroundingEmptyLines(); - block.lines.prevEmpty = block.lineTail.nextEmpty = false; - this.initListBlock(block); - block = block.blocks; + list = root.split(line != null ? line.previous : root.lineTail); + list.type = type == LineType.OLIST ? 
BlockType.ORDERED_LIST : BlockType.UNORDERED_LIST; + list.lines.prevEmpty = false; + list.lineTail.nextEmpty = false; + list.removeSurroundingEmptyLines(); + list.lines.prevEmpty = list.lineTail.nextEmpty = false; + this.initListBlock(list); + block = list.blocks; while(block != null) { this.recurse(block, true); block = block.next; } + list.expandListParagraphs(); break; default: line = line.next; break; } } - - if(listMode && hasParagraph) - { - block = root; - while(block != null) - { - if(block.type == BlockType.NONE) - block.type = BlockType.PARAGRAPH; - block = block.next; - } - } } /** diff --git a/src/java/txtmark/Run.java b/src/java/txtmark/Run.java new file mode 100644 index 0000000..333fdcb --- /dev/null +++ b/src/java/txtmark/Run.java @@ -0,0 +1,82 @@ +/* +* Copyright (C) 2011 René Jeschke +* See LICENSE.txt for licensing information. +*/ +package txtmark; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +/** + * Simple class for processing markdown files on the command line. + * + *

    Usage:

    + *
    java -cp txtmark.jar txtmark.Run filename [header_footer_file]
    + * 
    + * + *

    The header_footer_file is an optional UTF-8 encoded file containing + * a header and a footer to output around the generated HTML code.

    + * + *

    Example:

    + * + *
    <?xml version="1.0" encoding="UTF-8"?>
    + *<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    + *                      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    + *<html xmlns="http://www.w3.org/1999/xhtml">
    + *<head>
    + *<title>markdown</title>
    + *<link type="text/css" href="style.css" rel="stylesheet"/>
    + *<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
    + *</head>
    + *<body>
    + *<!-- the following line separates header from footer -->
    + *<!-- ### -->
    + *</body>
    + *</html>
    + *
    + * + * @author René Jeschke + */ +public class Run +{ + /** + * Static main. + * + * @param args Program arguments. + * @throws IOException If an IO error occurred. + */ + public static void main(String[] args) throws IOException + { + // This is just a _hack_ ... + BufferedReader reader = null; + if(args.length == 0) + { + System.err.println("No input file specified."); + System.exit(-1); + } + if(args.length > 1) + { + reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "UTF-8")); + String line = reader.readLine(); + while(line != null && !line.startsWith("