// napstr4u (Black Edition): RTF to HTML Convertor

// Very primitive RTF 2 HTML reader
// Converts tiny subset of RTF (from VS IDE) into html.
// Author: Mike Stall (http://blogs.msdn.com/jmstall)
// Gets input RTF from clipboard.
using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.IO;

namespace ClipBoard1
{
    // Console tool that converts the RTF found on the clipboard (as produced by the VS IDE)
    // into colorized HTML written to "out.html".
    // Only a tiny subset of RTF is understood: the \cf, \fs and \par control words plus the
    // escaped characters \\, \{ and \}. Anything else is rejected with an ArgumentException.
    class Program
    {
        // Lazily matches any text (possibly empty) up to and including the next '\'.
        // Group 1 is the text in front of the backslash.
        static readonly Regex m_textRegex =
            new Regex(@"(.*?)\\", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Matches a control word exactly at the scan position (\G): letters, an optional numeric
        // parameter, and an optional single space delimiter (the space belongs to the control word
        // and is consumed; any other delimiter is left in the stream).
        // '{' is kept in the word class, so a word containing it never equals a supported
        // control word and is reported as unrecognized.
        static readonly Regex m_ctrlWordRegex =
            new Regex(@"\G([\{a-z]+)([0-9]*) ?", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Entry point. Reads RTF from the clipboard, echoes the plain-text flavor to the console,
        // and writes the HTML rendering to "out.html" in the current directory.
        // Throws InvalidOperationException when the clipboard holds no RTF, and
        // ArgumentException when the RTF is not in the expected shape.
        [STAThread()]
        static void Main(string[] args)
        {
            Console.WriteLine("Get RTF from the clipboard.");
            IDataObject iData = Clipboard.GetDataObject();
            if (iData == null)
            {
                throw new InvalidOperationException("The clipboard is empty.");
            }

            string rtf = iData.GetData(DataFormats.Rtf) as string;
            if (rtf == null)
            {
                throw new InvalidOperationException("The clipboard does not contain RTF data.");
            }

            Console.WriteLine(iData.GetData(DataFormats.Text));

            string data = ExtractBody(rtf);

            // 'using' guarantees the file is flushed and closed even if Format throws.
            using (TextWriter tw = new StreamWriter("out.html"))
            {
                Format(tw, data);
            }
        }

        // Returns the part of the RTF document that follows the color table.
        // We assume the color table and font table are a standard preset used by VS,
        // which avoids the hassle of parsing them: everything up to the ";}" that ends
        // {\colortbl ...;} is skipped.
        // The final character of the document is dropped as well.
        // NOTE(review): presumably that character is the '}' closing the outermost group - confirm.
        // Throws ArgumentException when no color table can be located.
        static string ExtractBody(string rtf)
        {
            if (rtf == null)
            {
                throw new ArgumentNullException("rtf");
            }

            // IndexOf reports "not found" as -1; index 0 is a legitimate hit.
            int tableStart = rtf.IndexOf(@"{\colortbl", StringComparison.Ordinal);
            if (tableStart < 0)
            {
                throw new ArgumentException("Bad input RTF.");
            }

            int tableEnd = rtf.IndexOf(";}", tableStart, StringComparison.Ordinal);
            if (tableEnd < 0)
            {
                throw new ArgumentException("Bad input RTF.");
            }

            int bodyStart = tableEnd + 2;
            // Guard against a document that ends right after the color table,
            // where the computed length would be negative.
            int bodyLength = Math.Max(0, rtf.Length - bodyStart - 1);
            return rtf.Substring(bodyStart, bodyLength);
        }

        // Default color table used by VS's IDE.
        static string[] m_colorTable = new string[]
            {
               // rrGGbb
                "#000000", // default, starts at index 0
                "#000000", // real color table starts at index 1
                "#0000FF",
                "#00ffFF",
                "#00FF00",
                "#FF00FF",
                "#FF0000",
                "#FFFF00",
                "#FFffFF",
                "#000080",
                "#008080",
                "#008000",
                "#800080",
                "#800000",
                "#808000",
                "#808080",
                "#c0c0c0"
            };

        // Escapes the characters that are significant in HTML text content.
        // '&' must be replaced first so the entities produced for '<' and '>' are not re-escaped.
        static string Escape(string st)
        {
            st = st.Replace("&", "&amp;");
            st = st.Replace("<", "&lt;");
            st = st.Replace(">", "&gt;");
            return st;
        }

        // Converts the RTF data into an HTML stream written to 'tw'.
        // 'rtf' is the snippet past the font + color tables, so only control words and text remain.
        // The output is one <pre> element; each \cfN closes the current <span> and opens a new one
        // colored with entry N of m_colorTable. \fs and \par are ignored (line breaks present in
        // the text itself are written through unchanged).
        // Throws ArgumentNullException for null arguments and ArgumentException for a control
        // word that is not supported, a bad color index, or a dangling trailing backslash.
        static void Format(TextWriter tw, string rtf)
        {
            if (tw == null)
            {
                throw new ArgumentNullException("tw");
            }
            if (rtf == null)
            {
                throw new ArgumentNullException("rtf");
            }

            // NOTE(review): wrapper markup is a plain <pre> with inline-styled spans;
            // adjust here if the hosting page expects a specific class.
            tw.Write("<pre>");
            tw.Write("<span style=\"color:" + m_colorTable[0] + "\">");

            // Example: \fs20 \cf2 using\cf0  System;
            // root --> ('text' '\' ('control word' | 'escaped char'))+
            // 'control word'  --> (alpha)+ (numeric*) space?
            // 'escaped char' = 'x'. Some characters \, {, } are escaped: '\x' --> 'x'
            // @todo - handle embedded groups (begin with '{')

            int idx = 0;
            while (idx < rtf.Length)
            {
                // Get any text up to a '\'. No match means no backslash is left.
                Match m = m_textRegex.Match(rtf, idx);
                if (!m.Success)
                {
                    break;
                }

                // text will be empty if we have adjacent control words
                string stText = m.Groups[1].Value;
                tw.Write(Escape(stText));
                idx = m.Index + m.Length;

                if (idx >= rtf.Length)
                {
                    // A backslash with nothing after it cannot start a control word or an escape.
                    throw new ArgumentException("Dangling '\\' at end of RTF data after:" + stText);
                }

                // check for RTF escape characters. These are the only escaped chars handled here.
                char chNext = rtf[idx];
                if (chNext == '{' || chNext == '}' || chNext == '\\')
                {
                    // Escaped char; none of these needs HTML escaping.
                    tw.Write(chNext);
                    idx++;
                    continue;
                }

                // Must be a control word. A failed match leaves both groups empty and falls
                // through to the "unrecognized" error below.
                m = m_ctrlWordRegex.Match(rtf, idx);
                string stCtrlWord = m.Groups[1].Value;
                string stCtrlParam = m.Groups[2].Value;

                if (stCtrlWord == "cf")
                {
                    // Set font color. NumberStyles.None accepts digits only, so a missing or
                    // overflowing parameter is reported instead of raising FormatException.
                    int iColor;
                    bool parsed = Int32.TryParse(
                        stCtrlParam,
                        System.Globalization.NumberStyles.None,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out iColor);
                    if (!parsed || iColor >= m_colorTable.Length)
                    {
                        throw new ArgumentException("Bad color index '" + stCtrlParam + "' after:" + stText);
                    }

                    // close previous span, and start a new one for the given color.
                    tw.Write("</span>");
                    tw.Write("<span style=\"color:" + m_colorTable[iColor] + "\">");
                }
                else if (stCtrlWord == "fs")
                {
                    // Sets font size. ignore
                }
                else if (stCtrlWord == "par")
                {
                    // This is a newline. ignore
                    // @todo- I think the only reason we can ignore this is because the \par in our input are always followed by
                    // a '\r\n' and we're accidentally writing that.
                }
                else
                {
                    throw new ArgumentException("Unrecognized control word '" + stCtrlWord + stCtrlParam + "'after:" + stText);
                }
                idx += m.Length;
            }
            tw.Write(Escape(rtf.Substring(idx))); // rest of string

            tw.Write("</span></pre>\r\n");
        } // end Format()
    }
}
// Pages
//
// RTF to HTML Convertor