Create complex RegExps more easily

When I was writing my linear-gradient() to -webkit-gradient() converter, I knew in advance that I would have to use a quite large regular expression to validate and parse the input. Such a regex would be incredibly hard to read and debug, so I tried to find a way to break it down into reusable parts.

Turns out JavaScript regular expression objects have a .source property that can be used in the RegExp constructor to create a new RegExp out of another one. So I wrote a new function that takes a string with identifiers for regexp replacements in {{ and }} and replaces them with the corresponding sub-regexps, taken from an object literal as a second argument:

/**
 * Create complex regexps in an easy to read way.
 *
 * Every {{id}} placeholder in str is replaced with the matching entry of
 * replacements, wrapped in a non-capturing group. A placeholder that is
 * already parenthesised, ({{id}}), becomes a single capturing group instead
 * of getting a second pair of parentheses. Ids are matched case-insensitively.
 * Placeholders that have no entry in replacements are left as they are.
 *
 * @param str {String} Final regex with {{id}} for replacements
 * @param replacements {Object} Object with the replacements (RegExp objects
 *        or regex source strings). A leading ^ and a trailing unescaped $
 *        are stripped from RegExp replacements before they are embedded.
 * @param flags {String} Just like the flags argument in the RegExp constructor
 * @return {RegExp} The assembled regular expression
 */
RegExp.create = function(str, replacements, flags) {
	// Escape regex metacharacters so that an id is always matched literally
	function escapeId(id) {
		return id.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
	}

	// Drop the leading ^ and the trailing $ anchor, leaving an escaped \$ alone
	function stripAnchors(source) {
		var body = source.replace(/^\^/, ''),
			tail = /(\\*)\$$/.exec(body);

		// An even number of backslashes before the final $ means it is a real anchor
		if(tail && tail[1].length % 2 === 0) {
			body = body.slice(0, -1);
		}

		return body;
	}

	// Substitute one id; function replacers keep `$&`, `$1`, `$$` etc. inside
	// the replacement from being expanded by String.prototype.replace
	function substitute(template, id, replacement) {
		var placeholder = '\\{\\{' + escapeId(id) + '\\}\\}';

		// Don't add extra parentheses if they already exist
		template = template.replace(RegExp('\\(' + placeholder + '\\)', 'gi'), function() {
			return '(' + replacement + ')';
		});

		return template.replace(RegExp(placeholder, 'gi'), function() {
			return '(?:' + replacement + ')';
		});
	}

	for(var id in replacements) {
		// Ignore enumerable properties inherited from the prototype chain
		if(!Object.prototype.hasOwnProperty.call(replacements, id)) {
			continue;
		}

		var replacement = replacements[id];

		if(replacement.source) {
			replacement = stripAnchors(replacement.source);
		}

		str = substitute(str, id, replacement);
	}

	return RegExp(str, flags);
};

If you don’t like adding a function to the RegExp object, you can name it however you want. Here’s how I used it for my linear-gradient() parser:

// Library of validators for the linear-gradient() parser. Each piece is
// anchored with ^ and $ so it can validate a whole string by itself;
// RegExp.create strips those anchors when a piece is embedded in a larger regex.
self.regex = {};

self.regex.number = /^-?[0-9]*\.?[0-9]+$/;

// Both alternatives sit inside one group so that ^ and $ apply to the whole
// keyword, rather than ^ to the first alternative and $ to the second.
self.regex.keyword = /^(?:(?:top\s+|bottom\s+)?(?:right|left)|(?:right\s+|left\s+)?(?:top|bottom))$/;

// A direction is a side/corner keyword, an angle in degrees, or a bare 0
self.regex.direction = RegExp.create('^(?:{{keyword}}|{{number}}deg|0)$', {
	keyword: self.regex.keyword,
	number: self.regex.number 
});

// A color is a named keyword, an rgb()/rgba()/hsl()/hsla() function or a hex value
self.regex.color = RegExp.create('(?:{{keyword}}|{{func}}|{{hex}})', {
	keyword: /^(?:red|tan|grey|gray|lime|navy|blue|teal|aqua|cyan|gold|peru|pink|plum|snow|[a-z]{5,20})$/,
	func: RegExp.create('^(?:rgb|hsl)a?\\((?:\\s*{{number}}%?\\s*,?\\s*){3,4}\\)$', {
		number: self.regex.number
	}),
	hex: /^#(?:[0-9a-f]{1,2}){3}$/
});

self.regex.percentage = RegExp.create('^(?:{{number}}%|0)$', {
	number: self.regex.number
});

// A length is a number followed by a unit, or a bare 0
self.regex.length = RegExp.create('{{number}}{{unit}}|0', {
	number: self.regex.number,
	unit: /%|px|mm|cm|in|em|rem|en|ex|ch|vm|vw|vh/
});

// A color stop is a color optionally followed by a position
self.regex.colorStop = RegExp.create('{{color}}\\s*{{length}}?', {
	color: self.regex.color,
	length: self.regex.length
}, 'g');

// Capture group 1 is the optional direction, group 2 the comma-separated
// list of color stops (at least two). Placeholders are spelled like the keys.
self.regex.linearGradient = RegExp.create('^linear-gradient\\(\\s*(?:({{direction}})\\s*,)?\\s*({{colorStop}}\\s*(?:,\\s*{{colorStop}}\\s*)+)\\)$', {
	direction: self.regex.direction,
	colorStop: self.regex.colorStop
}, 'i');

(self in this case was a local variable, not the window object)

  • http://twitter.com/marcoos Marek Stępień

    “If you don’t like adding a function to the RegExp object”

    The general rule is: don’t mess with objects you don’t own – it will hurt you sooner or later.

    In particular, there is a possibility of .create methods on other built-in constructors: just like you have Object.create() now, you may get Array.create(), String.create() and even RegExp.create() in ECMAScript.next, see: http://j.mp/f7Vs9H

    • http://leaverou.me Lea Verou

      It *might* hurt you, not it *will* hurt you. You’re grossly exaggerating. Not every single method every single developer comes up with will be added to standard JavaScript.
      As for this particular case, that’s why I said if you don’t like adding it to the RegExp object, just assign it to a variable or something. I, personally, don’t mind taking my chances. I find it looks better and makes the code more readable, so I’m willing to take the risk. And since I’m not trying to preach my way of thinking about this as what should be done, then I don’t see what the problem is.

  • Katie McKinsey

    Hi Lea,

    I’m a content curator for a developer website that has over 500,000 registered users. After reading through your blog, we’ve decided to invite you to join our Most Valuable Blogger program. We’re creating a section of our site devoted to web standard (CSS3, HTML5, JavaScript, etc.).

    If you want to hear more about the program, contact me at my direct email address: katie (at) dzone (dot) com.

    -Katie-

  • http://twitter.com/stephband stephband

    Hello!

    I’ve been curious about this idea – it’s great! I got a chance to take a good look this morning. I prefer defining my regexps as /regexps/ rather than strings, so I got to tinkering… then I noticed you loop over all the possible replacements and test for their ids in the template you are creating, and I decided to do the reverse – find the ids in the template and see if they exist in the replacements object. So here’s a variation…

    /**
     * Build a RegExp from a template regexp containing {{key}} placeholders.
     *
     * Each {{key}} is replaced with the source of obj[key] wrapped in a
     * non-capturing group; a parenthesised ({{key}}) becomes a capturing group.
     * Flags (g, i, m) are copied from the template regexp.
     *
     * @param regex {RegExp} Template regexp with {{key}} placeholders
     * @param obj {Object} Map of key to RegExp replacement
     * @return {RegExp} The assembled regular expression
     * @throws {Error} If a placeholder has no entry in obj
     */
    RegExp.create = (function(){
        function replaceFn(obj) {
            return function($0, $1, $2) {
                // $1 exists where {{key}} is matched, while $2
                // exists where ({{key}}) is matched.
                var key = $1 || $2,
                    r = obj[key];

                if (!r) {
                    throw new Error("Exception: attempting to build RegExp but obj['" + key + "'] is undefined.");
                }

                // Strip out beginning and end matchers
                r = r.source.replace(/^\^|\$$/g, '');

                // Return either a non-capturing group or a capturing
                // group depending on the original match.
                return [($1 ? '(?:' : '('), r, ')'].join('');
            };
        }

        return function(regex, obj) {
            return RegExp(
                // The parenthesised alternative can only match starting at "(",
                // which comes before "{{", so it wins whenever it applies.
                regex.source.replace(/\{\{([a-zA-Z0-9]+)\}\}|\(\{\{([a-zA-Z0-9]+)\}\}\)/g, replaceFn(obj)),
                (regex.global ? 'g' : '') +
                (regex.ignoreCase ? 'i' : '') +
                (regex.multiline ? 'm' : '')
            );
        };
    })();

    You use it in much the same way, but you pass in a regexp object and the replacements object. No need for flags, because they are copied from the regexp object:

    // The g and i flags are read from the template regexp itself, so no flags argument is passed
    var r = RegExp.create( /regex{{template}}/gi, replacementsObj );

    Ok. Cheers for the ideas! Stephen.

    • http://twitter.com/stephband stephband

      Oh, and one more thing – it throws an exception if a replacement is not found in replacementsObj.

      Cheers!

      • http://leaverou.me Lea Verou

        Maybe a notice (console.info()) or a warning (console.warn()) would be more appropriate than an exception. Maybe the developer actually **wants** to match {{something}}

        • http://twitter.com/stephband stephband

          Well, that is a good point, and maybe he does. I did consider that, and took the view that when you’re writing templates for stuff, like Django templates for example, you want to bork when you’re trying to use something that doesn’t exist. But you may be right, a warning may be better.

          And now that you mention it, I realise you couldn’t parse a Django-like template with this because it would throw at {{undefined}} :)

          Hmm.

        • http://twitter.com/stephband stephband

          I edited the post to put your ideas in. I’m sure the regex

          /{{([a-z\d]+)}}|\({{([a-z\d]+)}}\)/gi

          could be written better – I don’t like the way I’m using a second capturing group to detect whether the template tag is bracketed or not.

        • http://leaverou.me Lea Verou

          How about using /\(?{{([a-z\d]+)}}\)?/gi instead?
          Btw have you actually tried your regexp? I’m afraid that the first will always be the one that matches, even when both do, since that’s how regexp engines work (try ‘ab’.match(/a|ab/) and you’ll see what I mean)

        • http://twitter.com/stephband stephband

          The trouble with that is that it will match ({{id}} and {{id}}), neither of which is what we want.

          I had the same worry as you, but I was surprised that in my tests it worked without problem. I think that’s because it gets to the parenthesis first – ie, it matches ( before it matches {, and by the time everything following ( is replaced it can no longer match {. But it does work :)

          Try ‘ab’.match(/b|ab/) and you’ll see what I mean.

    • http://leaverou.me Lea Verou

      Love it! It would be interesting to compare the performance of the two. Probably yours is much faster but JS performance sometimes surprises me.

      • http://twitter.com/stephband stephband

        My hunch is that yours will be faster (I’m currying a function on each call to create), until the replacementsObj becomes large, at which point mine will overtake (you loop over the whole replacementsObj on each call to create).

        But it is just a hunch.

        I think performance is secondary for this – I don’t expect to be doing it often enough in my code to worry that it will slow anything down. It’s a sort of use-once-and-forget method.

    • http://leaverou.me Lea Verou

      Btw instead of using [a-zA-Z0-9] why don’t you specify the i flag so it could become [a-z0-9] or even [a-z\d]

  • Anonymous

    I’ve been doing this sort of thing for a couple of years at work in Python, using verbose regular expressions and the built-in string formatting, to maintain a ~5KB regular expression. The ability to use comments and indentation has turned a potential nightmare into a fun project. I didn’t realize that this sort of composition was possible in javascript–we actually use the Python-based regular expression to generate a compact and javascript-compatible version:

    # Drop everything from the first "#" on each line, trim whitespace, and join the lines into one compact pattern
    compact_pattern = ''.join(n.split('#')[0].strip() for n in verbose_pattern.splitlines())

  • Shahar_m

    Could you share a few examples of How-To use?

    • http://leaverou.me Lea Verou

      I did, in the blog post above…

  • Steven Levithan

    Lea, have you tried XRegExp? (http://git.io/xregexp ) That lets you use the x flag for creating regexes with free-spacing and comments, which is a more common/alternative approach for increasing RegExp readability. And since XRegExp’s syntax is extendable, you could use your {{id}} syntax more directly using something like this:

    // Library of named sub-patterns that {{name}} tokens refer to
    var regexLib = {
        number: /-?\d+(?:\.\d+)?/
    };

    XRegExp.install("extensibility");
    // Teach XRegExp a {{name}} token that expands to the named sub-pattern
    // wrapped in a non-capturing group; unknown names raise a ReferenceError.
    XRegExp.addToken(
        /{{([\w$]+)}}/,
        function (match) {
            if (XRegExp.isRegExp(regexLib[match[1]])) {
                return "(?:" + regexLib[match[1]].source + ")";
            }
            throw new ReferenceError("unknown: " + match[1]);
        }
    );

    var regex = XRegExp("^{{number}}$");
    regex.test("3.14"); // true

    • http://leaverou.me Lea Verou

      Hi Steven,

      Honored to see you drop by here. How did you find my blog? :)

      I’m well aware of XRegExp. :) It’s a very impressive script and I was planning to devote at least a slide to it in my upcoming regexp talk.  However, I don’t have first hand experience with it, as it always felt a little bit too much for the projects I personally considered it for. Didn’t know that it was extensible like that though, that’s incredible! I will certainly need to give it a go.

      • Steven Levithan

        Found this post after seeing you were giving a RegExp talk at O’Reilly’s 2012 Fluent Conference (neat-o!), but I’ve been stopping by your blog occasionally for at least a couple years. :)

        If you do try it out, I’d recommend the v2 beta on GitHub since parts of the API have changed. Feel free to email me if you have questions about it. Note that extending XRegExp’s syntax like this has a variety of advantages over search-and-replace with regexp source, including the ability to specify whether the syntax applies outside of character classes only (the default), inside character classes, or both (you can even apply different handling for default/class scopes via one handler function); automatic support for backslash-escaping of the new syntax; the ability to apply your new syntax only when a custom flag is used; and so on.

        • http://leaverou.me Lea Verou

          That sounds awesome!! I should definitely try to fully port this to XRegExp. Although in many cases, I’d still prefer this one, purely due to smaller filesize & lack of dependencies.

        • http://stevenlevithan.com/ Steven Levithan

          Understandable that XRegExp might sometimes be overkill.
          In case it helps, though, here’s one way to do a full port:

          /**
           * XRegExp port of RegExp.create: builds a regex from a pattern
           * containing {{id}} placeholders. Ids are case-insensitive.
           *
           * @param pattern {String} Pattern with {{id}} placeholders
           * @param data {Object} Replacements (RegExp objects or source strings);
           *        this object is not modified
           * @param flags {String} Flags passed through to XRegExp
           * @throws {ReferenceError} If a placeholder has no replacement
           */
          XRegExp.create = function (pattern, data, flags) {
              var data2 = {}, id, value, regex;
              for (id in data) {
                  value = data[id];
                  if (XRegExp.isRegExp(value)) {
                      // Strip the leading ^ and trailing $ anchors; work on a
                      // local copy so the caller's object is left untouched
                      value = value.source.replace(/^\^|\$$/g, "");
                  }
                  data2[id.toLowerCase()] = value;
              }
              // The {{..}} token below is only active while this property exists
              XRegExp.create.data = data2;
              try {
                  regex = XRegExp(pattern, flags);
              } catch (err) {
                  throw err;
              } finally {
                  // delete the data even if an "unknown property" error was thrown
                  delete XRegExp.create.data;
              }
              return regex;
          };
          XRegExp.install("extensibility"); // if not already done
          XRegExp.addToken(
              /{{([^}]+)}}/,
              function (match) {
                  var id = match[1].toLowerCase();
                  if (!XRegExp.create.data[id]) {
                      throw new ReferenceError("unknown property: " + id);
                  }
                  return "(?:" + XRegExp.create.data[id] + ")";
              }, {
                  trigger: function () {return !!XRegExp.create.data;},
                  scope: "all"
              }
          );

          Because of the trigger function provided with the new XRegExp token, the {{..}} syntax works only for regexes created by XRegExp.create. And because XRegExp.create passes to XRegExp, you can mix your new syntax with other fancy XRegExp features like named capture and s, x, or n flags. You also get the things I mentioned previously like support for escaped {{..}}. If you don’t want the {{..}} syntax to work inside character classes, just remove the scope option near the end, or change its value to “default”.

          Hell…if you want, adapt this as you please (or don’t) and release it as your own XRegExp addon. :) I did the port, but it’s your concept and it seems fairly useful. (Note that this will work only with XRegExp 2+.)

        • http://stevenlevithan.com/ Steven Levithan

          This reply box is starting to make me claustrophobic… :P

          I just noticed that you previously told stephband you might prefer to not include error handling so that {{something}} can match itself if there is no replacements.something property. Although that’s less relevant here since XRegExp lets you escape it as \{{something}}, the following version works like the original and doesn’t throw exceptions (which removes ~10 out of ~30 lines):

          /**
           * XRegExp port of RegExp.create that never throws for unknown ids:
           * a {{id}} with no replacement is left in the pattern as-is.
           * Ids are case-sensitive.
           *
           * @param str {String} Pattern with {{id}} placeholders
           * @param replacements {Object} Replacements (RegExp objects or source
           *        strings); this object is not modified
           * @param flags {String} Flags passed through to XRegExp
           */
          XRegExp.create = function (str, replacements, flags) {
            var data = {}, id, value, regex;
            for (id in replacements) {
              value = replacements[id];
              if (XRegExp.isRegExp(value)) {
                // Strip the leading ^ and trailing $ anchors on a local copy
                value = value.source.replace(/^\^|\$$/g, "");
              }
              data[id] = value;
            }
            // The {{..}} token below is only active while this property exists
            XRegExp.create.data = data;
            try {
              regex = XRegExp(str, flags);
            } finally {
              // Always clear the data so the token does not stay switched on
              delete XRegExp.create.data;
            }
            return regex;
          };
          XRegExp.addToken(
            /{{([^}]+)}}/,
            function (match) {
              var value = XRegExp.create.data[match[1]];
              return value !== undefined ? "(?:" + value + ")" : match[0];
            }, {
              trigger: function () {return !!XRegExp.create.data;},
              scope: "all"
            }
          );

          Note that I’ve also removed the case insensitivity of property names in this version, since IMHO it’s better without that.

        • Anonymous

          Ack, I guess Disqus doesn’t like two-space tabs (and I can’t edit since I posted as a guest).

  • Pingback: Creating Grammatical Patterns Using XRegExp.build

  • Anonymous

    Lea, I think this is a great idea, and I especially like how self-contained it is. I wanted it to allow for entering in real RegExps, and not just strings, so I added it in… the flags parameter is now optional, but this should be a complete drop in replacement for your original version. It also supports custom delimiters. I was hoping to find an alternative to the {{}}, since it gets very busy visually very quickly. I tried to maintain your code style while writing it. I hope it helps someone!


    /**
     * Create complex regexps in an easy to read way.
     *
     * Placeholders (by default {{id}}) are replaced with the matching entry of
     * replacements, wrapped in a non-capturing group; a parenthesised
     * placeholder, ({{id}}), becomes a single capturing group instead.
     *
     * @param regex {RegExp|String} Template with placeholders
     * @param replacements {Object} Replacements (RegExp objects or source strings)
     * @param flags {String} Optional. When omitted and regex is a RegExp, its
     *        m, g and i flags are reused; otherwise no flags are set.
     * @return {RegExp} The assembled regular expression
     */
    RegExp.create = function(regex, replacements, flags) {

        // Delimiters are used as regex source fragments, exactly as configured
        var sDelim = RegExp.create.delimiters[0],
            eDelim = RegExp.create.delimiters[1];

        var isRegExp = Boolean(regex) && typeof regex.source === 'string';

        var str = isRegExp
            ? regex.source
            : regex;

        // Function replacers keep `$&`, `$1`, `$$` etc. inside a replacement
        // from being expanded by String.prototype.replace
        function substitute(template, pattern, text) {
            return template.replace(pattern, function() { return text; });
        }

        for(var id in replacements) {
            var replacement = replacements[id],
                idRegExp = RegExp(sDelim + id + eDelim, 'gi'),
                // Double backslashes: the parentheses must be literal in the regex
                parenRegExp = RegExp('\\(' + idRegExp.source + '\\)', 'gi');

            if(replacement.source) {
                replacement = replacement.source.replace(/^\^|\$$/g, '');
            }

            // Don't add extra parentheses if they already exist
            str = substitute(str, parenRegExp, '(' + replacement + ')');

            str = substitute(str, idRegExp, '(?:' + replacement + ')');
        }

        // Explicit flags win; the parentheses matter because ?: binds looser than ||
        flags = flags || (isRegExp
            ? (regex.multiline ? 'm' : '')
            + (regex.global ? 'g' : '')
            + (regex.ignoreCase ? 'i' : '')
            : '');

        return RegExp(str, flags);
    };
    RegExp.create.delimiters = [ '{{', '}}' ];

  • Pingback: JSSpy » Creating grammatical regexes using xregexp.build