babel: Bug: wrong unicode regexp

Here is my code:

var a = 'a',                        //usual character
    s = String.fromCharCode(55296), //start of surrogate pair
    e = String.fromCharCode(56320); //end of surrogate pair

console.log((a + e).match(/./ug).length); //must print "2" but prints "1" after babel
console.log((e + e).match(/./ug).length); //must print "2" but prints "1" after babel

Babel transforms it to this code:

var a = 'a',
    s = String.fromCharCode(55296),
    e = String.fromCharCode(56320);

console.log((a + e).match(/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g).length); //must print "2" but prints "1" after babel
console.log((e + e).match(/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g).length); //must print "2" but prints "1" after babel

As you can see, the “.” pattern is transpiled incorrectly: the generated regexp skips a lone low surrogate that directly follows another character.

Currently babel converts it to:

/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/

but must convert to:

/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?)/

This is not only much shorter, but it also avoids this kind of bug.

I also wrote a test:

'use strict';



// Stand-alone fallback. The intended way to run this file is the "mocha" command
// (npm install -g mocha, then move the file into a directory named "test").
// When mocha's globals are absent (e.g. plain `node file.js`), install minimal
// stand-ins so the suites still execute.
if (typeof describe === 'undefined') {
	// Ignores the title and runs the body synchronously, passing its result through.
	const runImmediately = function(title, body) {
		return body();
	};
	global.describe = runImmediately;
	global.it = runImmediately;
}



const assert = require('assert');



// Fixtures: one ordinary BMP character plus the two halves of a surrogate pair.
const a = 'a';                         // ordinary character
const s = String.fromCharCode(0xD800); // start of surrogate pair (high surrogate)
const e = String.fromCharCode(0xDC00); // end of surrogate pair (low surrogate)

// Run the same case matrix against the native /./u and each hand-written equivalent.
[
	// Reference behaviour of the engine itself.
	['native',      /./ug],
	// Proposed replacement for the transpiled output.
	['my',          /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?)/g],
	// It is shorter but performance may be worse.
	['my-shorted',  /(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[\0-\t\x0B\f\x0E-\u2027\u202A-\uFFFF])/g],
	// Babel's pattern with the `(?:[^\uD800-\uDBFF]|^)` prefix of the last alternative removed.
	['babel-fixed', /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\uDC00-\uDFFF])/g],
	// Pattern Babel emits for /./u, unchanged.
	['babel',       /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g],
].forEach(([label, pattern]) => testAll(label, pattern));



/**
 * Registers one suite that checks `rexp` against every 1-, 2- and 3-unit
 * combination of the fixtures `a` (ordinary character), `s` (start of a
 * surrogate pair) and `e` (end of a surrogate pair).
 *
 * In the expected results, `s` directly followed by `e` forms a single match;
 * every other unit is a match of its own.
 *
 * @param {string} name - Suite title passed to `describe`.
 * @param {RegExp} rexp - Pattern under test, forwarded to `testOne` for each case.
 */
function testAll(name, rexp) {
	describe(name, function() {
		// Each row is [input, expected matches]; rows are registered in this order.
		const cases = [
			// Two code units.
			[a + a, [a, a]],
			[a + s, [a, s]],
			[a + e, [a, e]],
			[s + a, [s, a]],
			[s + s, [s, s]],
			[s + e, [s + e]],
			[e + a, [e, a]],
			[e + s, [e, s]],
			[e + e, [e, e]],

			// One code unit.
			[a, [a]],
			[s, [s]],
			[e, [e]],

			// Three code units, starting with the ordinary character.
			[a + a + a, [a, a, a]],
			[a + a + s, [a, a, s]],
			[a + a + e, [a, a, e]],
			[a + s + a, [a, s, a]],
			[a + s + s, [a, s, s]],
			[a + s + e, [a, s + e]],
			[a + e + a, [a, e, a]],
			[a + e + s, [a, e, s]],
			[a + e + e, [a, e, e]],

			// Three code units, starting with the start of a surrogate pair.
			[s + a + a, [s, a, a]],
			[s + a + s, [s, a, s]],
			[s + a + e, [s, a, e]],
			[s + s + a, [s, s, a]],
			[s + s + s, [s, s, s]],
			[s + s + e, [s, s + e]],
			[s + e + a, [s + e, a]],
			[s + e + s, [s + e, s]],
			[s + e + e, [s + e, e]],

			// Three code units, starting with the end of a surrogate pair.
			[e + a + a, [e, a, a]],
			[e + a + s, [e, a, s]],
			[e + a + e, [e, a, e]],
			[e + s + a, [e, s, a]],
			[e + s + s, [e, s, s]],
			[e + s + e, [e, s + e]],
			[e + e + a, [e, e, a]],
			[e + e + s, [e, e, s]],
			[e + e + e, [e, e, e]],
		];

		for (const [input, expected] of cases) {
			testOne(rexp, input, expected);
		}
	});
}



/**
 * Registers a single test case checking how `rexp` splits `s` into matches.
 *
 * The case title contains the pattern and the input with every UTF-16 code
 * unit written as \uXXXX, so a failing case can be identified in the report
 * (lone surrogates cannot be printed as-is).
 *
 * @param {RegExp} rexp - Pattern under test; its result is taken from `s.match(rexp)`.
 * @param {string} s - Input string (this parameter shadows the module-level `s` fixture).
 * @param {string[]} expectedResult - Matches that `s.match(rexp)` must produce, in order.
 */
function testOne(rexp, s, expectedResult) {
	// Escape code unit by code unit: split('') never merges surrogate pairs.
	const escapedInput = s
		.split('')
		.map((unit) => '\\u' + ('000' + unit.charCodeAt(0).toString(16).toUpperCase()).slice(-4))
		.join('');

	it(`${rexp} splits "${escapedInput}" into ${expectedResult.length} match(es)`, function() {
		const matches = s.match(rexp);
		// deepStrictEqual already covers the length, so no separate length assertion is needed.
		assert.deepStrictEqual(matches, expectedResult);
	});
}

The native implementation and my implementation pass the test, but the Babel implementation does not.


Versions: babel-core: 6.26.0 babel-preset-env: 1.6.1 babel-loader: 7.1.2 (I am using Babel with webpack)

The live test field on the main page https://babeljs.io/ has the same bug.

About this issue

  • Original URL
  • State: open
  • Created 7 years ago
  • Comments: 24 (11 by maintainers)

Most upvoted comments

Well, there are two cases:

  1. You need to match a lone high surrogate. This is easy thanks to negative lookahead: “(?!…)”.
  2. You need to match a lone low surrogate. This is actually easy too, because the current match position can never be at the second (low) half of a complete surrogate pair. So you just need to check [\uDC00-\uDFFF]. That’s all.

Of course, you may ask why the current position cannot be at the second half of a surrogate pair.

  • If you were searching for a range of code points >= 65536, then you would have written something like this:
    /( \uD877[\uDF55-\uDFFF]   |   [\uD878-\uD87A][\uDC00-\uDFFF]   |   \uD87B[\uDC00-\uDC01] )/
    
    //(but without spaces of course)
    
    It can match only full surrogate pairs. This example matches code points from 188245 (U+2DF55) to 191489 (U+2EC01); I chose this range arbitrarily. Even if you apply the quantifiers *, ? or +, you will still match only full surrogate pairs.
  • If you were searching for a range of code points <= 55295 (U+D7FF), each match consumes exactly one code unit that is not a surrogate, so the position cannot end up inside a surrogate pair.
  • If you were searching for a lone high surrogate, everything is OK because of the negative lookahead “(?!…)”.
  • If you were searching for a lone low surrogate, you cannot be at the second half of a surrogate pair, because the whole pair has already been consumed.

So, as a result, you don’t need lookbehind, but you DO need lookahead.