babel: Bug: wrong unicode regexp
Here is my code:
// Minimal reproducer: count how many code points `.` matches under the `u` flag.
var a = 'a', // ordinary BMP character
s = String.fromCharCode(55296), // 0xD800: a lone high (leading) surrogate
e = String.fromCharCode(56320); // 0xDC00: a lone low (trailing) surrogate
console.log((a + e).match(/./ug).length); // must print "2", but prints "1" after Babel transpiles the regexp
console.log((e + e).match(/./ug).length); // must print "2", but prints "1" after Babel transpiles the regexp
Babel transforms it to this code:
// Transpiled form: the `u` flag is gone and `.` has been expanded into explicit UTF-16 alternatives.
var a = 'a',
s = String.fromCharCode(55296),
e = String.fromCharCode(56320);
// Yields 1 instead of 2: after 'a' is consumed, the lone low surrogate can only be matched by the
// last alternative, which insists on consuming a preceding non-high-surrogate unit (or being at the start).
console.log((a + e).match(/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g).length); //must print "2" but prints "1" after babel
// Yields 1 instead of 2: `[^\uD800-\uDBFF]` in the last alternative swallows the first low surrogate,
// so both low surrogates are returned together as a single match.
console.log((e + e).match(/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g).length); //must print "2" but prints "1" after babel
As you can see, the “.” character is transpiled completely incorrectly.
Currently babel converts it to:
/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/
but must convert to:
/(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?)/
This is not only much shorter, but it also avoids such simple bugs.
I also wrote a test:
'use strict';
// Intended to be run with the "mocha" command (npm install -g mocha and move the file to a
// directory named "test"). When no runner has defined `describe`, install minimal stand-ins
// that simply invoke the supplied callback right away.
if (typeof describe === 'undefined') {
  const runImmediately = (name, clb) => clb();
  global.describe = runImmediately;
  global.it = runImmediately;
}
const assert = require('assert');
const a = 'a'; // ordinary character
const s = String.fromCharCode(55296); // 0xD800: first half of a surrogate pair
const e = String.fromCharCode(56320); // 0xDC00: second half of a surrogate pair
// Each candidate is a [suite name, pattern] pair; all of them go through the same case list.
const candidates = [
  ['native', /./ug],
  ['my', /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?)/g],
  // Shorter than 'my', but performance may be worse.
  ['my-shorted', /(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[\0-\t\x0B\f\x0E-\u2027\u202A-\uFFFF])/g],
  ['babel-fixed', /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\uDC00-\uDFFF])/g],
  ['babel', /(?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g],
];
for (const [suiteName, pattern] of candidates) {
  testAll(suiteName, pattern);
}
/**
 * Registers a suite called `name` that checks `rexp` against every one-, two- and
 * three-unit combination of the module-level fixtures `a`, `s` and `e`.
 *
 * @param {string} name - Suite title passed to `describe`.
 * @param {RegExp} rexp - Pattern under test; handed unchanged to `testOne` for each case.
 */
function testAll(name, rexp) {
  describe(name, () => {
    // [input, expected matches]: `s + e` stays together as one match, every other unit stands alone.
    const cases = [
      // two units
      [a + a, [a, a]],
      [a + s, [a, s]],
      [a + e, [a, e]],
      [s + a, [s, a]],
      [s + s, [s, s]],
      [s + e, [s + e]],
      [e + a, [e, a]],
      [e + s, [e, s]],
      [e + e, [e, e]],
      // one unit
      [a, [a]],
      [s, [s]],
      [e, [e]],
      // three units
      [a + a + a, [a, a, a]],
      [a + a + s, [a, a, s]],
      [a + a + e, [a, a, e]],
      [a + s + a, [a, s, a]],
      [a + s + s, [a, s, s]],
      [a + s + e, [a, s + e]],
      [a + e + a, [a, e, a]],
      [a + e + s, [a, e, s]],
      [a + e + e, [a, e, e]],
      [s + a + a, [s, a, a]],
      [s + a + s, [s, a, s]],
      [s + a + e, [s, a, e]],
      [s + s + a, [s, s, a]],
      [s + s + s, [s, s, s]],
      [s + s + e, [s, s + e]],
      [s + e + a, [s + e, a]],
      [s + e + s, [s + e, s]],
      [s + e + e, [s + e, e]],
      [e + a + a, [e, a, a]],
      [e + a + s, [e, a, s]],
      [e + a + e, [e, a, e]],
      [e + s + a, [e, s, a]],
      [e + s + s, [e, s, s]],
      [e + s + e, [e, s + e]],
      [e + e + a, [e, e, a]],
      [e + e + s, [e, e, s]],
      [e + e + e, [e, e, e]],
    ];
    for (const [input, expected] of cases) {
      testOne(rexp, input, expected);
    }
  });
}
/**
 * Registers a single case (always titled 'oneTest') asserting that matching `s`
 * against `rexp` produces exactly `expectedResult`.
 *
 * @param {RegExp} rexp - Pattern applied through `String.prototype.match`.
 * @param {string} s - Input string to match.
 * @param {string[]} expectedResult - Matches expected back, in order.
 */
function testOne(rexp, s, expectedResult) {
  const verify = () => {
    const pieces = s.match(rexp);
    assert.deepStrictEqual(pieces, expectedResult);
    assert.strictEqual(pieces.length, expectedResult.length);
  };
  it('oneTest', verify);
}
The native implementation and my implementation pass the test, but the Babel implementation does not.
Versions: babel-core: 6.26.0 babel-preset-env: 1.6.1 babel-loader: 7.1.2 (I am using Babel with webpack)
The test field on the main page https://babeljs.io/ has the same bug.
About this issue
- Original URL
- State: open
- Created 7 years ago
- Comments: 24 (11 by maintainers)
Well, there are two cases:
Of course, you may ask why the current position cannot be at the end of a surrogate pair.
So, as a result, you don’t need lookbehind, but you DO need lookahead.