docopt.cpp: Regex expressions cause stack overflows in MSVC 2015

First of all, let me thank you for this awesome library!

The standard <regex> implementation in Visual C++ seems to have a rather severe design issue that causes stack overflows for complicated expressions (I suspect the lookaheads you use, but I’m not really sure about that). A couple of Google searches revealed that this is a known issue. Some people have been able to avoid the problem by increasing their stack size, but that didn’t help in my case (I’ve added an example program below that reproduces the problem; the nested method calls in std::_Matcher are really quite insane).

I’ve already written a patch for parse_section that resolves my original problem, but I still get a similar error in parse_defaults with the demo program below. Also, I unfortunately don’t have any git-tools at my disposal right now, so I can’t start a pull request for this. For now, I’ll post my version of parse_section in the comments and see if I can come up with a similar “fix” for the other expressions.

Here’s my test program to reproduce the problem:

#include <docopt.h>
#include <iostream>
#include <string>

// Dumps every parsed argument to std::cout: the argument name on one line,
// followed by an indented line giving the kind of value held (string, long,
// bool, string list, empty, or unknown) and the value itself.
void printArguments(const std::map<std::string, docopt::value> &args)
{
    std::cout << "Arguments:\n\n";

    for (const auto &kv : args)
    {
        const docopt::value &val = kv.second;

        std::cout << kv.first << ": \n";

        if (val.isString())
        {
            std::cout << "       (string)   " << val.asString() << "\n\n";
            continue;
        }
        if (val.isLong())
        {
            std::cout << "       (long)     " << val.asLong() << "\n\n";
            continue;
        }
        if (val.isBool())
        {
            std::cout << "       (bool)     " << val.asBool() << "\n\n";
            continue;
        }
        if (val.isStringList())
        {
            // One list element per line, closed by a bracket on its own line.
            const auto &items = val.asStringList();
            std::cout << "       (str_list) [\n";
            for (const auto &item : items)
            {
                std::cout << "                        " << item << ",\n";
            }
            std::cout << "                  ]\n\n";
            continue;
        }

        // Not one of the typed kinds: a value that converts to false is
        // reported as empty, anything else as unknown.
        if (!val)
        {
            std::cout << "       (empty)\n\n";
        }
        else
        {
            std::cout << "       (unknown)\n\n";
        }
    }

    // Final blank line; std::endl also flushes the stream.
    std::cout << std::endl;
}

// Usage text handed to docopt::docopt() by the reproduction program. The long
// option description line is part of the reproduction input described in the
// report, so the raw string must stay exactly as written.
const char* const USAGE =
R"(
Usage:
    foo [options] <ARG1> <ARG2> [--option1=<OPTION1>] [--option2=<OPTION2>]
    foo --command1 <ARG3> <ARG4> <ARG5> <ARG6>  [--option3=<OPTION3> [--option4=<OPTION4>]] [--option5=<OPTION5>]
    foo --command2 <ARG4>
    foo (-v | --version)
    foo (-h | --help)

Options:
    -o <OPTION6>, --option6=<OPTION6>      Some rather long description of option #6 which makes this line longer than it really should be...
)";

// Reproduction driver: parses the command line against USAGE with docopt and
// prints the resulting arguments. Any std::exception is reported on std::cerr.
// Returns 0 on success and 1 if an exception was caught.
//
// The return type is int (not void) because standard C++ requires main to
// return int; `void main` is only accepted as a compiler extension.
int main(int argc, const char** argv)
{
    int exit_code = 0;
    try
    {
        // argv[0] is the program name, so only argv[1..argc) is handed to docopt.
        auto arguments = docopt::docopt(USAGE, { argv + 1,argv + argc }, true, "1.0.0.0", true);
        printArguments(arguments);
    }
    catch (const std::exception &e) // const reference: the handler only reads the exception
    {
        std::cerr << "Encountered exception of type "
            << typeid(e).name() << ": "
            << e.what();
        exit_code = 1;
    }
    // Wait for input before exiting so the console output stays visible.
    std::cin.ignore();
    return exit_code;
}

About this issue

Original URL
State: closed
Created 8 years ago
Reactions: 1
Comments: 26 (11 by maintainers)

Most upvoted comments

The bug occurs in VS2017 too.

Alternatively, I defined DOCTOPT_USE_BOOST_REGEX as ( _DEBUG && _WIN64 && _HAS_ITERATOR_DEBUGGING != 0 )

Works for me.

grasmanek94 on Mar 28, 2017

This is the aforementioned patch:

// Collects every section of `source` whose header line contains `name`
// (matched case-insensitively). A section is the header line plus all directly
// following lines that begin with a space or a tab. Each section is passed
// through trim() before being added to the result.
//
// Note: `name` is inserted into a regular expression verbatim, so it is
// interpreted as a regex fragment rather than as literal text.
static std::vector<std::string> parse_section(std::string const& name, std::string const& source) {
#ifndef _MSC_VER
    // ECMAScript regex only has "?=" for a non-matching lookahead. In order to make sure we always have
    // a newline to anchor our matching, we have to avoid matching the final newline of each grouping.
    // Therefore, our regex is adjusted from the docopt Python one to use ?= to match the newlines before
    // the following lines, rather than after.
    std::regex const re_section_pattern{
        "(?:^|\\n)"  // anchored at a linebreak (or start of string)
        "("
           "[^\\n]*" + name + "[^\\n]*(?=\\n?)" // a line that contains the name
           "(?:\\n[ \\t].*?(?=\\n|$))*"         // followed by any number of lines that are indented
        ")",
        std::regex::icase
    };

    std::vector<std::string> ret;
    std::for_each(std::sregex_iterator(source.begin(), source.end(), re_section_pattern),
        std::sregex_iterator(),
        [&](std::smatch const& match)
    {
        ret.push_back(trim(match[1].str()));
    });

    return ret;
#else
    // Parse the sections in a more old-fashioned way in order to avoid stack overflows in
    // Microsoft's regex implementation: the regex only locates header lines, and the extent
    // of each section is determined by scanning the text by hand.
    std::regex const re_section_start_pattern{ "(?:^|\\n)[^\\n]*" + name, std::regex::icase};
    std::sregex_iterator const re_end;

    std::vector<std::string> ret;
    auto const source_end = source.end();

    // End of the most recently emitted section. The header regex resumes searching right
    // after the header's name match, not after the whole section, so an indented line that
    // itself contains `name` would otherwise be reported again as a second, overlapping
    // section. The single-regex branch above consumes such lines as part of the section.
    auto consumed_until = source.begin();

    for (std::sregex_iterator section_head(source.begin(), source_end, re_section_start_pattern);
         section_head != re_end; ++section_head)
    {
        // We found a candidate section header
        auto section_start = source.begin() + section_head->position();
        // If the match is positioned on the anchoring newline, move ahead one char
        if (section_start != source_end && *section_start == '\n') ++section_start;

        // Header candidates inside an already emitted section belong to that section
        if (section_start < consumed_until) continue;

        // Add the complete header line to the section
        auto section_end = std::find(section_start, source_end, '\n');
        if (section_end != source_end) ++section_end; // skip '\n'

        while (section_end != source_end)
        { // Add subsequent lines to the section if they're indented
            if (*section_end != '\t' && *section_end != ' ') break; // Unindented lines end the section
            section_end = std::find(section_end, source_end, '\n');
            if (section_end != source_end) ++section_end; // skip '\n'
        }
        ret.push_back(trim({section_start,section_end}));
        consumed_until = section_end;
    }
    return ret;
#endif
}

// Builds the list of options described in the "options:" sections of `doc`.
// Every section returned by parse_section("options:", doc) has its text up to
// and including the first ':' removed. The remainder is split into chunks that
// each start at a '-' which is the first non-blank character of a line, and
// each chunk is handed to Option::parse().
static std::vector<Option> parse_defaults(std::string const& doc) {
#ifndef _MSC_VER
    // More involved than the Python docopt pattern because there is no re.split
    // equivalent: capture from a hyphen that follows leading whitespace on a line
    // up to (but not including) the next line that looks the same, or the end.
    static std::regex const pattern {
        "(?:^|\\n)[ \\t]*"  // a new line with leading whitespace
        "(-(.|\\n)*?)"      // a hyphen, and then grab everything it can...
        "(?=\\n[ \\t]*-|$)" //  .. until it hits another new line with space and a hyphen
    };

    std::vector<Option> defaults;
    std::sregex_iterator const no_more_matches{};

    for (auto section : parse_section("options:", doc)) {
        // Drop everything up to and including the first ':' (the "options:" header).
        auto const header_length = static_cast<std::ptrdiff_t>(section.find(':')) + 1;
        section.erase(section.begin(), section.begin() + header_length);

        for (std::sregex_iterator hit{ section.begin(), section.end(), pattern };
             hit != no_more_matches; ++hit)
        {
            std::string const description = (*hit)[1].str();
            if (!starts_with(description, "-")) {
                continue;
            }
            defaults.emplace_back(Option::parse(description));
        }
    }

    return defaults;
#else
    // Microsoft-specific path: the regex only locates the start of each option
    // description; the description text itself is cut out by hand.
    static std::regex const pattern {"(?:^|\\n)[\\t ]*-"};
    std::vector<Option> defaults;
    std::sregex_iterator const no_more_matches;

    for (auto section : parse_section("options:", doc))
    {
        // Drop everything up to and including the first ':' (the "options:" header).
        auto const header_length = static_cast<std::ptrdiff_t>(section.find(':')) + 1;
        section.erase(section.begin(), section.begin() + header_length);

        std::sregex_iterator boundary(section.begin(), section.end(), pattern);
        if (boundary == no_more_matches)
        {
            continue; // no option lines in this section
        }

        // The first description starts at the hyphen of the first match.
        auto description_begin = std::find(section.begin() + boundary->position(), section.end(), '-');

        // Each further match closes the previous description and opens the next one.
        for (++boundary; boundary != no_more_matches; ++boundary)
        {
            auto description_end = section.begin() + boundary->position();
            defaults.emplace_back(Option::parse({description_begin,description_end}));
            description_begin = std::find(description_end, section.end(), '-');
        }

        // The last description runs to the end of the section.
        defaults.emplace_back(Option::parse({description_begin,section.end()}));
    }
    return defaults;
#endif
}

mfrischknecht on Jun 2, 2016