compute: [OS X 10.11.5] [GeForce GT 750M] Segmentation fault

Hi, I am trying to sort a vector on the GPU (GeForce GT 750M) using the example code below, which generates a SEGV. However, it runs fine on the Iris Pro (Intel® Core™ i7-4850HQ CPU).

We observe a SEGV when the size is 10 million. When the size is 5 million, the following exception is thrown: boost/1_61_0/include/boost/compute/command_queue.hpp(453): Throw in function boost::compute::event boost::compute::command_queue::enqueue_write_buffer(const boost::compute::buffer &, size_t, size_t, const void *, const boost::compute::wait_list &) Dynamic exception type: boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector< boost::compute::opencl_error> > std::exception::what: Invalid Value

Another observation: if the size of the vector is 50 million, the sorting works fine, though the timings are worse than on the Iris Pro. Also, when the size is 100 million, the binary causes the OS to crash.

Compiler details: clang++ --version Apple LLVM version 7.3.0 (clang-703.0.31) Target: x86_64-apple-darwin15.5.0 Thread model: posix

OS: System Version: OS X 10.11.5 (15F34) Kernel Version: Darwin 15.5.0 Regards, Prashant

----------------------------Cut here-------------------------------------

#include <iostream>
#include <vector>
#include <algorithm>
#include <boost/foreach.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/platform.hpp>
#include <boost/compute/algorithm.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional/math.hpp>
#include <boost/compute/types/builtin.hpp>
#include <boost/compute/function.hpp>
#include <boost/chrono/include.hpp>
#include <boost/exception/all.hpp>

namespace compute = boost::compute;

// Benchmark driver for the bug report.
//
// Takes a single command-line argument <size>, fills a host vector of that
// many floats with rand() values, then:
//   1. sorts a fresh copy of it five times on the host with std::sort, and
//   2. for every GPU device of every OpenCL platform, uploads it to the device
//      and sorts it five times with boost::compute::sort,
// printing the elapsed wall-clock time (in milliseconds) of each sort.
//
// Returns 0 in all paths that return normally, including the usage message.
//
// NOTE(review): std::chrono and rand are used below, but the visible include
// list has <boost/chrono/include.hpp> rather than <chrono>, and no <cstdlib>;
// presumably they arrive transitively — confirm.
int main(int argc, char* argv[])
{
    // Exactly one argument (the element count) is required.
    if (argc != 2) {
        std::cout << "Usage: " << argv[0] << " <size> " << std::endl;
        return 0;
    }

    // generate random data on the host
    // NOTE(review): atoi does no validation; a non-numeric argument yields 0
    // and a negative value is converted to a very large vector size.
    std::vector<float> host_vector(atoi(argv[1]));
    // rand returns int; each value is converted to float on assignment.
    std::generate(host_vector.begin(), host_vector.end(), rand);

    std::cout << "===============CPU==================\n";
    // Host baseline: five timed runs of std::sort.
    for (size_t k=0; k<5; k++)
    {
        // Copy the unsorted input so every iteration sorts the same data;
        // the copy is made before the timer starts.
        std::vector<float> host_copy_vector(host_vector);
        auto start = std::chrono::high_resolution_clock::now();
        std::sort(host_copy_vector.begin(), host_copy_vector.end());

        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::high_resolution_clock::now() - start);
        std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" << 
std::endl;
    }

    // Enumerate every OpenCL platform known to Boost.Compute.
    std::vector<compute::platform> platforms = compute::system::platforms();

    for(size_t i = 0; i < platforms.size(); i++){
        const compute::platform &platform = platforms[i];
        std::cout << "Platform '" << platform.name() << "'" << std::endl;
        std::vector<compute::device> devices = platform.devices();

        for(size_t j = 0; j < devices.size(); j++){
            const compute::device &device = devices[j];

            // Classify the device by testing its type bits; GPU is checked
            // first, so a device reporting several bits is labelled GPU.
            std::string type;
            if(device.type() & compute::device::gpu)
                type = "GPU Device";
            else if(device.type() & compute::device::cpu)
                type = "CPU Device";
            else if(device.type() & compute::device::accelerator)
                type = "Accelerator Device";
            else
                type = "Unknown Device";

            // Only GPU devices are benchmarked.
            if (type != "GPU Device") {
                std::cout << "Ignoring non GPU devices.\n";
                continue;
            }

            std::cout << "====\n";
            std::cout << "  " << type << ": " << device.name() << std::endl;
            std::cout << "====\n";
            // One context and one command queue per benchmarked device.
            compute::context context(device);
            compute::command_queue queue(context, device);

            // Device benchmark: five timed runs of compute::sort.
            for (size_t k=0; k<5; k++)
            {
                // A new device vector is allocated each iteration.
                // NOTE(review): this construction and the copy below are
                // outside the try block, so an exception thrown by either is
                // not handled by the catch further down.
                compute::vector<float> device_vector(host_vector.size(), context);

                // copy data from the host to the device
                compute::copy(
                    host_vector.begin(), host_vector.end(), device_vector.begin(), queue
                );

                // Only the sort call is timed; the upload above is excluded.
                auto start = std::chrono::high_resolution_clock::now();
                try {
                  compute::sort(device_vector.begin(), device_vector.end(), queue);
                } catch (boost::exception & e) {
                  // Print the Boost diagnostic text and stop benchmarking this
                  // device; the break leaves the k-loop only.
                  std::cerr << diagnostic_information(e);
                  break;
                }

                // NOTE(review): the timer is stopped without an explicit
                // queue.finish(); whether the device work has completed by
                // this point is not visible here — confirm before trusting
                // the reported GPU timings.
                auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::high_resolution_clock::now() - start);
                std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" <<
std::endl;
            }

            std::cout <<  "====\n";
        }
    }

    return 0;
}

----------------------------Cut here-------------------------------------

About this issue

  • Original URL
  • State: open
  • Created 8 years ago
  • Comments: 20 (11 by maintainers)

Most upvoted comments

So I can reproduce the segfault at size 10000000 and “Invalid Value” exception at size 5000000.

Here’s the backtrace from the “Invalid Value” exception:

* thread #1: tid = 0x627b92, 0x00007fff82467ab7 libc++abi.dylib`__cxa_throw, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
  * frame #0: 0x00007fff82467ab7 libc++abi.dylib`__cxa_throw
    frame #1: 0x00000001000087a1 hello_world`void boost::throw_exception<boost::exception_detail::error_info_injector<boost::compute::opencl_error> >(e=0x00007fff5fbfb990) + 113 at throw_exception.hpp:69
    frame #2: 0x00000001000086b5 hello_world`void boost::exception_detail::throw_exception_<boost::compute::opencl_error>(x=0x00007fff5fbfbab0, current_function="boost::compute::event boost::compute::command_queue::enqueue_write_buffer(const boost::compute::buffer &, size_t, size_t, const void *, const boost::compute::wait_list &)", file="../include/boost/compute/command_queue.hpp", line=453) + 197 at throw_exception.hpp:86
    frame #3: 0x00000001000167e6 hello_world`boost::compute::command_queue::enqueue_write_buffer(this=0x00007fff5fbff320, buffer=0x00007fff5fbfbca8, offset=0, size=64, host_ptr=0x00007fff5fbfbbf0, events=0x00007fff5fbfbbc8) + 806 at command_queue.hpp:453
    frame #4: 0x00000001000ab5f7 hello_world`void boost::compute::detail::dispatch_fill<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_>(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfbca8, count=1, value=0x00007fff5fbfc880, queue=0x00007fff5fbff320, (null)=0x0000000000000000) + 343 at fill.hpp:127
    frame #5: 0x00000001000ab2dd hello_world`void boost::compute::fill<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_>(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfc870, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfc860, value=0x00007fff5fbfc880, queue=0x00007fff5fbff320) + 221 at fill.hpp:286
    frame #6: 0x00000001000a9d4f hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::detail::scan_impl<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_, boost::compute::plus<boost::compute::uint16_> >(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd068, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd058, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd048, exclusive=false, init=uint16_ @ 0x00007fff5fbfd008, op=plus<boost::compute::uint16_> @ 0x00007fff5fbfcfc0, queue=0x00007fff5fbff320) + 751 at scan_on_gpu.hpp:222
    frame #7: 0x00000001000aa1dc hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::detail::scan_impl<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_, boost::compute::plus<boost::compute::uint16_> >(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd940, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd930, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfd920, exclusive=true, init=uint16_ @ 0x00007fff5fbfd8e0, op=plus<boost::compute::uint16_> @ 0x00007fff5fbfd898, queue=0x00007fff5fbff320) + 1916 at scan_on_gpu.hpp:242
    frame #8: 0x00000001000a960e hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::detail::dispatch_scan<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_, boost::compute::plus<boost::compute::uint16_> >(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdb40, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdb30, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdb10, exclusive=true, init=uint16_ @ 0x00007fff5fbfdad0, op=plus<boost::compute::uint16_> @ 0x00007fff5fbfda88, queue=0x00007fff5fbff320) + 654 at scan_on_gpu.hpp:303
    frame #9: 0x00000001000a396f hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::detail::scan_on_gpu<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_, boost::compute::plus<boost::compute::uint16_> >(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdc90, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdc80, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfdc70, exclusive=true, init=uint16_ @ 0x00007fff5fbfdc30, op=plus<boost::compute::uint16_> @ 0x00007fff5fbfdbe8, queue=0x00007fff5fbff320) + 335 at scan_on_gpu.hpp:324
    frame #10: 0x00000001000a1f8e hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::detail::scan<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::uint16_, boost::compute::plus<boost::compute::uint16_> >(first=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfde98, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfde88, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfde68, exclusive=true, init=uint16_ @ 0x00007fff5fbfde28, op=plus<boost::compute::uint16_> @ 0x00007fff5fbfdde0, queue=0x00007fff5fbff320) + 750 at scan.hpp:37
    frame #11: 0x00000001000454b2 hello_world`boost::compute::buffer_iterator<boost::compute::uint16_> boost::compute::exclusive_scan<boost::compute::buffer_iterator<boost::compute::uint16_>, boost::compute::buffer_iterator<boost::compute::uint16_> >(first=<unavailable>, last=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfe258, result=buffer_iterator<boost::compute::uint16_> @ 0x00007fff5fbfe248, queue=0x00007fff5fbff320) + 194 at exclusive_scan.hpp:88
    frame #12: 0x0000000100043b12 hello_world`void boost::compute::detail::radix_sort_impl<float, int>(first=const boost::compute::buffer_iterator<float> @ 0x00007fff5fbfed68, last=const boost::compute::buffer_iterator<float> @ 0x00007fff5fbfed58, values_first=const boost::compute::buffer_iterator<int> @ 0x00007fff5fbfed38, ascending=true, queue=0x00007fff5fbff320) + 8018 at radix_sort.hpp:374
    frame #13: 0x0000000100018a81 hello_world`void boost::compute::detail::radix_sort<boost::compute::buffer_iterator<float> >(first=buffer_iterator<float> @ 0x00007fff5fbfedc8, last=buffer_iterator<float> @ 0x00007fff5fbfedb8, queue=0x00007fff5fbff320) + 97 at radix_sort.hpp:425
    frame #14: 0x0000000100017854 hello_world`void boost::compute::detail::dispatch_gpu_sort<float>(first=buffer_iterator<float> @ 0x00007fff5fbfef48, last=buffer_iterator<float> @ 0x00007fff5fbfef38, (null)=less<float> @ 0x00007fff5fbfeef0, queue=0x00007fff5fbff320, (null)=0x0000000000000000) + 404 at sort.hpp:52
    frame #15: 0x00000001000174d4 hello_world`void boost::compute::detail::dispatch_sort<boost::compute::buffer_iterator<float>, boost::compute::less<float> >(first=buffer_iterator<float> @ 0x00007fff5fbff018, last=buffer_iterator<float> @ 0x00007fff5fbff008, compare=less<float> @ 0x00007fff5fbfefb0, queue=0x00007fff5fbff320, (null)=0x0000000000000000) + 228 at sort.hpp:104
    frame #16: 0x000000010001733e hello_world`void boost::compute::sort<boost::compute::buffer_iterator<float>, boost::compute::less<float> >(first=buffer_iterator<float> @ 0x00007fff5fbff0b8, last=buffer_iterator<float> @ 0x00007fff5fbff0a8, compare=less<float> @ 0x00007fff5fbff050, queue=0x00007fff5fbff320) + 126 at sort.hpp:172
    frame #17: 0x0000000100005dcb hello_world`void boost::compute::sort<boost::compute::buffer_iterator<float> >(first=buffer_iterator<float> @ 0x00007fff5fbff2a8, last=buffer_iterator<float> @ 0x00007fff5fbff298, queue=0x00007fff5fbff320) + 107 at sort.hpp:183
    frame #18: 0x0000000100003681 hello_world`main(argc=2, argv=0x00007fff5fbffa80) + 4273 at hello_world.cpp:101
    frame #19: 0x00007fff878b95c9 libdyld.dylib`start + 1

Looks like we’re inside scan_on_gpu() and calling fill() with a single uint16_ vector value to zero out a temporary buffer.

The segfault backtrace I get is the same as @prthakre posted before.