
eGospodarka.pl > Grupy > pl.comp.programming > Kiedy będzie milion rdzeni? > Re: Kiedy będzie milion rdzeni?
    Newsgroups: pl.comp.programming
    Date: Sun, 22 Sep 2019 03:33:22 -0700 (PDT)
    Subject: Re: Kiedy będzie milion rdzeni?
    From: fir <>
    W dniu piątek, 20 września 2019 15:00:16 UTC+2 użytkownik M.M. napisał:
    > On Thursday, September 19, 2019 at 4:55:42 AM UTC+2, wrote:
    > > W dniu wtorek, 17 września 2019 21:15:49 UTC+2 użytkownik M.M. napisał:
    > > > Ciekawe jak w praktyce wygląda przyspieszenie obliczeń na Tesli
    > > > względem Xeon Phi. I ciekawe czy w ogole warto inwestować w te drogie
    > > > rozwiązania, jak za 150usd można kupić: GeForce GTX 1650 - tania,
    > > > wydajna, a 10 takich kart z lekkim underclockingiem na niektórych
    > > > obliczeniach pobiera tylko 500 wat mocy. No ale na GPU trzeba się
    > > > nauczyć jakiegoś OpenCL albo CUDA.
    > > Jak ktoś umie napisać sensownie równoległy kod to OpenCL/CUDA nie są
    > > żadnym problemem. Owszem, to nie są jakieś przepiękne API, ale
    > > praktycznie każdy ostro sprzętowy kod w C wygląda równie źle;)
    > > Do tego można się tego uczyć bezproblemowo na dowolnym komputerze
    > > z jakąś sensowną kartą graficzną, odpalenie Hello World na CUDA
    > > (domyślny program to bodajże jakieś równoległe dodawanie wektorów
    > > czy tam sortowanie) zajmuje jakieś 15 minut, z czego 10 to rejestracja
    > > na stronie nvidii żeby ściągnąć toolchain. To jest wręcz przerażająco
    > > łatwe w porównaniu do dawnego oprogramowywania "grubych" platform
    > > obliczeniowych gdzie potrzeba było komercyjnego kompilatora za grubą
    > > kasę na uczelnianym klastrze i wczytywania się w dokumentację żeby
    > > coś się w ogóle uruchomiło.
    > >
    > > Pozdrawiam,
    > > --
    > > Karol Piotrowski
    > Możesz polecić jakiś praktyczny tutorial od podstaw OpenCLa dla kogoś, kto wie
    > co to C++ i programowanie rownoległe na CPU, ale z GPU nie miał nigdy
    > do czynienia?
    > Pozdrawiam

    moge wkleic jakies moje stare notatki/proby z opencl (z 2015, nie pamietam czy to
    ostatnia wersja ale pamietam ze to dzialalo, i wersja opencl renderowania mandelbrota
    byla najszybsza, pisalem o tym na grupie wiec jest o tym gdzies watek w 2015)

    ale szczerze mowiac nie wiem czy polecalbym sie tym zajmowac, jesli ktos chce zbierac
    odznaki programistycznego skauta i wpisac opencl do cv to na pewno
    - w innym wypadku chyba nie

    #include "fist.h"

    //#include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    //#include <string.h>
    #include <math.h>
    //#include <unistd.h>
    //#include <sys/types.h>
    //#include <sys/stat.h>
    //#include <OpenCL/opencl.h>
    //#include "clew.c"
    #include "clew.h"
    int clewInit(const char* path);


    // Use a static data size for simplicity
    //const int DATA_SIZE = 130*1024;

    // Element counts for the host/device buffers and for the kernel launch.
    // These are enumeration constants rather than `const int` objects because
    // they are used as dimensions of file-scope arrays: in C a `const int` is
    // not an integer constant expression, so it cannot size such an array.
    enum {
        OCL_INPUT_DATA_SIZE  = 1 * 1024,   // elements uploaded to the device
        OCL_OUTPUT_DATA_SIZE = 256 * 256,  // elements read back from the device
        OCL_PROCESSING_RANGE = 256 * 256   // number of work items the kernel processes
    };

    //int DATA_SIZE = DATA_SIZE_MAX/10;


    // Simple compute kernel which computes the square of an input array
    // const char *KernelSource = "\n" \
    // "__kernel void square( \n" \
    // " __global float* input, \n" \
    // " __global float* output, \n" \
    // " const unsigned int count) \n" \
    // "{ \n" \
    // " int i = get_global_id(0); \n" \
    // " if(i < count) \n" \
    // " output[i] = input[i] * input[i]; \n" \
    // "} \n" \
    // "\n";

    // const char *KernelSource = "\n" \
    // "__kernel void square( \n" \
    // " __global int* input, \n" \
    // " __global int* output, \n" \
    // " const unsigned int count) \n" \
    // "{ \n" \
    // " int i = get_global_id(0); \n" \
    // " if(i < count) \n" \
    // " output[i] = i+input[i] *2 ; \n" \
    // "} \n" \
    // "\n";

    // OpenCL C source of the compute kernel, built at run time.
    //
    // The kernel keeps the name "square" but renders a 256x256 Mandelbrot
    // image: work item i maps to pixel (x = i % 256, y = i / 256), which is
    // mapped to the complex point cRe in [-2.0, 1.0), cIm in [-1.5, 1.5).
    // The iteration z = z*z + c runs for n = 0..1000; as soon as |z|^2 > 4 the
    // pixel is set to n + 256*n + 256*256*n and the work item returns.
    // Points that never escape are set to 250<<8.
    // The `input` argument is not read by this version of the kernel.
    //
    // Adjacent string literals are concatenated by the compiler, so no
    // backslash line continuations are needed.
    const char *KernelSource = "\n"
        "__kernel void square( \n"
        " __global int* input, \n"
        " __global int* output, \n"
        " const unsigned int count) \n"
        "{ \n"
        " int i = get_global_id(0); \n"
        " if(i < count) \n"
        " { \n"
        " int x = i%256; \n"
        " // if(x>=256) return; \n"
        " int y = i/256; \n"
        " // if(y>=256) return; \n"
        " float cRe = -0.5 + -1.5 + x/256.*3.; \n"
        " float cIm = 0.0 + -1.5 + y/256.*3.; \n"
        " float re = 0; \n"
        " float im = 0; \n"
        " int n = 0; \n"
        " for( n=0; n<=1000; n++) { \n"
        " if( re * re + im * im > 4.0 ) { output[256*y+x] = n + 256*n + 256*256*n; return;} \n"
        " float re_n = re * re - im * im + cRe; \n"
        " float im_n = 2 * re * im + cIm; \n"
        " re = re_n; \n"
        " im = im_n; \n"
        " } \n"
        " output[256*y+x] = 250<<8; \n"
        " } \n"
        "} \n"
        "\n";

    /* ---- Host-side staging arrays ------------------------------------- */

    /* Values copied to the device input buffer before each run. */
    int data[OCL_INPUT_DATA_SIZE];
    /* Values copied back from the device output buffer after each run. */
    int results[OCL_OUTPUT_DATA_SIZE];

    /* ---- Status / bookkeeping ----------------------------------------- */

    /* Error code returned by the most recent OpenCL API call. */
    int err;
    /* Number of correct results returned (not updated by the code visible here). */
    unsigned int correct;

    /* ---- Launch geometry ---------------------------------------------- */

    /* Global domain size for the calculation. */
    size_t global;
    /* Local (work-group) domain size for the calculation. */
    size_t local;

    /* ---- OpenCL object handles ---------------------------------------- */

    /* Compute device id. */
    cl_device_id device_id;
    /* Compute context. */
    cl_context context;
    /* Compute command queue. */
    cl_command_queue commands;
    /* Compute program. */
    cl_program program;
    /* Compute kernel. */
    cl_kernel kernel;

    /* Device memory used for the input array. */
    cl_mem input;
    /* Device memory used for the output array. */
    cl_mem output;

    int SetupCL()
    static int initialised =0;
    if(initialised) return 0;


    int writeCLInfo();

    // writeCLInfo();

    //////// platform
    static cl_platform_id platform_id[10] = {0};
    cl_uint no_of_platforms_found = 0;

    int ret = clGetPlatformIDs(10, platform_id, &no_of_platforms_found);
    // if(ret == CL_SUCCESS ) ERROR_("clGetPlatformIDs success") ;
    // alert(" %d platforms found\n",no_of_platforms_found ) ;

    // Connect to a compute device
    int gpu = 1;
    err = clGetDeviceIDs(platform_id[0], gpu ? CL_DEVICE_TYPE_GPU :
    CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    ERROR_("Error: Failed to create a device group!\n");
    return EXIT_FAILURE;

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    ERROR_("Error: Failed to create a compute context!\n");
    return EXIT_FAILURE;

    // Create a command commands
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    ERROR_("Error: Failed to create a command commands!\n");
    return EXIT_FAILURE;

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource,
    NULL, &err);
    if (!program)
    ERROR_("Error: Failed to create compute program!\n");
    return EXIT_FAILURE;

    // Build the program executable
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    size_t len;
    char buffer[2048];

    ERROR_("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
    sizeof(buffer), buffer, &len);
    ERROR__("%s\n", buffer);

    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    ERROR_("Error: Failed to create compute kernel! \n");

    // Create the input and output arrays in device memory for our calculation
    input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) *

    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) *

    if (!input || !output)
    ERROR_("Error: Failed to allocate device memory!\n");


    // Set the arguments to our compute kernel
    err = 0;
    int count = OCL_PROCESSING_RANGE;

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);

    if (err != CL_SUCCESS)
    ERROR__("Error: Failed to set kernel arguments! %d\n", err);

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
    sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    ERROR__("Error: Failed to retrieve kernel work group info! %d\n", err);

    return 0;


    int RunOpenCLtask()
    // DATA_SIZE = frame_size_x*frame_size_y/4;

    // SetupInputData();

    for(int i = 0; i < OCL_INPUT_DATA_SIZE; i++)
    data[i] = frame_bitmap[i] ;

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) *

    if (err != CL_SUCCESS)
    ERROR_EXIT("Error: Failed to write to source array!\n");

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device

    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL,
    if (err)
    ERROR_EXIT("Error: Failed to execute kernel!\n");

    // Wait for the command commands to get serviced before reading back results

    // Read back the results from the device to verify the output
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) *
    OCL_OUTPUT_DATA_SIZE, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    ERROR__("Error: Failed to read output array! %d\n", err);

    for(int i = 0; i < OCL_OUTPUT_DATA_SIZE; i++)
    int x = i%256;
    int y = i/256;
    frame_bitmap[y*550+x] = results[i];

    // Validate our results
    // correct = 0;
    // for(int i = 0; i < count; i++)
    // {
    // if(results[i] - data[i] * data[i]<0.001)
    // correct++;
    // }
    // for(int i=0; i<1024; i++)
    // {
    // printf("data %f square %f \n", data[i], data[i]*data[i]);
    // printf("result %f \n", results[i]);
    // }

    // Print a brief summary detailing the results
    // printf("Computed '%d/%d' correct values!\n", correct, count);
    // alert("Computed '%d/%d' correct values!\n", correct, count);


    void ShutdownOpenCl()
    // Shutdown and cleanup


    int writeCLInfo()

    int i, j;
    char* value;
    size_t valueSize;
    cl_uint platformCount;
    cl_platform_id* platforms;
    cl_uint deviceCount;
    cl_device_id* devices;
    cl_uint maxComputeUnits;

    // get all platforms

    clGetPlatformIDs(0, NULL, &platformCount);

    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);

    clGetPlatformIDs(platformCount, platforms, NULL);

    for (i = 0; i < platformCount; i++) {

    // get all devices

    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);

    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);

    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);

    // for each device print critical attributes

    for (j = 0; j < deviceCount; j++) {

    // print device name

    clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);

    value = (char*) malloc(valueSize);

    clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);

    printf("%d. Device: %s\n", j+1, value);


    // print hardware device version

    clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);

    value = (char*) malloc(valueSize);

    clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);

    printf(" %d.%d Hardware version: %s\n", j+1, 1, value);


    // print software driver version

    clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);

    value = (char*) malloc(valueSize);

    clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);

    printf(" %d.%d Software version: %s\n", j+1, 2, value);


    // print c version supported by compiler for device

    clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL,

    value = (char*) malloc(valueSize);

    clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value,

    printf(" %d.%d OpenCL C version: %s\n", j+1, 3, value);


    // print parallel compute units

    clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,

    sizeof(maxComputeUnits), &maxComputeUnits, NULL);

    printf(" %d.%d Parallel compute units: %d\n", j+1, 4, maxComputeUnits);





    return 0;

