首页 > c > 使用openmp时,英特尔C编译器输出错误

使用openmp时,英特尔C编译器输出错误 (Intel C Compiler giving the wrong output when using openmp)

2016-12-07 copenmp

问题

我试图第一次使用英特尔C编译器,并得到完全错误的答案。我究竟做错了什么?

我有一些代码如下:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE 12
#define NUM_THREADS 8

#define popcnt __builtin_popcountll
#define BILLION (1000 * 1000 * 1000)
#define UPDATE_ROW_PPROD() \
    update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)

typedef __int128 int128_t;

static inline int64_t update_row_pprod
(
    int64_t* row_pprod, int64_t row, int64_t* rows,
    int64_t* row_sums, int64_t mask, int64_t mask_popcnt
)
{
    int64_t temp = 2 * popcnt(rows[row] & mask) - mask_popcnt;

    row_pprod[0] *= temp;
    temp -= 1;
    row_pprod[1] *= temp;
    temp -= row_sums[row];
    row_pprod[2] *= temp;
    temp += 1;
    row_pprod[3] *= temp;

    return row + 1;
}

int main(int argc, char* argv[])
{
    int64_t size = argc - 1, rows[argc - 1];
    int64_t row_sums[argc - 1];
    int128_t permanent = 0, sign = size & 1 ? -1 : 1;

    if (argc == 2)
    {
        printf("%d\n", argv[1][0] == '-' ? -1 : 1);
        return 0;
    }

    for (int64_t row = 0; row < size; row++)
    {
        char positive = argv[row + 1][0] == '+' ? '-' : '+';

        sign *= ',' - positive;
        rows[row] = row_sums[row] = 0;

        for (char* p = &argv[row + 1][1]; *p; p++)
        {
            rows[row] <<= 1;
            rows[row] |= *p == positive;
            row_sums[row] += *p == positive;
        }

        row_sums[row] = 2 * row_sums[row] - size;
    }

    #pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
    for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
    {
        int64_t mask_popcnt = popcnt(mask);
        int64_t row = 0;
        int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
        int128_t row_prod_high = -row_prod;
        int128_t row_prod_inv = row_prod;
        int128_t row_prod_inv_high = -row_prod;

        for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
        {
            int64_t row_pprod[4] = {1, 1, 1, 1};

            for (int64_t i = 0; i < CHUNK_SIZE; i++)
                row = UPDATE_ROW_PPROD();

            row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
            row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        }

        int64_t row_pprod[4] = {1, 1, 1, 1};

        while (row < size)
            row = UPDATE_ROW_PPROD();

        row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
        row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
    }

    permanent *= sign;

    if (permanent < 0)
        printf("-"), permanent *= -1;

    int32_t output[5], print = 0;

    output[0] = permanent % BILLION, permanent /= BILLION;
    output[1] = permanent % BILLION, permanent /= BILLION;
    output[2] = permanent % BILLION, permanent /= BILLION;
    output[3] = permanent % BILLION, permanent /= BILLION;
    output[4] = permanent % BILLION;

    if (output[4])
        printf("%u", output[4]), print = 1;
    if (print)
        printf("%09u", output[3]);
    else if (output[3])
        printf("%u", output[3]), print = 1;
    if (print)
        printf("%09u", output[2]);
    else if (output[2])
        printf("%u", output[2]), print = 1;
    if (print)
        printf("%09u", output[1]);
    else if (output[1])
        printf("%u", output[1]), print = 1;
    if (print)
        printf("%09u\n", output[0]);
    else
        printf("%u\n", output[0]);
}

如果我用它编译它

gcc -Wall -std=c99 -fopenmp  -o permanent permanent.c

然后我可以运行它

permanent -+ -+

并获得输出

-2

哪个是对的。

如果我使用英特尔C编译器(17.0.1)进行编译

icc -std=c99 -qopenmp -Wall permanent.c

然后呢

a.out -+ -+

我明白了

11910984139051480114196905982

如评论中所述,如果删除-qopenmp,则icc会生成一个正确运行的版本,尽管只在一个核心上运行。

解决方法

看起来像英特尔处理__int128类型变量OpenMP减少的错误。这很容易复制:

#include <stdio.h>
#include <inttypes.h>

int main() {
    __int128 sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < 0; i++) {
        sum += 1;
    }
    printf("%" PRIX64 " %" PRIX64 "\n", (uint64_t)sum, (uint64_t)(sum >> 64));
}

输出-fopenmp

14000000000 78778300

正确输出没有或有gcc -fopenmp

0 0

您可以__int128使用一个简单的自定义缩减声明来解决这个问题(并保持类型):

#pragma omp declare reduction(add128: int128_t: omp_out = omp_out + omp_in) initializer(omp_priv = 0)
[...]
#pragma omp parallel for reduction(add128:permanent) num_threads(NUM_THREADS)

据我所知,标准不限制缩减列表元素的类型。至少编译器应该告诉你它是否不受支持。

问题

I am trying to use the Intel C compiler for the first time and am getting completely the wrong answers. What am I doing wrong?

I have some code as follows:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE 12
#define NUM_THREADS 8

#define popcnt __builtin_popcountll
#define BILLION (1000 * 1000 * 1000)
#define UPDATE_ROW_PPROD() \
    update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)

typedef __int128 int128_t;

static inline int64_t update_row_pprod
(
    int64_t* row_pprod, int64_t row, int64_t* rows,
    int64_t* row_sums, int64_t mask, int64_t mask_popcnt
)
{
    int64_t temp = 2 * popcnt(rows[row] & mask) - mask_popcnt;

    row_pprod[0] *= temp;
    temp -= 1;
    row_pprod[1] *= temp;
    temp -= row_sums[row];
    row_pprod[2] *= temp;
    temp += 1;
    row_pprod[3] *= temp;

    return row + 1;
}

int main(int argc, char* argv[])
{
    int64_t size = argc - 1, rows[argc - 1];
    int64_t row_sums[argc - 1];
    int128_t permanent = 0, sign = size & 1 ? -1 : 1;

    if (argc == 2)
    {
        printf("%d\n", argv[1][0] == '-' ? -1 : 1);
        return 0;
    }

    for (int64_t row = 0; row < size; row++)
    {
        char positive = argv[row + 1][0] == '+' ? '-' : '+';

        sign *= ',' - positive;
        rows[row] = row_sums[row] = 0;

        for (char* p = &argv[row + 1][1]; *p; p++)
        {
            rows[row] <<= 1;
            rows[row] |= *p == positive;
            row_sums[row] += *p == positive;
        }

        row_sums[row] = 2 * row_sums[row] - size;
    }

    #pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
    for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
    {
        int64_t mask_popcnt = popcnt(mask);
        int64_t row = 0;
        int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
        int128_t row_prod_high = -row_prod;
        int128_t row_prod_inv = row_prod;
        int128_t row_prod_inv_high = -row_prod;

        for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
        {
            int64_t row_pprod[4] = {1, 1, 1, 1};

            for (int64_t i = 0; i < CHUNK_SIZE; i++)
                row = UPDATE_ROW_PPROD();

            row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
            row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        }

        int64_t row_pprod[4] = {1, 1, 1, 1};

        while (row < size)
            row = UPDATE_ROW_PPROD();

        row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
        row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
    }

    permanent *= sign;

    if (permanent < 0)
        printf("-"), permanent *= -1;

    int32_t output[5], print = 0;

    output[0] = permanent % BILLION, permanent /= BILLION;
    output[1] = permanent % BILLION, permanent /= BILLION;
    output[2] = permanent % BILLION, permanent /= BILLION;
    output[3] = permanent % BILLION, permanent /= BILLION;
    output[4] = permanent % BILLION;

    if (output[4])
        printf("%u", output[4]), print = 1;
    if (print)
        printf("%09u", output[3]);
    else if (output[3])
        printf("%u", output[3]), print = 1;
    if (print)
        printf("%09u", output[2]);
    else if (output[2])
        printf("%u", output[2]), print = 1;
    if (print)
        printf("%09u", output[1]);
    else if (output[1])
        printf("%u", output[1]), print = 1;
    if (print)
        printf("%09u\n", output[0]);
    else
        printf("%u\n", output[0]);
}

If I compile it with

gcc -Wall -std=c99 -fopenmp  -o permanent permanent.c

I can then run it with

permanent -+ -+

and get the output

-2

which is correct.

If I instead compile using the Intel C compiler (17.0.1) with

icc -std=c99 -qopenmp -Wall permanent.c

and then do

a.out -+ -+

I get

11910984139051480114196905982

As noted in the comments, if you remove the -qopenmp then icc produces a version that runs correctly, albeit only on one core.

解决方法

Looks like a bug in Intel's handling of OpenMP reduction on __int128 type variables. This can be easily reproduced:

#include <stdio.h>
#include <inttypes.h>

int main() {
    __int128 sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < 0; i++) {
        sum += 1;
    }
    printf("%" PRIX64 " %" PRIX64 "\n", (uint64_t)sum, (uint64_t)(sum >> 64));
}

Output with -fopenmp:

14000000000 78778300

Correct output without, or with gcc -fopenmp:

0 0

You can work around that (and keep the __int128 type) by using a trivial custom reduction declaration:

#pragma omp declare reduction(add128: int128_t: omp_out = omp_out + omp_in) initializer(omp_priv = 0)
[...]
#pragma omp parallel for reduction(add128:permanent) num_threads(NUM_THREADS)

As far as I can tell the standard does not restrict the type of a reduction list element. At least the compiler should tell you if it's not supported.

相似信息