This commit is contained in:
iceman1001 2024-04-11 22:08:25 +02:00
parent 2612b87f41
commit 9a73e77d72
4 changed files with 1468 additions and 51 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,122 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_LAYER_H_
#define OPENCL_CL_LAYER_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#include <CL/cl_icd.h>
#include <CL/cl.h>
/*
** Prototype-suppression switches. The defines below cascade so that setting
** a broader switch also sets the narrower ones. The narrowest one,
** CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES, is the macro that guards the
** clGetLayerInfo/clInitLayer prototypes in this header.
*/
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_loader_layers
***************************************************************/
/* Defined to 1 so that code can test for this extension's declarations with #ifdef. */
#define cl_loader_layers 1
/* The extension's name as a string literal. */
#define CL_LOADER_LAYERS_EXTENSION_NAME \
"cl_loader_layers"
/* Selector type used for the param_name argument of clGetLayerInfo. */
typedef cl_uint cl_layer_info;
/* Integer type for a layer API version; presumably holds values such as
   CL_LAYER_API_VERSION_100 - confirm against the cl_loader_layers spec. */
typedef cl_uint cl_layer_api_version;
/* cl_layer_info */
#define CL_LAYER_API_VERSION 0x4240
#define CL_LAYER_NAME 0x4241
/* Misc API enums */
#define CL_LAYER_API_VERSION_100 100
/*
** Function type with the same signature as the clGetLayerInfo prototype
** declared in this header.
**
**   param_name           - cl_layer_info selector (CL_LAYER_API_VERSION or
**                          CL_LAYER_NAME are the values defined here).
**   param_value_size     - size_t describing the param_value buffer.
**   param_value          - caller-supplied output buffer.
**   param_value_size_ret - size_t output.
**
** Returns a cl_int status code.
** NOTE(review): the parameters presumably follow the usual OpenCL clGet*Info
** convention (sizes in bytes, *_ret receives the required size) - confirm
** against the cl_loader_layers specification.
*/
typedef cl_int CL_API_CALL
clGetLayerInfo_t(
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
/* Pointer-to-function form of clGetLayerInfo_t. */
typedef clGetLayerInfo_t *
clGetLayerInfo_fn ;
/*
** Function type with the same signature as the clInitLayer prototype
** declared in this header.
**
**   num_entries        - cl_uint count associated with target_dispatch.
**   target_dispatch    - read-only cl_icd_dispatch table passed in.
**   num_entries_ret    - cl_uint output.
**   layer_dispatch_ret - output: pointer to a read-only cl_icd_dispatch table.
**
** Returns a cl_int status code.
** NOTE(review): presumably target_dispatch is the table the layer forwards
** to, and layer_dispatch_ret/num_entries_ret describe the layer's own table -
** confirm against the cl_loader_layers specification.
*/
typedef cl_int CL_API_CALL
clInitLayer_t(
cl_uint num_entries,
const cl_icd_dispatch* target_dispatch,
cl_uint* num_entries_ret,
const cl_icd_dispatch** layer_dispatch_ret);
/* Pointer-to-function form of clInitLayer_t. */
typedef clInitLayer_t *
clInitLayer_fn ;
/*
** The function pointer typedefs prefixed with "pfn_" are provided for
** compatibility with earlier versions of the headers. New code is
** encouraged to use the function pointer typedefs that are suffixed with
** "_fn" instead, for consistency.
*/
/* Legacy alias: same type as clGetLayerInfo_fn. */
typedef clGetLayerInfo_t *
pfn_clGetLayerInfo ;
/* Legacy alias: same type as clInitLayer_fn. */
typedef clInitLayer_t *
pfn_clInitLayer ;
/*
** Prototypes for the two layer entry points. They are omitted when
** CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES is defined, which
** CL_NO_PROTOTYPES and CL_NO_EXTENSION_PROTOTYPES both imply through the
** preprocessor logic near the top of this header. The signatures match the
** clGetLayerInfo_t and clInitLayer_t function types.
** NOTE(review): presumably these symbols are exported by a layer library
** rather than by the OpenCL runtime itself - confirm against the loader
** layers documentation.
*/
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetLayerInfo(
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) ;
extern CL_API_ENTRY cl_int CL_API_CALL
clInitLayer(
cl_uint num_entries,
const cl_icd_dispatch* target_dispatch,
cl_uint* num_entries_ret,
const cl_icd_dispatch** layer_dispatch_ret) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_LAYER_H_ */

View file

@ -830,9 +830,13 @@ int main(int argc, char **argv) {
continue;
}
if (len == 0) continue;
if (len == 0) {
continue;
}
if (len > 0xdeadbe) len = 0xdeadbe; // limit build_log size
if (len > 0xdeadbe) {
len = 0xdeadbe; // limit build_log size
}
char *buffer = (char *) calloc(len, sizeof(char));
if (!buffer) {
@ -908,27 +912,40 @@ int main(int argc, char **argv) {
z = 0;
for (w = 0; w < ocl_platform_cnt; w++) {
if (!cd_ctx[w].selected) continue;
if (!cd_ctx[w].selected) {
continue;
}
for (q = 0; q < cd_ctx[w].device_cnt; q++) {
if (!cd_ctx[w].device[q].selected) continue;
if (!cd_ctx[w].device[q].selected) {
continue;
}
ctx.global_ws[z] = (1 << profiles[profile][1]);
// the following happens with cpu devices or Apple GPU
if (ctx.local_ws[z] > 256) {
if (cd_ctx[w].is_apple) ctx.local_ws[z] = 256;
else if (!cd_ctx[w].device[q].is_gpu) ctx.local_ws[z] = 256;
if (cd_ctx[w].is_apple) {
ctx.local_ws[z] = 256;
} else if (!cd_ctx[w].device[q].is_gpu) {
ctx.local_ws[z] = 256;
}
}
// don't allow gws < lws
if (ctx.global_ws[z] < ctx.local_ws[z]) ctx.local_ws[z] = ctx.global_ws[z];
if (ctx.global_ws[z] < ctx.local_ws[z]) {
ctx.local_ws[z] = ctx.global_ws[z];
}
if (opencl_profiling) {
printf("[%zu] global_ws %zu, local_ws %zu\n", g, ctx.global_ws[z], ctx.local_ws[z]);
}
if (!ctx.force_hitag2_opencl) {
if (!(matches[z] = (uint64_t *) calloc((uint32_t)(ctx.global_ws[z] * WGS_MATCHES_FACTOR), sizeof(uint64_t)))) {
printf("[%zu] Error: calloc (matches) failed (%d): %s\n", g, errno, strerror(errno));
MEMORY_FREE_OPENCL(ctx, z)
@ -937,7 +954,9 @@ int main(int argc, char **argv) {
MEMORY_FREE_ALL
exit(2);
}
} else { // one
if (!(matches[z] = (uint64_t *) calloc(1, sizeof(uint64_t)))) {
printf("[%zu] Error: calloc (matches) failed (%d): %s\n", z, errno, strerror(errno));
MEMORY_FREE_OPENCL(ctx, z)
@ -1093,7 +1112,9 @@ int main(int argc, char **argv) {
printf("[queue] Fill queue with pre-calculated offset using profile (%d): ", profile);
#endif
for (size_t step = 0; step < max_step; step++) wu_queue_push(&ctx.queue_ctx, step, step << chunk, max_step);
for (size_t step = 0; step < max_step; step++) {
wu_queue_push(&ctx.queue_ctx, step, step << chunk, max_step);
}
#if DEBUGME > 0
printf("done\n");

View file

@ -29,13 +29,18 @@ bool plat_dev_enabled(unsigned int id, const unsigned int *sel,
unsigned int cnt, unsigned int cur_type, unsigned int allow_type) {
// useful only with devices
if (allow_type != CL_DEVICE_TYPE_ALL) {
if (cur_type != allow_type) return false;
if (cur_type != allow_type) {
return false;
}
}
if (sel[0] == 0xff) return true; // all
else {
if (sel[0] == 0xff) {
return true; // all
} else {
for (unsigned int i = 0; i < cnt; i++) {
if (sel[i] == (id + 1)) return true;
if (sel[i] == (id + 1)) {
return true;
}
}
}
@ -43,15 +48,18 @@ bool plat_dev_enabled(unsigned int id, const unsigned int *sel,
}
/*
 * Return the smallest `profile` value found among the selected devices of the
 * selected platforms.
 *
 * cd_ctx            array of platform contexts to scan
 * ocl_platform_cnt  number of entries in cd_ctx
 *
 * The search starts from 0xFF; if the result is still above 10 afterwards
 * (for example because no selected device was found), 0 is returned instead.
 */
unsigned int get_smallest_profile(compute_platform_ctx_t *cd_ctx, size_t ocl_platform_cnt) {
    unsigned int profile = 0xFF;

    // size_t indices: the platform bound is a size_t and the debug printf
    // below prints the device index with %zu
    for (size_t x = 0; x < ocl_platform_cnt; x++) {
        if (!cd_ctx[x].selected) {
            continue;
        }

        for (size_t y = 0; y < cd_ctx[x].device_cnt; y++) {
            if (!cd_ctx[x].device[y].selected) {
                continue;
            }

#if DEBUGME > 1
            printf("[debug] Initial profile for device %zu: %d\n", y, cd_ctx[x].device[y].profile);
#endif

            // with the same devices the best will be selected,
            // but for different devices in the same platform we need the worst for now (todo)
            if (cd_ctx[x].device[y].profile < profile) {
                profile = cd_ctx[x].device[y].profile;
            }
        }
    }

    // at worst, set profile to 0
    if (profile > 10) {
        profile = 0;
    }

    return profile;
}
@ -118,17 +130,26 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
unsigned int global_device_id = 0;
if (verbose) printf("- Found %u OpenCL Platform(s)\n", ocl_platform_cnt);
if (verbose) {
printf("- Found %u OpenCL Platform(s)\n", ocl_platform_cnt);
}
for (cl_uint platform_idx = 0; platform_idx < ocl_platform_cnt; platform_idx++) {
(*cd_ctx)[platform_idx].platform_id = ocl_platforms[platform_idx];
(*cd_ctx)[platform_idx].selected = plat_dev_enabled(platform_idx, plat_sel, plat_cnt, 0, 0);
if ((*cd_ctx)[platform_idx].selected)(*selected_platforms_cnt)++;
if ((*cd_ctx)[platform_idx].selected) {
(*selected_platforms_cnt)++;
}
if (verbose) printf("\n-- Platform ID: %u\n", platform_idx + 1);
if (verbose) {
printf("\n-- Platform ID: %u\n", platform_idx + 1);
}
for (info_idx = 0; info_idx < ocl_platforms_info_cnt; info_idx++) {
cl_platform_info ocl_info = ocl_platforms_info[info_idx];
err = clGetPlatformInfo((*cd_ctx)[platform_idx].platform_id, ocl_info, 0, NULL, &tmp_len);
@ -169,7 +190,6 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
if (verbose) {
const char *tmp_info_desc = (info_idx == 0) ? "Name" : (info_idx == 1) ? "Vendor" : "Version";
printf("%14s: %s\n", tmp_info_desc, tmp_buf);
}
@ -186,18 +206,31 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
}
if (info_idx == 1) {
if (!strncmp(tmp_buf, "NVIDIA", 6))(*cd_ctx)[platform_idx].is_nv = true;
else if (!strncmp(tmp_buf, "Apple", 5)) { (*cd_ctx)[platform_idx].is_apple = true; (*cd_ctx)[platform_idx].warning = true; }
else if (!strncmp(tmp_buf, "Intel", 5))(*cd_ctx)[platform_idx].is_intel = true;
else if (!strncmp(tmp_buf, "The pocl project", 16))(*cd_ctx)[platform_idx].is_pocl = true;
}
if (!strncmp(tmp_buf, "NVIDIA", 6)) {
(*cd_ctx)[platform_idx].is_nv = true;
} else if (!strncmp(tmp_buf, "Apple", 5)) {
(*cd_ctx)[platform_idx].is_apple = true;
(*cd_ctx)[platform_idx].warning = true;
} else if (!strncmp(tmp_buf, "Intel", 5)) {
(*cd_ctx)[platform_idx].is_intel = true;
} else if (!strncmp(tmp_buf, "The pocl project", 16)) {
(*cd_ctx)[platform_idx].is_pocl = true;
}
}
free(tmp_buf);
}
if (!show && verbose) {
printf("%14s: %s\n", "Selected", ((*cd_ctx)[platform_idx].selected) ? "yes" : "no");
if ((*cd_ctx)[platform_idx].warning) printf("\n%14s: performance will not be optimal using this platform\n\n", "=====> Warning");
if ((*cd_ctx)[platform_idx].warning) {
printf("\n%14s: performance will not be optimal using this platform\n\n", "=====> Warning");
}
}
// enum devices with this platform
@ -214,7 +247,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
err = clGetDeviceIDs((*cd_ctx)[platform_idx].platform_id, CL_DEVICE_TYPE_ALL, ocl_device_max, ocl_devices, &ocl_device_cnt);
if (ocl_device_cnt == 0) {
if (device_types_selected == CL_DEVICE_TYPE_ALL) printf("No device(s) available with platform id %u\n", platform_idx);
if (device_types_selected == CL_DEVICE_TYPE_ALL) {
printf("No device(s) available with platform id %u\n", platform_idx);
}
(*cd_ctx)[platform_idx].device_cnt = 0;
continue;
}
@ -227,16 +262,23 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
return -9;
}
if (verbose) printf("%14s: %u\n", "Device(s)", ocl_device_cnt);
if (verbose) {
printf("%14s: %u\n", "Device(s)", ocl_device_cnt);
}
(*cd_ctx)[platform_idx].device_cnt = ocl_device_cnt;
for (unsigned int device_idx = 0; device_idx < ocl_device_cnt; device_idx++) {
memset(&(*cd_ctx)[platform_idx].device[device_idx], 0, sizeof(compute_device_ctx_t));
cl_device_id ocl_device = ocl_devices[device_idx];
(*cd_ctx)[platform_idx].device[device_idx].platform_id = (*cd_ctx)[platform_idx].platform_id;
if (verbose) printf("---- * ID: %u\n", global_device_id + 1);
if (verbose) {
printf("---- * ID: %u\n", global_device_id + 1);
}
for (info_idx = 0; info_idx < ocl_devices_info_cnt; info_idx++) {
cl_device_info ocl_dev_info = ocl_devices_info[info_idx];
@ -253,19 +295,31 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
return -10;
}
if (device_type & CL_DEVICE_TYPE_GPU)(*cd_ctx)[platform_idx].device[device_idx].is_gpu = 1;
else if ((device_type & CL_DEVICE_TYPE_CPU) && (*cd_ctx)[platform_idx].is_pocl) {
if (device_type & CL_DEVICE_TYPE_GPU) {
(*cd_ctx)[platform_idx].device[device_idx].is_gpu = 1;
} else if ((device_type & CL_DEVICE_TYPE_CPU) && (*cd_ctx)[platform_idx].is_pocl) {
(*cd_ctx)[platform_idx].device[device_idx].profile = (profile_selected > 1) ? 0 : profile_selected;
}
if (verbose) printf("%14s: %s\n", "Device Type", (device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : (device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : "Other");
if (verbose) {
printf("%14s: %s\n", "Device Type", (device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : (device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : "Other");
}
if ((*cd_ctx)[platform_idx].selected == false)(*cd_ctx)[platform_idx].device[device_idx].selected = false;
else (*cd_ctx)[platform_idx].device[device_idx].selected = plat_dev_enabled(global_device_id, dev_sel, dev_cnt, (unsigned int) device_type, device_types_selected);
if ((*cd_ctx)[platform_idx].selected == false) {
(*cd_ctx)[platform_idx].device[device_idx].selected = false;
} else {
(*cd_ctx)[platform_idx].device[device_idx].selected = plat_dev_enabled(global_device_id, dev_sel, dev_cnt, (unsigned int) device_type, device_types_selected);
}
global_device_id++;
if ((*cd_ctx)[platform_idx].device[device_idx].selected)(*selected_devices_cnt)++;
if ((*cd_ctx)[platform_idx].device[device_idx].selected) {
(*selected_devices_cnt)++;
}
continue;
} else if (info_idx == 5) {
cl_device_local_mem_type local_mem_type;
err = clGetDeviceInfo(ocl_device, ocl_dev_info, sizeof(cl_device_local_mem_type), &local_mem_type, 0);
@ -278,29 +332,47 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
}
if (local_mem_type == CL_LOCAL || local_mem_type == CL_GLOBAL) {
if (verbose) printf("%14s: %s\n", "Local Mem Type", (local_mem_type == CL_LOCAL) ? "Local" : "Global");
if (verbose) {
printf("%14s: %s\n", "Local Mem Type", (local_mem_type == CL_LOCAL) ? "Local" : "Global");
}
if ((*cd_ctx)[platform_idx].is_apple) {
if (strncmp((*cd_ctx)[platform_idx].device[device_idx].vendor, "Intel", 5) != 0) {
(*cd_ctx)[platform_idx].device[device_idx].have_local_memory = true;
if ((*cd_ctx)[platform_idx].device[device_idx].is_gpu) {
if (profile_selected > 2)(*cd_ctx)[platform_idx].device[device_idx].profile = PROFILE_DEFAULT; // Apple-Intel GPU's
if (profile_selected > 2) {
(*cd_ctx)[platform_idx].device[device_idx].profile = PROFILE_DEFAULT; // Apple-Intel GPU's
}
} else {
if (profile_selected > 3)(*cd_ctx)[platform_idx].device[device_idx].profile = PROFILE_DEFAULT; // Apple-Intel CPU's
if (profile_selected > 3) {
(*cd_ctx)[platform_idx].device[device_idx].profile = PROFILE_DEFAULT; // Apple-Intel CPU's
}
}
}
} else if ((*cd_ctx)[platform_idx].is_nv) {
(*cd_ctx)[platform_idx].device[device_idx].have_local_memory = true;
}
} else {
if (verbose) printf("%14s: None\n", "Local Mem Type");
if (verbose) {
printf("%14s: None\n", "Local Mem Type");
}
}
if (verbose) printf("%14s: %s\n", "Local Mem Opt", ((*cd_ctx)[platform_idx].device[device_idx].have_local_memory) ? "yes" : "no");
if (verbose) {
printf("%14s: %s\n", "Local Mem Opt", ((*cd_ctx)[platform_idx].device[device_idx].have_local_memory) ? "yes" : "no");
}
continue;
} else if (info_idx == 6) {
size_t wis[3] = { 0 };
err = clGetDeviceInfo(ocl_device, ocl_dev_info, sizeof(size_t) * 3, wis, 0);
if (err != CL_SUCCESS) {
printf("Error: clGetDeviceInfo(work_items_size) failed (%d)\n", err);
@ -310,7 +382,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
return -10;
}
if (verbose) printf("%14s: (%zu,%zu,%zu)\n", "Max Work-Items", wis[0], wis[1], wis[2]);
if (verbose) {
printf("%14s: (%zu,%zu,%zu)\n", "Max Work-Items", wis[0], wis[1], wis[2]);
}
#if APPLE_GPU_BROKEN == 1
if (wis[1] < GLOBAL_WS_1 && (*cd_ctx)[platform_idx].device[device_idx].is_apple_gpu) {
@ -318,8 +392,11 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
}
#endif
continue;
} else if (info_idx == 7) {
cl_uint cores = 0;
err = clGetDeviceInfo(ocl_device, ocl_dev_info, sizeof(cl_uint), &cores, 0);
if (err != CL_SUCCESS) {
printf("Error: clGetDeviceInfo(compute_units) failed (%d)\n", err);
@ -329,7 +406,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
return -10;
}
if (verbose) printf("%14s: %u\n", "Compute Units", cores);
if (verbose) {
printf("%14s: %u\n", "Compute Units", cores);
}
(*cd_ctx)[platform_idx].device[device_idx].compute_units = cores;
continue;
@ -404,8 +483,11 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
if (strstr(tmp_buf, "Tegra") && (*cd_ctx)[platform_idx].is_pocl) {
(*cd_ctx)[platform_idx].device[device_idx].profile = (profile_selected > 1) ? 0 : profile_selected;
}
} else if (info_idx == 4) {
if (!strncmp(tmp_buf, "Intel", 5)) {
if ((*cd_ctx)[platform_idx].is_apple) {
(*cd_ctx)[platform_idx].device[device_idx].is_apple_gpu = (*cd_ctx)[platform_idx].device[device_idx].is_gpu;
}
@ -438,7 +520,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
(*cd_ctx)[platform_idx].device[device_idx].sm_maj = sm_maj;
(*cd_ctx)[platform_idx].device[device_idx].sm_min = sm_min;
if (verbose) printf("%14s: %u%u\n", "SM", sm_maj, sm_min);
if (verbose) {
printf("%14s: %u%u\n", "SM", sm_maj, sm_min);
}
if (sm_maj >= 5) { // >= Maxwell
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3
@ -464,7 +548,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
free(tmp_buf);
}
if (!show && verbose) printf("%14s: %s\n", "Selected", ((*cd_ctx)[platform_idx].device[device_idx].selected) ? "yes" : "no");
if (!show && verbose) {
printf("%14s: %s\n", "Selected", ((*cd_ctx)[platform_idx].device[device_idx].selected) ? "yes" : "no");
}
if ((*cd_ctx)[platform_idx].device[device_idx].unsupported) {
printf("\n%14s: this device was not supported, because of missing resources\n\n", "=====> Warning");
@ -472,7 +558,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
}
if ((*cd_ctx)[platform_idx].device[device_idx].warning) {
if (!show && verbose) printf("\n%14s: performance will not be optimal using this device\n\n", "=====> Warning");
if (!show && verbose) {
printf("\n%14s: performance will not be optimal using this device\n\n", "=====> Warning");
}
}
(*cd_ctx)[platform_idx].device[device_idx].device_id = ocl_device;
@ -486,7 +574,9 @@ int discoverDevices(unsigned int profile_selected, uint32_t device_types_selecte
*platform_detected_cnt = ocl_platform_cnt;
if (show) free(*cd_ctx);
if (show) {
free(*cd_ctx);
}
return 0;
}
@ -582,7 +672,7 @@ int runKernel(opencl_ctx_t *ctx, uint32_t cand_base, uint64_t *matches, uint32_t
if (ctx->force_hitag2_opencl) {
if (matches_found[0] != 1) printf("[%zu] BUG: if match the counter must be 1. Here %u are founds\n", id, matches_found[0]);
} else {
if (matches_found[0] > (uint32_t)(ctx->global_ws[id]*WGS_MATCHES_FACTOR)) {
if (matches_found[0] > (uint32_t)(ctx->global_ws[id] * WGS_MATCHES_FACTOR)) {
printf("[%zu] BUG: the next clEnqueueReadBuffer will crash. 'matches' buffer (%u) is lower than requested (%u)\n", id, (uint32_t)(ctx->global_ws[id]*WGS_MATCHES_FACTOR), matches_found[0]);
}
}