Cuda error codes - Исправление ошибок и поиск оптимальных решений проблем

Each CUDA Runtime function returns a cudaError_t which takes one of CUDA Error Types values.

| cudaMallocHost
(void **ptr, size_t size) | Allocates page-locked memory on the host.

cudaError_t = cudaSuccess, cudaErrorMemoryAllocation |
| cudaMallocManaged
(void **devPtr, size_t size, unsigned int flags) | Allocates memory that will be automatically managed by the Unified Memory system.

cudaError_t = cudaSuccess, cudaErrorMemoryAllocation cudaErrorNotSupported cudaErrorInvalidValue |
| cudaHostAlloc
(void **pHost, size_t size, unsigned int flags) | Allocates page-locked memory on the host.

cudaError_t = cudaSuccess, cudaErrorMemoryAllocation |
| cudaFreeHost
(void *ptr) | Frees page-locked memory.

cudaError_t = cudaSuccess, cudaErrorInitializationError |
| cudaMallocMipmappedArray
(cudaMipmappedArray_t *mipmappedArray, const cudaChannelFormatDesc *desc, cudaExtent extent, unsigned int numLevels, unsigned int flags) | Allocate a mipmapped array on the device.

cudaError_t = cudaSuccess, cudaErrorMemoryAllocation |
| cudaGetMipmappedArrayLevel
(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) | Gets a mipmap level of a CUDA

mipmapped array.

cudaError_t = cudaSuccess, cudaErrorInvalidValue |
| cudaFreeMipmappedArray
(cudaMipmappedArray_t mipmappedArray) | Frees a mipmapped array on the device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError |
| cudaGetSymbolAddress
(void **devPtr, const void *symbol) | Finds the address associated with a CUDA symbol.

cudaError_t = cudaSuccess, cudaErrorInvalidSymbol |
| cudaGetSymbolSize
(size_t *size, const void *symbol) | Finds the size of the object associated with a CUDA symbol.

cudaError_t = cudaSuccess, cudaErrorInvalidSymbol |
| cudaHostGetDevicePointer
(void **pDevice, void *pHost, unsigned int flags) | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation |
| cudaHostGetFlags
(unsigned int *pFlags, void *pHost) | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.

cudaError_t = cudaSuccess, cudaErrorInvalidValue |
| cudaHostRegister
(void *ptr, size_t size, unsigned int flags) | Registers an existing host memory range for use by CUDA.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation |
| cudaHostUnregister
(void *ptr) | Unregisters a memory range that was registered with cudaHostRegister.

cudaError_t = cudaSuccess, cudaErrorInvalidValue |
| cudaMallocPitch
(void **devPtr, size_t *pitch, size_t width, size_t height) | Allocates pitched memory on the device.

cudaError_t = cudaSuccess, cudaErrorMemoryAllocation |
| cudaMemcpy
(void *dst, const void *src, size_t count, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2D
(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DArrayToArray
(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DAsync
(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DFromArray
(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DFromArrayAsync
(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DToArray
(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy2DToArrayAsync
(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy3D
(const cudaMemcpy3DParms *p) | Copies data between 3D objects.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy3DAsync
(const cudaMemcpy3DParms *p, cudaStream_t stream) | Copies data between 3D objects.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpy3DPeer
(const cudaMemcpy3DPeerParms *p) | Copies memory between devices.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice |
| cudaMemcpy3DPeerAsync
(const cudaMemcpy3DPeerParms *p, cudaStream_t stream) | Copies memory between devices asynchronously.

cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice |
| cudaMemcpyArrayToArray
(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyAsync
(void *dst, const void *src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyFromArray
(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyFromArrayAsync
(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyFromSymbol
(void *dst, const void *symbol, size_t count, size_t offset, cudaMemcpyKind kind) | Copies data from the given symbol on the device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyFromSymbolAsync
(void *dst, const void *symbol, size_t count, size_t offset, cudaMemcpyKind kind, cudaStream_t stream) | Copies data from the given symbol on the device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyPeer
(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) | Copies memory between two devices.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice |
| cudaMemcpyPeerAsync
(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream) | Copies memory between two devices asynchronously.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice |
| cudaMemcpyToArray
(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, cudaMemcpyKind kind) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyToArrayAsync
(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) | Copies data between host and device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyToSymbol
(const void *symbol, const void *src, size_t count, size_t offset, cudaMemcpyKind kind) | Copies data to the given symbol on the device.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemcpyToSymbolAsync
(const void *symbol, const void *src, size_t count, size_t offset, cudaMemcpyKind kind, cudaStream_t stream) | Copies data to the given symbol on the device.

cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol, cudaErrorInvalidDevicePointer, cudaErrorInvalidMemcpyDirection |
| cudaMemset
(void *devPtr, int value, size_t count) | Initializes or sets device memory to a value.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |
| cudaMemset2D
(void *devPtr, size_t pitch, int value, size_t width, size_t height) | Initializes or sets device memory to a value.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |
| cudaMemset2DAsync
(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream) | Initializes or sets device memory to a value.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |
| cudaMemset3D
(cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent) | Initializes or sets device memory to a value.

cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |
| cudaMemset3DAsync
(cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent, cudaStream_t stream) | Initializes or sets device memory to a value.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |
| cudaMemsetAsync
(void *devPtr, int value, size_t count, cudaStream_t stream) | Initializes or sets device memory to a value.

cudaError_t = cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevicePointer |

Источник

Author: Greg Gutmann
Affiliation: Tokyo Institute of Technology, Nvidia University Ambassador, Nvidia DLI

This is a continuation of my posts on CUDA programming, for the previous post on thread indexing and memory click here [Post 2]. In this post, we will look at methods of error checking and debugging GPU code. However, the the CUDA API function calls are not explained in detail, for this I recommend the CUDA documentation [Error Handling].

For beginners, it is not necessary to try to fully understand everything mentioned, specifically later in the post when talking about asynchronous and concurrent GPU debugging. But it is valuable to have a bit of familiarity with the topic for when it becomes relevant in the future.

Debugging with Print Statements

Caution: the SyntaxHighlighter plugin used for the code block has an error at the time of posting. If you see “&” replace it with “&”.

// 0_creatingErrors.cu
#include <stdio.h>

__global__ void kernelA(int * globalArray){
	int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;

	// If the problem is small or if printing a subset of the problem 
	// (inside conditional expression, etc...). 
	// Then using printf inside of a kernel can be a viable debugging approach.
	printf("blockIdx.x:%d * blockDim.x:%d + threadIdx.x:%d = globalThreadId:%dn", blockIdx.x, blockDim.x, threadIdx.x, globalThreadId);
	globalArray[globalThreadId] = globalThreadId;
}
 
int main()
{
	int elementCount = 32;
	int dataSize = elementCount * sizeof(int);
	
	cudaSetDevice(0);
	
	int * managedArray;
	cudaMallocManaged(&managedArray, dataSize);

	kernelA <<<4,8>>>(managedArray);

    cudaDeviceSynchronize(); 
	
	printf("n");

	// Printing a portion of results can be another good debugging approach
	for(int i = 0; i < elementCount; i++){
		printf("%d%s", managedArray[i], (i < elementCount - 1) ? ", " : "n");
	}	
	
	cudaFree(managedArray);

	cudaDeviceReset();
 
	return 0;
}

One of the most basic methods of debugging, or confirming results, is to simply print out the results.

0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

By doing this we can see there is a problem with indices 8-31, and potentially all indices. But there is not a lot of information here, it would likely be even less enlightening if the kernel was more complex.

The second way to print is from the CUDA kernel itself.

Sample's Output from the GPU Kernel:
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:0 = globalThreadId:0
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:1 = globalThreadId:1
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:2 = globalThreadId:2
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:3 = globalThreadId:3
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:4 = globalThreadId:4
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:5 = globalThreadId:5
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:6 = globalThreadId:6
blockIdx.x:1 * blockDim.x:8 + threadIdx.x:7 = globalThreadId:7
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:0 = globalThreadId:0
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:1 = globalThreadId:1
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:2 = globalThreadId:2
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:3 = globalThreadId:3
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:4 = globalThreadId:4
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:5 = globalThreadId:5
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:6 = globalThreadId:6
blockIdx.x:3 * blockDim.x:8 + threadIdx.x:7 = globalThreadId:7
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:0 = globalThreadId:0
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:1 = globalThreadId:1
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:2 = globalThreadId:2
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:3 = globalThreadId:3
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:4 = globalThreadId:4
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:5 = globalThreadId:5
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:6 = globalThreadId:6
blockIdx.x:2 * blockDim.x:8 + threadIdx.x:7 = globalThreadId:7
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:0 = globalThreadId:0
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:1 = globalThreadId:1
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:2 = globalThreadId:2
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:3 = globalThreadId:3
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:4 = globalThreadId:4
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:5 = globalThreadId:5
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:6 = globalThreadId:6
blockIdx.x:0 * blockDim.x:8 + threadIdx.x:7 = globalThreadId:7

From this, we can see that the correct number of threads and blocks are launched but there is something wrong when computing globalThreadId (blockIdx.y was used instead of blockIdx. x). If the blockIdx.y was printed it would be more obvious what is happening, all zeros, but often mistakes appear in one spot but not another.

Another detail that is easy to see here is that, when running massively parallel code, print statements may quickly become overwhelming. However, targeted printing can still be useful.

Part 1: Error Checking Helpers

It is often not convenient or clean looking to always write error checking code after every operation or function. Thus it is common for many to write macros or functions to speed up writing code and make it look cleaner.

Below an example of explicitly writing out error checking operations, taken from Visual Studio’s CUDA startup project. On the plus side with this approach, you could provide very exact error reporting and possible fixes.

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

Below the helper samples in the following subsections have been broken up into .cu and .cuh files for simpler compilation (linking, etc…).

CUDA Error Checking Macros

Macros are a popular option as they can automatically collect and report information about the file, function and line number. If you browse the samples included with the CUDA SDK you will come across error checking macros. The sample code below was actually originally from there but has been modified quite a bit.

// errorCheckingMacro.cuh
#ifndef CHECK_CUDA_ERROR_M_H
#define CHECK_CUDA_ERROR_M_H

#define PRINT_ON_SUCCESS 1

// To be used around calls that return an error code, ex. cudaDeviceSynchronize or cudaMallocManaged
void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort = true);
#define checkCUDAError(val) { checkError((val), #val, __FILE__, __LINE__); }	// in-line regular function
#define checkCUDAError2(val) check((val), #val, __FILE__, __LINE__) // typical macro 

// To be used after calls that do not return an error code, ex. kernels to check kernel launch errors
void checkLastError(char const * func, const char *file, const int line, bool abort = true);
#define checkLastCUDAError(func) { checkLastError(func, __FILE__, __LINE__); }
#define checkLastCUDAError_noAbort(func) { checkLastError(func, __FILE__, __LINE__, 0); }

#endif // CHECK_CUDA_ERROR_M_H

In the above sample on lines 9 and 10 two different ways of writing the same macro can be seen. I tend to use curly brackets since it acts like a regular function when invoked. For more details on macros see this post: http://www.ebyte.it/library/codesnippets/WritingCppMacros.html#4

// errorCheckingMacro.cu
#include "errorCheckingMacro.cuh"
#include <stdio.h>

// Assumes single device when calling cudaDeviceReset(); and exit(code);
// In some cases a more lengthy program clean up / termination may be needed

void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort)
{
	if (code != cudaSuccess) 
	{
		const char * errorMessage = cudaGetErrorString(code);
		fprintf(stderr, "CUDA error returned from "%s" at %s:%d, Error code: %d (%s)n", func, file, line, code, errorMessage);
		if (abort){
			cudaDeviceReset();
			exit(code);
		}
	}
	else if (PRINT_ON_SUCCESS)
	{
		const char * errorMessage = cudaGetErrorString(code);
		fprintf(stderr, "CUDA error returned from "%s" at %s:%d, Error code: %d (%s)n", func, file, line, code, errorMessage);
	}
}

void checkLastError(char const * func, const char *file, const int line, bool abort)
{
	cudaError_t code = cudaGetLastError();
	if (code != cudaSuccess)
	{
		const char * errorMessage = cudaGetErrorString(code);
		fprintf(stderr, "CUDA error returned from "%s" at %s:%d, Error code: %d (%s)n", func, file, line, code, errorMessage);
		if (abort) {
			cudaDeviceReset();
			exit(code);
		}
	}
	else if (PRINT_ON_SUCCESS)
	{
		const char * errorMessage = cudaGetErrorString(code);
		fprintf(stderr, "CUDA error returned from "%s" at %s:%d, Error code: %d (%s)n", func, file, line, code, errorMessage);
	}
}

Above, the first function shown takes the CUDA error code as one of its parameters, then uses that to check if an error occurred and if so what kind. The second function instead calls cudaGetLastError which is needed when CUDA operations do not return an error code (ex. kernels).

CUDA Error Checking Functions with Added Functionality

It should be noted that the examples shown below could be converted into macros. The samples are intended to show another approach and some additional methods of error checking asynchronous operations.

Non-macro based error checking lacks the ability to automatically gather the file name, the function name and the line number for printing or other uses. As a solution to this, when calling the error checking function I pass in an identification string.

// errorChecking.cuh
#ifndef CHECK_CUDA_ERROR_H
#define CHECK_CUDA_ERROR_H

// This could be set with a compile time flag ex. DEBUG or _DEBUG
// But then would need to use #if / #ifdef not if / else if in code
#define FORCE_SYNC_GPU 0
#define PRINT_ON_SUCCESS 1

cudaError_t checkAndPrint(const char * name, int sync = 0);
cudaError_t checkCUDAError(const char * name, int sync = 0);

#endif // CHECK_CUDA_ERROR_H

The code below shows error checking that will check errors with or without forced synchronization.

Pros: Forcing the code to synchronize will ensure the operation just called has finished prior to checking if any errors have occurred. Without doing this, errors from asynchronous calls may appear later on in the code when checking errors for other operations, leading to confusion. Or possibly the error never being reported.

Cons: forced synchronization is only to be used when debugging as the code will take a very large performance hit with constant synchronization calls. Also, forced synchronization may change how the code runs since it will prevent most operations from overlapping. For example, the code might normally use many asynchronous calls or call operations on independent threads running concurrently. This can lead to errors that show up in a release but the not show up when debugging.

// errorChecking.cu
#include "errorChecking.cuh"
#include <stdio.h>

cudaError_t checkAndPrint(const char * name, int sync) {
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		const char * errorMessage = cudaGetErrorString(err);
		fprintf(stderr, "CUDA error check "%s" returned ERROR code: %d (%s) %s n", name, err, errorMessage, (sync) ? "after sync" : "");
	}
	else if (PRINT_ON_SUCCESS) {
		printf("CUDA error check "%s" executed successfully %sn", name, (sync) ? "after sync" : "");
	}
	return err;
}

cudaError_t checkCUDAError(const char * name, int sync) {
	cudaError_t err = cudaSuccess;
	if (sync || FORCE_SYNC_GPU) {
		err = checkAndPrint(name, 0);
		cudaDeviceSynchronize();
		err = checkAndPrint(name, 1);
	}
	else {
		err = checkAndPrint(name, 0);
	}
	return err;
}

As seen above the function checkCUDAError has been designed to be called right after GPU functions instead of directly taking the return value from functions that return an error value. This is because not all CUDA functions return an error value, and some calls are asynchronous, as mentioned.

For the asynchronous case, the error checking function checks for errors after invoking the operation and after the operation has completed. Example case: kernels can have pre-launch errors (incorrect configuration) and kernel execution errors (errors running the GPU code).

Asynchronous possibilities:

Memory copies with Async in the name
Kernels

Concurrency can also be increased by using CUDA streams. (A future post, or check google now 🙂 )

Note: The error checking functions could be written to wrap the CUDA functions like a macro, there are many possibilities. General approaches will work but customizing it to your needs may become more desirable as your code becomes more complex.

cudaError_t checkCUDAError(cudaError_t err, const char * name, int sync = 0);
cudaError_t checkCUDAError(cudaError_t err);

Part 2: Using the Error Checking Code Above

Next, we will look at some short samples that make use of the error checking topics just covered.

Use of Error Checking Macros

This sample shows the use of the error checking macros. It also includes other concepts, such as printing a subset of results or writing testing functions.

// 2_creatingErrorsMacro.cu
#include "errorCheckingMacro.cuh"
#include <stdio.h>
 
__global__ void kernelA(int * globalArray){
	int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
	globalArray[globalThreadId] = globalThreadId;
}

// Testing function
void testResults(int * data, int N) {
	// For more information incorrect values with their index's can be printed when found
	int testPass = true;
	for (int i = 0; i < N; i++) {
		if (data[i] != i) {
			testPass = false;
		}
	}
	printf("Result of test: %snn", (testPass) ? "passed!" : "uh oh...");
}

int main()
{
	int elementCount = 10240;
	int dataSize = elementCount * sizeof(int);
	
	checkCUDAError(cudaSetDevice(0));
	
	int * managedArray;
	checkCUDAError(cudaMallocManaged(&managedArray, dataSize));
 
	kernelA <<<4,1500>>>(managedArray); // Too many threads per block. 
	// Also, not enough threads for the amount of data but that is not the focus for this sample. 
 
	checkLastCUDAError_noAbort("kernelA");

	checkCUDAError(cudaDeviceSynchronize());
	

	// Can print a subset when problem size is larger
	int printStart = elementCount - 16;
	int printEnd = elementCount;
	printf("nChecking values[%d-%d): ", printStart, printEnd); // Interval notation: https://en.wikipedia.org/wiki/Bracket_(mathematics)#Intervals
	for(int i = printStart; i < printEnd; i++){
		printf("%d%s", managedArray[i], (i < elementCount - 1) ? ", " : "n");
	}	
	
	// Or better yet, write a testing function and let the computer test for you
	testResults(managedArray, elementCount);
	
	checkCUDAError(cudaFree(managedArray));
	
	checkCUDAError(cudaDeviceReset());
 
	return 0;
}

Sample's Output: 
CUDA error returned from "cudaSetDevice(0)" at 2_creatingErrorsMacro.cu:15, Error code: 0 (no error)
CUDA error returned from "cudaMallocManaged(&managedArray, dataSize)" at 2_creatingErrorsMacro.cu:18, Error code: 0 (no error)
CUDA error returned from "kernelA" at 2_creatingErrorsMacro.cu:22, Error code: 9 (invalid configuration argument)
CUDA error returned from "cudaDeviceSynchronize()" at 2_creatingErrorsMacro.cu:24, Error code: 0 (no error)
Checking values[10224-10240): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
Result of test: uh oh…
CUDA error returned from "cudaFree(managedArray)" at 2_creatingErrorsMacro.cu:46, Error code: 0 (no error)
CUDA error returned from "cudaDeviceReset()" at 2_creatingErrorsMacro.cu:48, Error code: 0 (no error)

The error in this sample (error code: 9) on line 32 was caused by trying to use 1500 threads per block. The result of this error can also be seen when looking at the portion of results printed or by the result of the testing function.

Testing functions are generally a better approach as errors may occur in the portion of results that were not printed. Printing might often just be to make the developer more sure the results are correct by personally seeing them.

One detail that might seem strange is that I have added a feature to prevent the program from closing when an error occurs. From my experience with creating simulations, I have found that when an error occurs it may not always have an effect on the code. Depending on the situation it may be better to notify the user or silently log the error and program state, instead of closing the program immediately and frustrating the user.

Use of Error Checking Functions

The sample below was made to run on 32 elements but then modified to showcase errors when calling cudaSetDevice and cudaMallocManaged.

// 1_creatingErrors.cu
#include "errorChecking.cuh"
#include <stdio.h>

enum errorCheckFlag {
	NO_SYNC,
	SYNC
};

__global__ void kernelA(int * globalArray){
	int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
	globalArray[globalThreadId] = globalThreadId;
}
 
void helperFunction(int * managedMem){
	kernelA <<<4,8>>>(managedMem);
	checkCUDAError("<helperFunction> kernelA", SYNC); 
	// Showing a possible naming convention "<location>" for tracking down error   
	// locations when not using macros. A bit of work but it will save time.
}

int main()
{
	int elementCount = 32;
	int dataSize = elementCount * sizeof(int);
	
	cudaSetDevice(5); // The PC I am on does not have 6 GPU
	checkCUDAError("<main> cudaSetDevice");
	
	int * managedArray;
	cudaMallocManaged(&managedArray, dataSize * 1000000000); // My GPU do not have 32 GB
	checkCUDAError("<main> cudaMallocManaged");

	kernelA <<<4,8>>>(managedArray);
	checkCUDAError("<main> kernelA", SYNC);

    //cudaDeviceSynchronize(); // checkCUDAError will sync with flag SYNC
	
	cudaFree(managedArray);
	checkCUDAError("<main> cudaFree");

	cudaDeviceReset();
	checkCUDAError("<main> cudaDeviceReset");
 
	return 0;
}

Sample's Output as is:
CUDA error check "<main> cudaSetDevice" returned ERROR code: 10 (invalid device ordinal)
CUDA error check "<main> cudaMallocManaged" returned ERROR code: 2 (out of memory)
CUDA error check "<main> kernelA" executed successfully
CUDA error check "<main> kernelA" returned ERROR code: 77 (an illegal memory access was encountered) after sync
CUDA error check "<main> cudaFree" returned ERROR code: 77 (an illegal memory access was encountered)
CUDA error check "<main> cudaDeviceReset" executed successfully

The first error (10) was caused because there is no GPU with the device ID of 5 on my system.
The next error (2) was caused because the code attempted to allocate more memory than was available.
Then error (77), in both locations, was a result of trying to work with memory that was never successfully allocated.

Sample's Output if Corrected:
CUDA error check "<main> cudaSetDevice" executed successfully
CUDA error check "<main> cudaMallocManaged" executed successfully
CUDA error check "<main> kernelA" executed successfully
CUDA error check "<main> kernelA" executed successfully after sync
CUDA error check "<main> cudaFree" executed successfully
CUDA error check "<main> cudaDeviceReset" executed successfully

With the errors corrected, a device ID of 0 and removing “* 1000000000”, everything reports running successfully. Printing success is usually not done though.

Conclusion

In this post, we covered the use of error checking functions and the various approaches needed for debugging synchronous and asynchronous GPU operations. Also, though out the post other common debugging methods where mentioned that might prove useful when writing GPU code.

At this point, you should be able to write and debug simple GPU programs if you have also gone through [post 1] and [post 2] previously.

Additional Information

If you were curious about the naming convention, I found starting folders or files with numbers, like 1_creatingErrors.cu, makes for easy use of tab’s autocomplete feature in terminal windows. It is only intended for initial work with code or samples though.

This post became a bit longer than previous samples, so I am including my simple makefile. A future post may take a closer look at makefiles, but as is there are many pages on makefiles across the internet.

NVCC = nvcc
CUDAFLAGS = -arch=sm_61 -lcudart
OPT = -m64

all: zero one two

errorChecking.obj: errorChecking.cu errorChecking.cuh
	${NVCC} ${CUDAFLAGS} ${OPT} -c errorChecking.cu -o errorChecking.obj

errorCheckingMacro.obj: errorCheckingMacro.cu errorCheckingMacro.cuh
	${NVCC} ${CUDAFLAGS} ${OPT} -c errorCheckingMacro.cu -o errorCheckingMacro.obj

zero: 0_creatingErrors.cu
	${NVCC} ${CUDAFLAGS} ${OPT} -o zero 0_creatingErrors.cu

one: 1_creatingErrors.cu errorChecking.obj
	${NVCC} ${CUDAFLAGS} ${OPT} -o one 1_creatingErrors.cu errorChecking.obj

two: 2_creatingErrorsMacro.cu errorCheckingMacro.obj
	${NVCC} ${CUDAFLAGS} ${OPT} -o two 2_creatingErrorsMacro.cu errorCheckingMacro.obj
	
clean:
	${RM} *.o *.obj *.exp *.pdb *.exe zero one two

Contact me if you would like to use the contents of this post. Thanks 🙂
Copyright © 2019 by Gregory Gutmann

Источник

Introduction

Proper CUDA error checking is critical for making the CUDA program development smooth and successful. Missing or incorrectly identifying CUDA errors could cause problems in production or waste lots of time in debugging.

In this blog post, I would like to quickly discuss proper CUDA error checking.

CUDA Error Types

CUDA errors could be separated into synchronous and asynchronous errors, or sticky and non-sticky errors.

Synchronous Error VS Asynchronous Error

CUDA kernel launch is asynchronous, meaning when the host thread reaches the code for kernel launch, say kernel<<<...>>>, the host thread issues an request to execute the kernel on GPU, then the host thread that launches the kernel continues, without waiting for the kernel to complete. The kernel might not begin to execute right away either.

There could be two types of error for CUDA kernel launch, synchronous error and asynchronous error.

Synchronous error happens when the host thread knows the kernel is illegal or invalid. For example, when the thread block size or grid size is too large, a synchronous error is resulted immediately after the kernel launch call, and this error could be captured by CUDA runtime error capturing API calls, such as cudaGetLastError, right after the kernel launch call.

Asynchronous error happens during kernel execution or CUDA runtime asynchronous API execution on GPU. It might take a while to encounter the error and send the error to host thread. For example, For example, it might encounter accessing invalid memory address in the late stage of kernel execution or CUDA runtime asynchronous API cudaMemcpyAsync execution, it will abort the execution and then send the error back to thread. Even if there are CUDA runtime error capturing API calls, such as cudaGetLastError, right after the kernel launch call, at the time when the error reaches host, those CUDA runtime error capturing API calls have been executed and they found no error. It is possible to capture the asynchronous error by explicitly synchronizing using the CUDA kernel launch using CUDA runtime API calls, such as cudaDeviceSynchronize, cudaStreamSynchronize, or cudaEventSynchronize, and checking the returned error from those CUDA kernel launch using CUDA runtime API calls or capturing the error using CUDA runtime error capturing API calls, such as cudaGetLastError. However, explicitly synchronization usually affects performance and therefore is not recommended for using in production unless it is extremely necessary.

Sticky VS Non-Sticky Error

CUDA runtime API returns non-sticky error if there is any, whereas CUDA kernel execution resulted in sticky error if there is any.

A non-sticky error is recoverable, meaning subsequent CUDA runtime API calls could behave normally. Therefore, the CUDA context is not corrupted. For example, when we allocate memory using cudaMalloc, it will return a non-sticky error if the GPU memory is insufficient.

A sticky error is not recoverable, meaning subsequent CUDA runtime API calls will always return the same error. Therefore, the CUDA context is corrupted, unless the application host process is terminated. For example, when the kernel tries to access invalid memory address during kernel execution, it will result in a sticky error which will be captured and returned by all the subsequent CUDA runtime API calls.

CUDA Error Checking Best Practice

In a CUDA program implementation, both development and production code, always check the return value of each CUDA runtime synchronous or asynchronous API call to see if there is any CUDA synchronous error, always run CUDA runtime error capturing API calls, such as cudaGetLastError, after kernel launch calls to see if there is any CUDA synchronous error. Check CUDA asynchronous error in development by synchronization and error checking after kernel launch calls and disable it in production.

Quiz

There is a question on the NVIDIA developer forum. Let’s use it as a quiz. Basically, the user has the following code. All calculations are done on the default stream and one thread. The cudaDeviceSynchronize returns cudaSuccess, but the cudaGetLastError call returns an invalid device function error. How would this happen?



res = cudaDeviceSynchronize();


res = cudaGetLastError();

cudaGetLastError returns the last error that has been produced by any of the runtime calls in the same host thread and resets it to cudaSuccess. cudaDeviceSynchronize is a CUDA runtime API call and it got no error. This means the kernel launch got no asynchronous error. However, there could be errors from CUDA runtime API calls prior to launching the kernel or the kernel launching encountered synchronous error which have not been properly error-checked. The last error that produced by those would not be reset until the cudaGetLastError call, even though before the reset there were cudaSuccess from other CUDA runtime API calls.

For example,

last_error.cu

#include <cuda_runtime.h>
#include <iostream>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void check(T err, const char* const func, const char* const file,
           const int line)
{
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        
        
    }
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(const char* const file, const int line)
{
    cudaError_t err{cudaGetLastError()};
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << std::endl;
        
        
    }
}

int main()
{
    float* p;
    
    CHECK_CUDA_ERROR(cudaMalloc(&p, 1000000000000000 * sizeof(float)));
    
    CHECK_CUDA_ERROR(cudaMalloc(&p, 10 * sizeof(float)));
    
    CHECK_CUDA_ERROR(cudaFree(p));
    
    
    
    CHECK_LAST_CUDA_ERROR();
    
    CHECK_LAST_CUDA_ERROR();
}

$ nvcc last_error.cu -o last_error
$ ./last_error 
CUDA Runtime Error at: last_error.cu:37
out of memory cudaMalloc(&p, 1000000000000000 * sizeof(float))
CUDA Runtime Error at: last_error.cu:45
out of memory

Fundamentally, it was due to that the CUDA program error checking was not following the best practice mentioned previously.

References

CUDA Debugging

Источник

Probably the best way to check for errors in runtime API code is to define an assert style handler function and wrapper macro like this:

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

You can then wrap each API call with the gpuErrchk macro, which will process the return status of the API call it wraps, for example:

gpuErrchk( cudaMalloc(&a_d, size*sizeof(int)) );

If there is an error in a call, a textual message describing the error and the file and line in your code where the error occurred will be emitted to stderr and the application will exit. You could conceivably modify gpuAssert to raise an exception rather than call exit() in a more sophisticated application if it were required.

A second related question is how to check for errors in kernel launches, which can’t be directly wrapped in a macro call like standard runtime API calls. For kernels, something like this:

kernel<<<1,1>>>(a);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );

will firstly check for invalid launch argument, then force the host to wait until the kernel stops and checks for an execution error. The synchronisation can be eliminated if you have a subsequent blocking API call like this:

kernel<<<1,1>>>(a_d);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaMemcpy(a_h, a_d, size * sizeof(int), cudaMemcpyDeviceToHost) );

in which case the cudaMemcpy call can return either errors which occurred during the kernel execution or those from the memory copy itself. This can be confusing for the beginner, and I would recommend using explicit synchronisation after a kernel launch during debugging to make it easier to understand where problems might be arising.

Note that when using CUDA Dynamic Parallelism, a very similar methodology can and should be applied to any usage of the CUDA runtime API in device kernels, as well as after any device kernel launches:

#include <assert.h>
#define cdpErrchk(ans) { cdpAssert((ans), __FILE__, __LINE__); }
__device__ void cdpAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      printf("GPU kernel assert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) assert(0);
   }
}

CUDA Fortran error checking is analogous. See here and here for typical function error return syntax. A method similar to CUDA C++ is used to collect errors related to kernel launches.

Источник

Ошибки Видеокарты При Майнинге

Самое полное собрание ошибок в майнинге на Windows, HiveOS и RaveOS и их быстрых и спокойных решений

Can’t find nonce with device CUDA_ERROR_LAUNCH_FAILED

Ошибка майнера Can’t find nonce

Ошибка говорит о том, что майнер не может найти нонс и сразу же сам предлагает решение — уменьшить разгон. Особенно начинающие майнеры стараются выжать из видеокарты максимум — разгоняют слишком сильно по ядру или памяти. В таком разгоне видеокарта даже может запуститься, но потом выдавать ошибки как указано ниже. Помните, лучше — стабильная отправка шар на пул, чем гонка за цифрами в майнере.

Зарабатывай на чужих сделках на бирже BingX. Подробнее — тут.

Phoenixminer Connection to API server failed — что делать?

Ошибка Connection to API server failed

Такая ошибка встречается на PhoenixMiner на операционной систему HiveOS. Она говорит о том, что майнинг-ферма/риг не может подключиться к серверу статистики. Что делать для ее решения:

Введите команду net-test и запомните/запишите сервер с низким пингом. После чего смените его в веб интерфейсе Hive (на воркере) и перезагрузите ваш риг.
Если это не помогло, выполните команду dnscrypt -i && sreboot

Phoenixminer CUDA error in CudaProgram.cu:474 : the launch timed out and was terminated (702)

Ошибка майнера Phoenixminer CUDA error in CudaProgram

Эта ошибка, как и в первом случае, говорит о переразгоне карты. Откатите видеокарту до заводских настроек и постепенно поднимайте разгон до тех пор, пока не будет ошибки.

UNABLE TO ENUM CUDA GPUS: INVALID DEVICE ORDINAL

Ошибка майнера Unable to enum CUDA GPUs: invalid device ordinal

Проверяем драйвера видеокарты и саму видеокарту на работоспособность (как она отмечена в диспетчере устройств, нет ли восклицательных знаков).
Если все ок, то проверяем райзера. Часто бывает, что именно райзер бывает причиной такой ошибки.

UNABLE TO ENUM CUDA GPUS: INSUFFICIENT CUDA DRIVER: 5000

Ошибка майнера Unable to enum CUDA GPUs: Insufficient CUDA driver: 5000

Аналогично предыдущей ошибке — проверяем драйвера видеокарты и саму видеокарту на работоспособность (как она отмечена в диспетчере устройств, нет ли восклицательных знаков).

NBMINER MINING PROGRAM UNEXPECTED EXIT.CODE: -1073740791, REASON: PROCESS CRASHED

Ошибка майнера NBMINER MINING PROGRAM UNEXPECTED EXIT.CODE: -1073740791, REASON: PROCESS CRASHED

Ошибка code 1073740791 nbminer возникает, если ваш риг/майнинг-ферма собраны из солянки Nvidia+AMD. В этом случае разделите майнинг на два .bat файла (или полетника, если вы на HiveOS). Один — с картами AMD, другой с картами Nvidia.

NBMINER CUDA ERROR: OUT OF MEMORY (ERR_NO=2) — как исправить?

Ошибка майнера NBMINER CUDA ERROR: OUT OF MEMORY (ERR_NO=2)

Одна из самых распространённых ошибок на Windows — нехватка памяти, в данном случае на майнере Nbminer, но встречается и в майнере Nicehash. Чтобы ее исправить — надо увеличить файл подкачки. Файл подкачки должен быть равен сумме гб всех видеокарт в риге плюс 10% запаса. Как увеличить файл подкачки — читаем тут.

GMINER ERROR ON GPU: OUT OF MEMORY STOPPED MINING ON GPU0

Ошибка майнера GMINER ERROR ON GPU: OUT OF MEMORY STOPPED MINING ON GPU0

В данном случае скорее всего виноват не файл подкачки, а переразгон по видеокарте, которая идет под номером 0. Сбавьте разгон и ошибка должна пропасть.

Socket error. the remote host closed the connection, в майнере Nbminer

Socket error. the remote host closed the connection

Также может быть описана как «ERROR — Failed to establish connection to mining pool: Socket operation timed out».
Сетевой конфликт — проверьте соединение рига с интернетом. Перегрузите роутер.
Также может быть, что провайдер закрывает соединение с пулом. Смените пул, попробуйте VPN или измените адреса DNS на внешнего провайдера, например cloudflare 1.1.1.1, 1.0.0.1

Server not responded on share, на майнере Gminer

Server not responded on share

Такая ошибка говорит о том, что у вас что-то с подключением к интернету, что критично для Gminer. Попробуйте сделать рестарт роутера и отключить watchdog на майнере.

DAG has been damaged check overclocking settings, в майнере Gminer

Также в этой ошибке может быть указано Device not responding, check overclocking settings.
Ошибка говорит о переразгоне, попробуйте сначала убавить его.
Если это не помогло, смените майнер — Gminer никогда не славился работой с видеокартами AMD. Мы рекомендуем поменять майнер на Teamredminer, а если вам критична поддержка майнером одновременно Nvidia и AMD видеокарт, то используйте Lolminer.
Если смена майнера не поможет, переставьте видеодрайвер.
Если и это не поможет, то нужно тестировать эту карту отдельно в слоте X16.

ERROR: Can’t start T-Rex, failed to initialize device map: can’t get busid, code -6

Ошибки настройки памяти с кодом -6 обычно указывают на проблему с драйвером.

Если у вас Windows, используйте программу DDU (DisplayDriverUninstaller), чтобы полностью удалить все драйверы Nvidia.
Перезагрузите систему.
Установите новый драйвер прямо с сайта Nvidia.
Перезагрузите систему снова.
Если у вас HiveOS/RaveOS — накатите чистый образ системы. Чтобы наверняка.

TREX: Can’t unlock GPU

Полный текст ошибки:
TREX: Can’t unlock GPU [ID=1, GPU #1], error code 15
WARN: Miner is going to shutdown…
WARN: NVML: can’t get fan speed for GPU #1, error code 15
WARN: NVML: can’t get power for GPU #1, error code 15
WARN: NVML: can’t get mem/core clock for GPU #1, error code 17

Решение:

Проверьте все кабельные соединения видеокарты и райзера, особенно кабеля питания.
Если с первый пунктом все ок, попробуйте поменять райзер на точно рабочий.
Если ошибка остается, вставьте видеокарту в разъем х16 напрямую в материнскую плату.

CAN’T START MINER, FAILED TO INITIALIZE DEVIS MAP, CAN’T GET BUSID, CODE -6

Ошибка майнера CAN’T START MINER, FAILED TO INITIALIZE DEVIS MAP, CAN’T GET BUSID, CODE -6

В конкретном случае была проблема в блоке питания, он не держал 3 видеокарты. После замены блока питания ошибка пропала.
Если вы уверены, что ваш мощности вашего блока питания достаточно, попробуйте сменить майнер.

Зарабатывай на чужих сделках на бирже BingX. Подробнее — тут.

ОШИБКА 511 ГРАДУСОВ НА ВИДЕОКАРТА

Ошибка 511 градусов видеокарта

Ошибка 511 говорит о неисправности райзера или питания карты. Проверьте все соединения. Для выявления неисправности рекомендуется запустить систему с одной картой. Протестировать, и затем добавлять по одной карте.

GPU driver error, no temps в HiveOS — что делать?

Вероятнее всего, вы получили эту ошибку, майнив на HiveOS. Причин ее появления может быть несколько — как софтовая, так и аппаратная (например райзер).
Можно попробовать обойтись малой кровью и вбить в HiveOS команду:
hive-replace -y —stable
Система по новой накатит стабильную версию HiveOS.

Если ошибка не уйдет — проверьте райзер.

GPU are lost, rebooting

Это не ошибка, а ее последствие. Что узнать какая ошибка приводит к перезагрузке карт, сделайте следующее:

Включите сохранение логов (по умолчанию они выключены) командой

logs-on

И перезагрузите риг.
После того как ошибка повторится можно будет скачать логи командами ниже.
Вы можете использовать следующую команду, чтобы загрузить логи майнера прямо с панели мониторинга;

message file «miner.log» -f=/var/log/miner/minername/minername.log

Итак, скажем, например, мне нужны логи TeamRedMiner
message file «teamredminer.log» -f=/var/log/miner/teamredminer/teamredminer.log

Отправленная командная строка будет выделена синим цветом. Загружаемый файл будет отображаться белым цветом. Нажав на него, вы сможете его скачать.
Эта команда позволит скачать лог системы

message file «syslog» -f=/var/log/syslog

exitcode=3 в HiveOS

Если ошибка не уйдет — проверьте райзер.

exitcode=1 в HiveOS

Данная ошибка возникает когда есть проблема с датой в биосе материнской платы (сбитое время) и (или) есть проблема с интернетом.
Если сбито время, то удаленно вы не сможете подключиться.
Тем не менее, обновление драйверов Nvidia должно пройти командой:

nvidia-driver-update —list

gpu fault detected 146

Скорее всего вы пытаетесь майнить с помощью Phoenix miner. Решения два:

Откатитесь на более старую версию, например на 5.4с
(Рекомендуемый вариант) Используйте Trex для видеокарт Nvidia и TeamRedMiner для AMD.

Waiting interface to come up — не работает VPN на HiveOS

Waiting interface to come up

Начните с логов, чтобы понять какая именно ошибка вызывает эту проблему.
Команды для получения логов:
systemctl status openvpn@client
journalctl -u openvpn@client -e —no-pager -n 100

Как узнать ip адрес воркера hive os

Самое простое — зайти в воркера и прокрутить страницу ниже видеокарт. Там будет указан Remote IP — это и есть внешний IP.
Альтернативный вариант — вы можете проверить ваш внешний айпи адрес hive через консоль Hive Shell:
Выполните одну из команд:
curl 2ip.ru
wget -qO- eth0.me
wget -qO- ipinfo.io/ip
wget -qO- ipecho.net/plain
wget -qO- icanhazip.com
wget -qO- ipecho.net
wget -qO- ident.me

Repository update failed в HiveOS

Иногда встречается на HiveOS. Полный текст ошибки:

Some index files failed to download. They have been ignored, or old ones used instead.
Repository update failed
------------------------------------------------------
> Restarting autofan and watchdog
> Starting miners
Miner screen is already running
Run miner or screen -r to resume screen
Upgrade failed

Решение:

Выполнить команду apt update && selfupgrade -f
Если не сработала и она, то 99.9%, что разработчики HiveOS уже знают об этой проблеме и решают ее. Попробуйте выполнить обновление через некоторое время.

Rave os не запускается. Boot aborted Rave os

Перепроверьте все настройки ПК и БИОСа материнской платы:
— Установите загрузочное устройство HDD/SSD/M2/USB в зависимости от носителя с ОС.
— Включите 4G decoding.
— Установите поддержку PCIe на Auto.
— Включите встроенную графику.
— Установите предпочтительный режим загрузки Legacy mode.
— Отключите виртуализацию.

Если после данных настроек не определяется часть карт, то выполните следующие настройки в BIOS (после каждого пункта требуется полная перезагрузка):

— Отключите 4G decoding
— Перезагрузка
— Отключите CSM
— Перезагрузка
— Включите 4G decoding, установите PCI-E Gen2/3, а при отсутствии Gen2/3, можно выбрать Gen1

Failed to allocate memory Raveos

Эта же ошибка может называться как:
failed to allocate initramfs memory bailing out, failed to load idlinux c.32
или
failed to allocate memory for kernel boot parameter block
или
failed to allocate initramfs memory raveos bailing

Но решение у нее одно — вы должны правильно настроить БИОС материнской платы.

gpu_driver_fault, GPU #0 fault в RaveOS

gpu_driver_fault, GPU #0 fault в RaveOS

В большинстве случаев эта проблема решается уменьшением разгона (особенно по памяти) на конкретной видеокарте (на скрине это карта номер 0).
Если уменьшение разгона не помогает, то попробуйте обновить драйвера.
Если обновление драйверов не привело к решению проблемы, то попробуйте поменять райзер на этой карте на точно работающий.
Если и это не помогает, перепроверьте все кабельные соединения и мощность блока питания, хватает ли его для вашей конфигурации.

Gpu driver fault. All tasks have been stopped. Worker will be rebooted after 5 minutes в RaveOS

Gpu driver fault. All tasks have been stopped. Worker will be rebooted after 5 minutes

Что приводит к появлению этой ошибки? Вероятно, вы переразогнали видеокарту (часто сильно гонят по памяти), сбавьте разгон. На скрине видно, что проблему дает именно GPU под номером 1 — начните с нее.
Вторая частая причина — нехватка питания БП на систему с видеокартами. Учтите, что сама система потребляет не менее 100 вт, каждый райзер еще закладывайте 50 вт. БП должно хватать с запасом в 20%.

Miner restarted after error RaveOS

Смотрите логи майнера, там будет указана конкретная ошибка, которая приводит к miner restarted. После этого найдите ее на этой странице и исправьте. Проблема уйдет.

Miner restart limit reached. Worker rebooting by flag auto в RaveOS

Аналогично предыдущему пункту — смотрите логи майнера, там будет указана конкретная ошибка, которая приводит к рестарту воркера. Пофиксите ту ошибку — уйдет и эта проблема.

Miner cannot be started, ОС RaveOS

Непосредственно перед этой ошибкой обычно пишется еще другая, которая и вызывает эту проблему. Но если ничего нет, то:

Поставьте майнер на паузу, перезагрузите риг и в консоли выполните команды clear-miners clear-logs и fix-fs. Запустите майнинг.
Если ошибка не ушла, перепишите образ RaveOS.

Overclock can’t be applied в RaveOS

Эта ошибка означает, что значения разгона между собой конфликтуют или выходят за пределы допустимых. Перепроверьте их. Скиньте разгон на стоковый и попробуйте еще раз.
В редких случаях причиной этой ошибки также становится райзер.

Error installing hive miners

Можно попробовать обойтись малой кровью и вбить в HiveOS команду:
hive-replace -y —stable
Система по новой накатит стабильную версию HiveOS.

Если ошибка не уйдет — физически перезапишите образ. Если у вас флешка, то скорее всего она умерла. Купите SSD.

Warning: Nvidia settings applied with errors

Переразгон. Снизьте значения частот ядра и памяти. После этого перезагрузите риг.

Nvtool error или Danger: nvtool error

Скорее всего при установке драйвера появилась проблема с модулем nvtool
Попробуйте переустановить драйвер Nvidia командой через Hive shell:
nvidia-driver-update версия_драйвера —force
Или попробуйте обновить систему полностью командой из Hive shell:
hive-replace -y —stable

nvtool error

Перестал отображаться кулер видеокарты HiveOS

0% скорости вращения кулера.
Это может произойти по нескольким причинам:

кулер действительно не крутится
датчик оборотов отключен или сломан
видеокарта слишком агрессивно работает (высокий разгон)
неисправен райзер или одно из его частей

ERROR: parsing JSON failed

Необходимо выполнить на риге локально (с клавиатурой и монитором) следующую команду:
net-test

Данная команда покажет ваше текущее состояние подключения к разным зеркалам API серверов HiveOS.
Посмотрите, к какому API у вас наименьшая задержка (ping), и когда воркер снова появится в панели, измените стандартное зеркало на то, что ближе к вам.
После смены зеркала, в обязательном порядке перезагрузите ваш воркер.
Изменить сервер API вы можете командой nano /hive-config/rig.conf
После смены нажмите ctrl + o и ентер для того чтобы сохранить файл.
После этого выйдите в консоль командой ctrl + x, f10 и выполните команду hello

NVML: can’t get fan speed for GPU #5, error code 999 hive os

Проблема с скоростью кулеров на GPU 5
0% скорости вращения кулера / ошибки в целом
Это может произойти по нескольким причинам:
— кулер действительно не крутится
— датчик оборотов отключен или сломан
— видеокарта слишком агрессивно работает (высокий разгон)
Начните с визуальной проверки карты и ее кулера.

Can’t get power for GPU #2

Как правило эта ошибка встречается рядом вместе с другими:
Attribute ‘GPUGraphicsClockOffset’ was already set to 0
Attribute ‘GPUMemoryTransferRateOffset’ was already set to 2200
Attribute ‘GPUFanControlState’ (hive1660s_ETH:0[gpu:2]) assigned value
0.

20211029 12:40:50 WARN: NVML: can’t get fan speed for GPU #2, error code 999
20211029 12:40:50 WARN: NVML: can’t get power for GPU #2, error code 999
20211029 12:40:50 WARN: NVML: can’t get mem/core clock for GPU #2, error code 999

Решение:

Проверьте корректность установки драйвера на видеокарте.
Убедитесь что нет проблем с драйвером, если все в порядке, то попробуйте другой параметр разгона. Например уменьшить разгон по памяти.

GPU1 search error: unspecified launch failure

Уменьшите разгон и проверьте контакты райзера

Warning: Autofan: unable to set fan speed, rebooting

Найдите логи майнера, посмотрите какие ошибки майнер пишет в логах. Например:

kernel: [12112.410046][ T7358] NVRM: GPU at PCI:0000:0c:00: GPU-236e3bef-2e03-6cdb-0518-7ac01eb8736d
kernel: [12112.410049][ T7358] NVRM: Xid (PCI:0000:0c:00): 62, pid=7317, 0000(0000) 00000000 00000000
kernel: [12112.433831][ T7358] NVRM: Xid (PCI:0000:0c:00): 45, pid=7317, Ch 00000010
CRON[21094]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)

Исходя из логов, мы видим что есть проблема с видеокартой на слоте PCIE 0c:00 (под номером Gpu пишется номер PCIE слота) с ошибками 45 и 62
Коды ошибок (других, которые также могут быть там) и что с ними делать:

• 13, 43, 45: ошибки памяти, снизить MEM
• 8, 31, 32, 61, 62: снизить CORE, возможно и MEM
• 79: снизить CORE, проверить райзер

Ошибка Kernel-Power код 41

Проверьте все провода (от БП до карт, от БП до райзеров), возможно где-то идёт оплавление. Если визуальный осмотр показал, что все ок, то ошибка программная и вам нужно переустановить Windows.

Danger: hive-replace -y —stable (failed, exitcode=137)

Очень редкая ошибка, которая вылезла в момент удаленного обновления образа HiveOS. Она не встречается в тематических майнинг группах и сайтах. Не поверите что произошло.
На балконе, где стоял риг, поселилась семья голубей. Они засрали риг, в прямом смысле, из-за этого он постоянно уходил в оффлайн. После полной продувки материнской платы и видеокарт проблема решилась сама.

MALFUNCTION HIVEOS

Malfunction — неисправность. Причин и решений может быть несколько:

Вам следует переустановить видео драйвер;
Если драйвер не помог, тогда отключайте все GPU и поочередно вставляйте по 1 шт, и смотрите вызовет ли какая-то видеокарта подобную ошибку или нет. Если да, то возможно это райзер.
Неисправен носитель, на который записана Hive OS, запишите образ еще раз.

Не нашли своей ошибки? Помогите сделать мир майнинга лучше. Отправьте ее по этой форме и мы обновим наш гайд в самое ближайшее время.

Источник

Debugging with Print Statements

Part 1: Error Checking Helpers

CUDA Error Checking Macros

CUDA Error Checking Functions with Added Functionality

Part 2: Using the Error Checking Code Above

Use of Error Checking Macros

Use of Error Checking Functions

Conclusion

Additional Information

Introduction

CUDA Error Types

Synchronous Error VS Asynchronous Error

Sticky VS Non-Sticky Error

CUDA Error Checking Best Practice

Quiz

References

Ошибки Видеокарты При Майнинге

UNABLE TO ENUM CUDA GPUS: INVALID DEVICE ORDINAL

UNABLE TO ENUM CUDA GPUS: INSUFFICIENT CUDA DRIVER: 5000

NBMINER MINING PROGRAM UNEXPECTED EXIT.CODE: -1073740791, REASON: PROCESS CRASHED

NBMINER CUDA ERROR: OUT OF MEMORY (ERR_NO=2) — как исправить?

GMINER ERROR ON GPU: OUT OF MEMORY STOPPED MINING ON GPU0

Socket error. the remote host closed the connection, в майнере Nbminer

Server not responded on share, на майнере Gminer

DAG has been damaged check overclocking settings, в майнере Gminer

ERROR: Can’t start T-Rex, failed to initialize device map: can’t get busid, code -6

TREX: Can’t unlock GPU

CAN’T START MINER, FAILED TO INITIALIZE DEVIS MAP, CAN’T GET BUSID, CODE -6

ОШИБКА 511 ГРАДУСОВ НА ВИДЕОКАРТА

GPU driver error, no temps в HiveOS — что делать?

GPU are lost, rebooting

exitcode=3 в HiveOS

exitcode=1 в HiveOS

gpu fault detected 146

Waiting interface to come up — не работает VPN на HiveOS

Как узнать ip адрес воркера hive os

Repository update failed в HiveOS

Rave os не запускается. Boot aborted Rave os

Failed to allocate memory Raveos

gpu_driver_fault, GPU #0 fault в RaveOS

Gpu driver fault. All tasks have been stopped. Worker will be rebooted after 5 minutes в RaveOS

Miner restarted after error RaveOS

Miner restart limit reached. Worker rebooting by flag auto в RaveOS

Miner cannot be started, ОС RaveOS

Overclock can’t be applied в RaveOS

Error installing hive miners

Warning: Nvidia settings applied with errors

Nvtool error или Danger: nvtool error

Перестал отображаться кулер видеокарты HiveOS

ERROR: parsing JSON failed

NVML: can’t get fan speed for GPU #5, error code 999 hive os

Can’t get power for GPU #2

GPU1 search error: unspecified launch failure

Warning: Autofan: unable to set fan speed, rebooting

Ошибка Kernel-Power код 41

Danger: hive-replace -y —stable (failed, exitcode=137)

MALFUNCTION HIVEOS

Читайте также: