#define _BSD_SOURCE // for endian.h #include #include #include #include #include #include #include #include #include #include #include #include #include // how stupid is that, the 1.2 header files define CL_VERSION_1_1, // but then fail to define the api functions unless you ALSO define // this. This breaks 100% of the opencl 1.1 apps, for what reason? // after all, the functions are deprecated, not removed. // in addition, you cannot test for this in any future-proof way. // each time a new opencl version comes out, you need to make a new // release. #define CL_USE_DEPRECATED_OPENCL_1_1_APIS #ifdef __APPLE__ #define CLHDR(name) #else #define CLHDR(name) #endif #include CLHDR(opencl.h) #undef NDEBUG #include static cl_int res; #define FAIL(name) \ do { fprintf (stderr, "cl" # name ": error %d", res); exit (1); } while (0) #define NEED_SUCCESS(name,args) \ do { \ res = cl ## name args; \ \ if (res) \ FAIL (name); \ } while (0) #define NEED_SUCCESS_ARG(retdecl, name, args) \ retdecl = cl ## name args; \ if (res) \ FAIL (name); typedef uint32_t off_type; static cl_platform_id platform; static cl_device_id device; static cl_context context; static cl_command_queue queue; #define LISTEN_PORT 3843 static off_type capacity = 768 * 1024 * 1024; static cl_mem disk; static struct nbd_request req; static struct nbd_reply reply; static char *buf; static uint32_t bufsize; static int fd; #ifndef MAP_LOCKED # define MAP_LOCKED 0 #endif static void xread (void *data, int len) { int n = 0; while (len > n) { int r = read (fd, n + (char *)data, len - n); assert (r > 0); n += r; } } static void xwrite (const void *data, int len) { int n = 0; while (len > n) { int r = write (fd, n + (char *)data, len - n); assert (r > 0); n += r; } } static buf_alloc (uint32_t size) { if (bufsize >= size) return; if (buf) munmap (buf, bufsize); bufsize = (size + 4095) & ~4095; // too lazy to query PAGE_SIZE buf = mmap (0, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0); if (!MAP_LOCKED) mlock (buf, bufsize); } static void oclread (off_type off, uint32_t len, void *buf) { NEED_SUCCESS (EnqueueReadBuffer, (queue, disk, 1, off, len, buf, 0, 0, 0)); } static void oclwrite (off_type off, uint32_t len, void *buf) { NEED_SUCCESS (EnqueueWriteBuffer, (queue, disk, 0, off, len, buf, 0, 0, 0)); } static void init (void) { NEED_SUCCESS_ARG (disk, CreateBuffer, (context, 0, capacity, 0, &res)); const int block = 64 * 1024; off_type o; buf_alloc (block); for (o = 0; o < capacity; o += block) { oclwrite (o, block, buf); oclread (o, block, buf); } static struct sockaddr_in sa; sa.sin_family = AF_INET; sa.sin_port = htons (LISTEN_PORT); sa.sin_addr.s_addr = htonl (0x7f000001); int listener = socket (AF_INET, SOCK_STREAM, 0); bind (listener, (void *)&sa, sizeof (sa)); listen (listener, 1); printf ("waiting for connect on port %d...", LISTEN_PORT); fflush (stdout); fd = accept (listener, 0, 0); assert (fd >= 0); printf (" ok\n"); close (listener); { xwrite ("NBDMAGIC", 8); xwrite ("\x00\x00\x42\x02\x81\x86\x12\x53", 8); uint64_t cap = htobe64 (capacity); xwrite (&cap, 8); int i; cap = 0; for (i = 0; i < 128 / 8; ++i) xwrite (&cap, 8); } } static void run (void) { for (;;) { xread (&req, sizeof (req)); assert (req.magic == htonl (NBD_REQUEST_MAGIC)); int type = ntohl (req.type); off_type off = be64toh (req.from); uint32_t len = ntohl (req.len); reply.magic = htonl (NBD_REPLY_MAGIC); memcpy (reply.handle, req.handle, sizeof (reply.handle)); reply.error = 0; //printf ("type %d off %d len %d\n", type, (int)off, (int)len);//D if (off + len > capacity) reply.error = htonl (1); else if (type == NBD_CMD_READ) { buf_alloc (len); clFinish (queue); oclread (off, len, buf); xwrite (&reply, sizeof (reply)); xwrite (buf, len); } else if (type == NBD_CMD_WRITE) { clFinish (queue); buf_alloc (len); xread (buf, len); xwrite (&reply, sizeof (reply)); oclwrite (off, len, buf); } else abort (); } } int main (int argc, char **argv) { cl_platform_id *list; cl_uint count; int i; // we would really like to MCL_FUTURE, as being swapped out could turn // out to be fatal but we don't, as at least nvidia's opencl results // in a 1.5gb allocation (on my 1gb card) instantly. //mlockall (MCL_CURRENT | MCL_FUTURE); mlockall (MCL_CURRENT); NEED_SUCCESS (GetPlatformIDs, (0, 0, &count)); list = malloc (sizeof (*list) * count); NEED_SUCCESS (GetPlatformIDs, (count, list, 0)); for (i = 0; i < count; ++i) { platform = list [i]; cl_device_id *list; cl_uint count; int i; NEED_SUCCESS (GetDeviceIDs, (platform, CL_DEVICE_TYPE_GPU, 0, 0, &count)); list = malloc (sizeof (*list) * count); NEED_SUCCESS (GetDeviceIDs, (platform, CL_DEVICE_TYPE_GPU, count, list, 0)); for (i = 0; i < count; ++i) { device = list [i]; NEED_SUCCESS_ARG (context, CreateContext, (0, 1, &device, 0, 0, &res)); NEED_SUCCESS_ARG (queue, CreateCommandQueue, (context, device, 0, &res)); init (); run (); exit (0); } free (list); } fprintf (stderr, "unable to create suitable opencl context\n"); return 1; }