/*
 *     Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 */

#include "openacc.h"

#ifndef _DEF_ACC_PROF
#define _DEF_ACC_PROF

/*
 * LLONG is the integer type used for the async argument of the
 * __pgi_acc_get_* prototypes in this header.  It must be 64 bits wide on a
 * 64-bit system.  A definition made before this point is left alone.
 */
#if !defined(LLONG)
#if !defined(TARGET_WIN_X8664)
#define LLONG long
#else
#define LLONG long long
#endif
#endif

/*
 * On Windows targets ssize_t is mapped onto long, presumably because the
 * toolchain does not provide the type itself.
 * NOTE(review): long is 32 bits under the Win64 (LLP64) data model, which
 * does not match this header's statement that (s)size_t is 64 bits on a
 * 64-bit OS -- confirm against the runtime's ABI before changing it.
 */
#if defined(_WIN64) || defined(TARGET_WIN)
#define ssize_t long
#endif

/*
 * event types that can be profiled
 */
typedef enum acc_event_t {
  /* zero is reserved, so an unset event type stays undefined */
  acc_ev_none = 0,

  /* initialize a device */
  acc_ev_device_init_start = 1,
  acc_ev_device_init_end = 2,

  /* shutdown a device */
  acc_ev_device_shutdown_start = 3,
  acc_ev_device_shutdown_end = 4,

  /* program finished */
  acc_ev_runtime_shutdown = 5,

  /* create / delete data in device memory */
  acc_ev_create = 6,
  acc_ev_delete = 7,

  /* allocate / free physical memory in device memory */
  acc_ev_alloc = 8,
  acc_ev_free = 9,

  /* enter data directive or API call */
  acc_ev_enter_data_start = 10,
  acc_ev_enter_data_end = 11,

  /* exit data directive or API call */
  acc_ev_exit_data_start = 12,
  acc_ev_exit_data_end = 13,

  /* update directive */
  acc_ev_update_start = 14,
  acc_ev_update_end = 15,

  /* kernels or parallel construct */
  acc_ev_compute_construct_start = 16,
  acc_ev_compute_construct_end = 17,

  /* launch a kernel or computation */
  acc_ev_enqueue_launch_start = 18,
  acc_ev_enqueue_launch_end = 19,

  /* upload data to device */
  acc_ev_enqueue_upload_start = 20,
  acc_ev_enqueue_upload_end = 21,

  /* download data from device */
  acc_ev_enqueue_download_start = 22,
  acc_ev_enqueue_download_end = 23,

  /* wait for device queue or queues */
  acc_ev_wait_start = 24,
  acc_ev_wait_end = 25,

  /* one past the highest event number defined above */
  acc_ev_last = 26
} acc_event_t;

/*
 * supported device runtime API interfaces
 */
typedef enum acc_device_api {
  acc_device_api_none = 0,
  acc_device_api_cuda = 1,        /* for NVIDIA CUDA device */
  acc_device_api_cuda_native = 1, /* same value as acc_device_api_cuda */
  acc_device_api_opencl = 2,      /* for an OpenCL device */
  /*
   * Declared so that the acc_devapi_coi alias below expands to a real
   * enumerator.  3 is the value left unused between opencl and other, and
   * it follows the enumerator order of the OpenACC profiling interface.
   */
  acc_device_api_coi = 3,
  acc_device_api_other = 4,     /* other device type */
  acc_device_api_native = 1008, /* this device is the native core */
  /*
   * NOTE(review): presumably the first value of the implementation-defined
   * range (acc_device_api_native, 1008, lies above it) -- confirm.
   */
  acc_device_api_implementation_defined = 1000
} acc_device_api;

/*
 * Short acc_devapi_* spellings: a typedef for the enumeration type and
 * macro aliases for selected enumerators.
 */
typedef enum acc_device_api acc_devapi_t;
#define acc_devapi_none acc_device_api_none
#define acc_devapi_native acc_device_api_native
#define acc_devapi_cuda acc_device_api_cuda
#define acc_devapi_cuda_native acc_device_api_cuda_native
#define acc_devapi_opencl acc_device_api_opencl
#define acc_devapi_coi acc_device_api_coi

/*
 * construct types
 */
typedef enum acc_construct_t {
  /* values 0..16 are assigned consecutively */
  acc_construct_parallel = 0,     /* parallel */
  acc_construct_kernels = 1,      /* kernels */
  acc_construct_loop = 2,         /* loop */
  acc_construct_data = 3,         /* data */
  acc_construct_enter_data = 4,   /* enter data */
  acc_construct_exit_data = 5,    /* exit data */
  acc_construct_host_data = 6,    /* host_data */
  acc_construct_atomic = 7,       /* atomic */
  acc_construct_declare = 8,      /* declare */
  acc_construct_init = 9,         /* init */
  acc_construct_shutdown = 10,    /* shutdown */
  acc_construct_set = 11,         /* set */
  acc_construct_update = 12,      /* update */
  acc_construct_routine = 13,     /* routine */
  acc_construct_wait = 14,        /* wait */
  acc_construct_runtime_api = 15, /* runtime API */
  acc_construct_serial = 16,      /* serial */

  /* set well apart from the consecutive values above */
  acc_construct_unknown = 9999
} acc_construct_t;

/*
 * (s)size_t is 64-bits for 64-bit OS (pointers are 64 bits)
 * (s)size_t is 32-bits for 32-bit OS (pointers are 32 bits)
 * int is 32-bits for 64-bit and 32-bit OS
 */

/*
 * General information, the first argument
 */
typedef struct acc_prof_info {
  /* event being reported (acc_event_t enumeration) */
  acc_event_t event_type;
  /* how many valid bytes in this struct argument */
  int valid_bytes;
  /* version ID */
  int version;
  /* the OpenACC device type */
  acc_device_t device_type;
  /* the device number */
  int device_number;
  /* unique for each host thread */
  int thread_id;
  /* value in async(x) */
  ssize_t async;
  /* runtime queue number; may be the same as async */
  ssize_t async_queue;
  /* source file name; may be NULL */
  const char *src_file;
  /* function name; may be NULL */
  const char *func_name;
  /* start line number of construct, may be zero */
  int line_no;
  /* end line number of construct, may be zero */
  int end_line_no;
  /* start line number of function, may be zero */
  int func_line_no;
  /* end line number of function, may be zero */
  int func_end_line_no;
} acc_prof_info;

typedef struct acc_data_event_info {
  /* for enqueue_upload/enqueue_download/create/delete/alloc/free callbacks */
  /*
   * The members from event_type through tool_info are declared with the
   * same types and in the same order as in acc_launch_event_info and
   * acc_other_event_info.
   */
  acc_event_t event_type;
  int valid_bytes; /* in bytes */
  acc_construct_t parent_construct;
  /* NOTE(review): presumably nonzero for implicitly generated events --
   * confirm */
  int implicit;
  /* NOTE(review): purpose is not shown in this header -- confirm */
  void *tool_info;
  const char *var_name; /* if known, else NULL; not static */
  /* NOTE(review): presumably the size of the data involved -- confirm */
  size_t bytes;
  const void *host_ptr;
  const void *device_ptr;
  size_t transfers; /* PGI extension */
} acc_data_event_info;

typedef struct acc_launch_event_info {
  /*
   * Event information for the enqueue_launch callbacks.  The members from
   * event_type through tool_info are declared with the same types and in
   * the same order as in acc_data_event_info and acc_other_event_info.
   */
  acc_event_t event_type;
  /* in bytes */
  int valid_bytes;
  acc_construct_t parent_construct;
  int implicit;
  void *tool_info;
  /* if known, else NULL; not static */
  const char *kernel_name;
  /* launch dimensions; each may be zero */
  size_t num_gangs;
  size_t num_workers;
  size_t vector_length;
  /* PGI extensions */
  size_t grid[3];
  size_t block[3];
  size_t smem;
  int flags;
} acc_launch_event_info;

typedef struct acc_other_event_info {
  /* for any event */
  /*
   * Holds only the five members that acc_data_event_info and
   * acc_launch_event_info also declare first, in the same order.
   */
  acc_event_t event_type;
  int valid_bytes; /* in bytes */
  acc_construct_t parent_construct;
  /* NOTE(review): presumably nonzero for implicitly generated events --
   * confirm */
  int implicit;
  /* NOTE(review): purpose is not shown in this header -- confirm */
  void *tool_info;
} acc_other_event_info;

/*
 * Event-specific information passed to a callback.  Every struct member
 * starts with an acc_event_t event_type, matching the bare event_type
 * member, so presumably event_type is read first to decide which of the
 * struct views applies -- confirm against the runtime.
 */
typedef union acc_event_info {
  acc_event_t event_type;
  acc_data_event_info data_event;     /* data event view */
  acc_launch_event_info launch_event; /* launch event view */
  acc_other_event_info other_event;   /* view for any event */
} acc_event_info;

/* Bit mask in the flags: */
/* set if this is an event generated by an API call, not by a directive */
/* NOTE(review): the only member named `flags` declared in this header is
 * acc_launch_event_info.flags -- confirm that is the field this bit applies
 * to. */
#define EVENT_API 0x02

/*
 * Device- and API-specific information, the third argument of a callback.
 * Vendor information may be any value provided by the vendor,
 * such as 0x50474900, which is the string 'PGI\0' encoded in ASCII.
 * NOTE(review): ACC_VENDOR_PGI, defined near the end of this header, packs
 * 'P' into the low-order byte and therefore evaluates to 0x00494750, not
 * 0x50474900 -- confirm which encoding consumers of `vendor` expect.
 */
typedef struct acc_api_info {
  acc_device_api device_api;  /* the device API: CUDA, OpenCL, native, other */
  int valid_bytes;            /* how many valid bytes in this struct argument */
  acc_device_t device_type;   /* the OpenACC device type */
  int vendor;                 /* vendor information */
  const void *device_handle;  /* api-specific device handle */
  const void *context_handle; /* api-specific context handle */
  const void *async_handle;   /* api-specific async queue handle */
  void *event_handle;         /* PGI extension: api-specific event handle */
} acc_api_info;

/*
 * EXTERN prefixes the function declarations in this header; under a C++
 * compiler it also requests C linkage.
 */
#if !defined(__cplusplus)
#define EXTERN extern
#else
#define EXTERN extern "C"
#endif

/*
 * The prototype for a callback routine.
 * The arguments are, in order: the general information (acc_prof_info),
 * the event information (acc_event_info), and the device- and API-specific
 * information (acc_api_info).
 */
typedef void (*acc_prof_callback)(acc_prof_info *, acc_event_info *,
                                  acc_api_info *);

/* alternative name for the same callback pointer type */
typedef acc_prof_callback acc_prof_callback_t;

/*
 * register / toggle
 */
typedef enum acc_register_t {
  /*
   * Third argument of acc_prof_register / acc_prof_unregister.
   * NOTE(review): the names suggest plain registration versus toggling of
   * callbacks (globally or per thread) -- confirm against the OpenACC
   * profiling interface specification.
   */
  acc_reg = 0,              /* register */
  acc_toggle = 1,           /* toggle */
  acc_toggle_per_thread = 2 /* toggle, per thread */
} acc_register_t;

/*
 * Register event callback.
 * Takes the event type, the callback routine, and an acc_register_t value.
 */
EXTERN void acc_prof_register(acc_event_t eventtype, acc_prof_callback cb,
                              acc_register_t);
/* pointer to a routine with the same signature as acc_prof_register */
typedef void (*acc_prof_reg)(acc_event_t eventtype, acc_prof_callback cb,
                             acc_register_t);
/* alternative name for acc_prof_reg */
typedef acc_prof_reg acc_prof_register_t;
/* generic function pointer, the result type of a lookup routine */
typedef void (*acc_prof_fn)(void);
/* lookup routine: takes a name and returns an acc_prof_fn */
typedef acc_prof_fn (*acc_prof_lookup_func)(const char *name);
/* legacy definition, for existing code that used the
 * old, incorrect PGI typedef name */
typedef acc_prof_lookup_func acc_prof_lookup;

/*
 * Unregister event callback.
 * Takes the same three arguments as acc_prof_register.
 */
EXTERN void acc_prof_unregister(acc_event_t eventtype, acc_prof_callback cb,
                                acc_register_t);

/*
 * Optional registration routine for use in shared objects.
 * It receives a register routine, an unregister routine (both of type
 * acc_prof_reg) and a lookup routine (acc_prof_lookup_func).
 * NOTE(review): presumably defined by the profiling library and called by
 * the runtime -- confirm; this header only declares it.
 */
EXTERN void acc_register_library(acc_prof_reg acc_register,
                                 acc_prof_reg acc_unregister,
                                 acc_prof_lookup_func lookup);

/*
 * When registering callbacks for profiler events, the argument
 * is a pointer to an array of struct with three fields per element:
 * - profiler event type
 * - callback routine to call at the start of the event
 * - callback routine to call at the end of the event
 * end the array with eventtype of zero (acc_ev_none)
 */

/*
 * Routines to get runtime information from the openacc library
 */

/*
 * Return the maximum distinct async value used.
 * A library may use this to size an array, one entry per 'async' queue.
 */
EXTERN int __pgi_acc_get_async_count(void);

/*
 * Return the CUDA Context used by a specific devnum.
 * The handle is returned as an untyped pointer.
 */
EXTERN void *__pgi_acc_get_cuda_context(int devnum);

/*
 * Return the CUDA Stream used by a specific devnum and async value.
 * The handle is returned as an untyped pointer.
 */
EXTERN void *__pgi_acc_get_cuda_stream(int devnum, LLONG async);

/*
 * Return the OpenCL device handle for a devnum.
 * The handle is returned as an untyped pointer.
 */
EXTERN void *__pgi_acc_get_opencl_device(int devnum);

/*
 * Return the OpenCL context handle for a devnum.
 * The handle is returned as an untyped pointer.
 */
EXTERN void *__pgi_acc_get_opencl_context(int devnum);

/*
 * Return the OpenCL command queue handle for a devnum and async value.
 * The handle is returned as an untyped pointer.
 */
EXTERN void *__pgi_acc_get_opencl_commandqueue(int devnum, LLONG async);

/* Reduce a character code to its low eight bits. */
#define ACC_VENDOR_(a) ((unsigned char)((a) & 0xFF))

/*
 * Pack four character codes into one unsigned int: a occupies bits 0-7,
 * b bits 8-15, c bits 16-23 and d bits 24-31.
 */
#define ACC_VENDOR(a, b, c, d)                 \
  (((unsigned int)ACC_VENDOR_(a)) |            \
   (((unsigned int)ACC_VENDOR_(b)) << 8) |     \
   (((unsigned int)ACC_VENDOR_(c)) << 16) |    \
   (((unsigned int)ACC_VENDOR_(d)) << 24))

/* Extract the individual characters again, lowest byte first. */
#define ACC_VENDOR_A(v) ((char)((v) & 0xFF))
#define ACC_VENDOR_B(v) ((char)(((v) >> 8) & 0xFF))
#define ACC_VENDOR_C(v) ((char)(((v) >> 16) & 0xFF))
#define ACC_VENDOR_D(v) ((char)(((v) >> 24) & 0xFF))

/* 'P', 'G', 'I', 0 packed as above, i.e. 0x00494750 */
#define ACC_VENDOR_PGI ACC_VENDOR('P', 'G', 'I', 0)
#endif
