//  -----------------------------------------------------------------------------------------
//    Extended x264 Output (GUI) Ex  v1.xx/2.xx by rigaya
//  -----------------------------------------------------------------------------------------
//   About this source code
//   - It is provided without any warranty.
//   - rigaya accepts no responsibility for any damage or trouble caused by using this source code.
//   If you agree to the above, you may use, copy, modify and redistribute this source code.
//  -----------------------------------------------------------------------------------------

#include <Windows.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include "auo.h"
#include "auo_util.h"
#include "auo_video.h"
#include "auo_frm.h"
#include "convert.h"

//Select the 16bit->8bit audio conversion routine.
//  split : zero selects column 0 (convert_audio_16to8*), non-zero selects
//          column 1 (split_audio_16to8x2*).
//Returns the entry of the most capable row the CPU reports support for
//(AVX2 row when available and compiled in, otherwise SSE2, otherwise plain C).
func_audio_16to8 get_audio_16to8_func(BOOL split) {
	//Row 0 = plain C, row 1 = SSE2, row 2 = AVX2 (row 2 exists only for _MSC_VER >= 1700).
	static const func_audio_16to8 FUNC_CONVERT_AUDIO[][2] = {
		{ convert_audio_16to8,      split_audio_16to8x2      },
		{ convert_audio_16to8_sse2, split_audio_16to8x2_sse2 },
#if (_MSC_VER >= 1700)
		{ convert_audio_16to8_avx2, split_audio_16to8x2_avx2 },
#endif
	};
	int simd = 0;
#if (_MSC_VER >= 1700)
	if (check_avx2())
		simd = 2;
#endif
	//Normalise to 0/1 exactly like check_avx2() and split are normalised: the value
	//is used as an array index, so any other truthy value would pick the wrong row
	//or read past the end of the table.
	if (simd == 0)
		simd = !!check_sse2();
	return FUNC_CONVERT_AUDIO[simd][!!split];
}

//Progressive/interlaced selector stored in each conversion table entry.
enum eInterlace {
	A = -1, //any: the entry does not need to distinguish progressive from interlaced
	P = 0,  //for progressive
	I = 1   //for interlaced
};

//One row of the conversion table: the conditions an entry requires plus the
//function to use when they are all met.
typedef struct {
	int        input_from_aviutl; //input format taken from AviUtl (CF_xxx)
	int        output_csp;        //output colorspace (OUT_CSP_xxx)
	BOOL       for_16bit;         //whether the function is for 16bit output
	eInterlace for_interlaced;    //whether the function is for interlaced input (A = either)
	DWORD      mod;               //width restriction: width must be divisible by this value
	DWORD      SIMD;              //SIMD instruction sets the function requires
	func_convert_frame func;      //pointer to the conversion function
} COVERT_FUNC_INFO;

//Short aliases for the AUO_SIMD_xxx flags, used to keep the table rows compact
#define NONE  AUO_SIMD_NONE
#define SSE2  AUO_SIMD_SSE2
#define SSE3  AUO_SIMD_SSE3
#define SSSE3 AUO_SIMD_SSSE3
#define SSE41 AUO_SIMD_SSE41
#define SSE42 AUO_SIMD_SSE42
#define AVX   AUO_SIMD_AVX
#define AVX2  AUO_SIMD_AVX2

static const BOOL BIT_8 = 0;  //for 8bit
static const BOOL BIT16 = 1;  //for 16bit

//Table of conversion functions.
//The table is scanned from the top and the first matching row is used, so rows with
//stricter requirements (the faster functions) must come before more general ones.
//Columns: { input format, output colorspace, bit depth, interlace, width modulus, required SIMD, function }
//The row whose function pointer is NULL terminates the table.
//NOTE(review): the CF_LW48 rows that require AVX are not wrapped in the _MSC_VER >= 1600
//guard used for the other AVX rows - confirm whether that is intentional.
static const COVERT_FUNC_INFO FUNC_TABLE[] = {
	//pass YUY2 through as-is
	{ CF_YUY2, OUT_CSP_YUY2,   BIT_8, A,  1,  SSE2,                  copy_yuy2_sse2 },
	{ CF_YUY2, OUT_CSP_YUY2,   BIT_8, A,  1,  NONE,                  copy_yuy2 },
	//YUY2 -> nv16(8bit)
#if (_MSC_VER >= 1700)
	{ CF_YUY2, OUT_CSP_NV16,   BIT_8, A,  1,  AVX2|AVX,             convert_yuy2_to_nv16_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YUY2, OUT_CSP_NV16,   BIT_8, A,  1,  AVX|SSE2,             convert_yuy2_to_nv16_avx },
#endif
	{ CF_YUY2, OUT_CSP_NV16,   BIT_8, A,  1,  SSE2,                 convert_yuy2_to_nv16_sse2 },
	{ CF_YUY2, OUT_CSP_NV16,   BIT_8, A,  1,  NONE,                 convert_yuy2_to_nv16 },
	//YC48 -> nv16(16bit)
#if (_MSC_VER >= 1700)
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  AVX2|AVX,             convert_yc48_to_nv16_16bit_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_nv16_16bit_avx },
#endif
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  SSE41|SSSE3|SSE2,     convert_yc48_to_nv16_16bit_sse4_1 },
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  SSSE3|SSE2,           convert_yc48_to_nv16_16bit_ssse3 },
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  SSE2,                 convert_yc48_to_nv16_16bit_sse2 },
	{ CF_YC48, OUT_CSP_NV16,   BIT16, A,  1,  NONE,                 convert_yc48_to_nv16_16bit },

	//YC48 -> yuv444(8bit)
	{ CF_YC48, OUT_CSP_YUV444, BIT_8, A,  1,  SSE2,                 convert_yc48_to_yuv444_sse2 },
	{ CF_YC48, OUT_CSP_YUV444, BIT_8, A,  1,  NONE,                 convert_yc48_to_yuv444 },
	//YC48 -> yuv444(16bit)
#if (_MSC_VER >= 1700)
	{ CF_YC48, OUT_CSP_YUV444, BIT16, A,  1,  AVX2|AVX,             convert_yc48_to_yuv444_16bit_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YC48, OUT_CSP_YUV444, BIT16, A,  1,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_yuv444_16bit_avx },
#endif
	{ CF_YC48, OUT_CSP_YUV444, BIT16, A,  1,  SSE41|SSSE3|SSE2,     convert_yc48_to_yuv444_16bit_sse4_1 },
	{ CF_YC48, OUT_CSP_YUV444, BIT16, A,  1,  SSE2,                 convert_yc48_to_yuv444_16bit_sse2 },
	{ CF_YC48, OUT_CSP_YUV444, BIT16, A,  1,  NONE,                 convert_yc48_to_yuv444_16bit },
	//Copy RGB
	{ CF_RGB,  OUT_CSP_RGB,    BIT_8, A,  1,  SSE2,                 copy_rgb_sse2 },
	{ CF_RGB,  OUT_CSP_RGB,    BIT_8, A,  1,  NONE,                 copy_rgb },

	//YUY2 -> nv12(8bit)
#if (_MSC_VER >= 1700)
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P, 32,  AVX2|AVX,             convert_yuy2_to_nv12_avx2_mod32 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P,  1,  AVX2|AVX,             convert_yuy2_to_nv12_avx2 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I, 32,  AVX2|AVX,             convert_yuy2_to_nv12_i_avx2_mod32 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I,  1,  AVX2|AVX,             convert_yuy2_to_nv12_i_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P, 16,  AVX|SSE2,             convert_yuy2_to_nv12_avx_mod16 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P,  1,  AVX|SSE2,             convert_yuy2_to_nv12_avx },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I, 16,  AVX|SSE2,             convert_yuy2_to_nv12_i_avx_mod16 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I,  1,  AVX|SSE2,             convert_yuy2_to_nv12_i_avx },
#endif //_MSC_VER >= 1600
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P, 16,  SSE2,                 convert_yuy2_to_nv12_sse2_mod16 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P,  1,  SSE2,                 convert_yuy2_to_nv12_sse2 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, P,  1,  NONE,                 convert_yuy2_to_nv12 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I, 16,  SSSE3|SSE2,           convert_yuy2_to_nv12_i_ssse3_mod16 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I,  1,  SSSE3|SSE2,           convert_yuy2_to_nv12_i_ssse3 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I, 16,  SSE2,                 convert_yuy2_to_nv12_i_sse2_mod16 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I,  1,  SSE2,                 convert_yuy2_to_nv12_i_sse2 },
	{ CF_YUY2, OUT_CSP_NV12,   BIT_8, I,  1,  NONE,                 convert_yuy2_to_nv12_i },
	//YC48 -> nv12(16bit), progressive
#if (_MSC_VER >= 1700)
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P, 16,  AVX2|AVX,             convert_yc48_to_nv12_16bit_avx2_mod16 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  AVX2|AVX,             convert_yc48_to_nv12_16bit_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  8,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_nv12_16bit_avx_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_nv12_16bit_avx },
#endif
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  8,  SSE41|SSSE3|SSE2,     convert_yc48_to_nv12_16bit_sse4_1_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  SSE41|SSSE3|SSE2,     convert_yc48_to_nv12_16bit_sse4_1 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  8,  SSSE3|SSE2,           convert_yc48_to_nv12_16bit_ssse3_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  SSSE3|SSE2,           convert_yc48_to_nv12_16bit_ssse3 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  8,  SSE2,                 convert_yc48_to_nv12_16bit_sse2_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  SSE2,                 convert_yc48_to_nv12_16bit_sse2 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, P,  1,  NONE,                 convert_yc48_to_nv12_16bit },
	//YC48 -> nv12(16bit), interlaced
#if (_MSC_VER >= 1700)
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I, 16,  AVX2|AVX,             convert_yc48_to_nv12_i_16bit_avx2_mod16 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  AVX2|AVX,             convert_yc48_to_nv12_i_16bit_avx2 },
#endif
#if (_MSC_VER >= 1600)
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  8,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_nv12_i_16bit_avx_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  AVX|SSE41|SSSE3|SSE2, convert_yc48_to_nv12_i_16bit_avx },
#endif
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  8,  SSE41|SSSE3|SSE2,     convert_yc48_to_nv12_i_16bit_sse4_1_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  SSE41|SSSE3|SSE2,     convert_yc48_to_nv12_i_16bit_sse4_1 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  8,  SSSE3|SSE2,           convert_yc48_to_nv12_i_16bit_ssse3_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  SSSE3|SSE2,           convert_yc48_to_nv12_i_16bit_ssse3 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  8,  SSE2,                 convert_yc48_to_nv12_i_16bit_sse2_mod8 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  SSE2,                 convert_yc48_to_nv12_i_16bit_sse2 },
	{ CF_YC48, OUT_CSP_NV12,   BIT16, I,  1,  NONE,                 convert_yc48_to_nv12_i_16bit },
	
	//LW48 -> nv12 / nv16 / yuv444 (8bit and 16bit)
	{ CF_LW48, OUT_CSP_NV12,   BIT_8, P,  1,  NONE,                 convert_lw48_to_nv12 },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, P,  8,  AVX|SSE41|SSSE3|SSE2, convert_lw48_to_nv12_16bit_avx },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, P,  8,  SSE41|SSSE3|SSE2,     convert_lw48_to_nv12_16bit_sse41 },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, P,  1,  NONE,                 convert_lw48_to_nv12_16bit },
	{ CF_LW48, OUT_CSP_NV12,   BIT_8, I,  1,  NONE,                 convert_lw48_to_nv12_i },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, I,  8,  AVX|SSE41|SSSE3|SSE2, convert_lw48_to_nv12_i_16bit_avx },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, I,  8,  SSE41|SSSE3|SSE2,     convert_lw48_to_nv12_i_16bit_sse41 },
	{ CF_LW48, OUT_CSP_NV12,   BIT16, I,  1,  NONE,                 convert_lw48_to_nv12_i_16bit },
	{ CF_LW48, OUT_CSP_NV16,   BIT_8, A,  1,  NONE,                 convert_lw48_to_nv16 },
	{ CF_LW48, OUT_CSP_NV16,   BIT16, A,  8,  AVX|SSE41|SSSE3|SSE2, convert_lw48_to_nv16_16bit_avx },
	{ CF_LW48, OUT_CSP_NV16,   BIT16, A,  8,  SSE41|SSSE3|SSE2,     convert_lw48_to_nv16_16bit_sse41 },
	{ CF_LW48, OUT_CSP_NV16,   BIT16, A,  1,  NONE,                 convert_lw48_to_nv16_16bit },
	{ CF_LW48, OUT_CSP_YUV444, BIT_8, A, 16,  AVX|SSE41|SSSE3|SSE2, convert_lw48_to_yuv444_avx },
	{ CF_LW48, OUT_CSP_YUV444, BIT_8, A, 16,  SSE41|SSSE3|SSE2,     convert_lw48_to_yuv444_sse41 },
	{ CF_LW48, OUT_CSP_YUV444, BIT_8, A,  1,  NONE,                 convert_lw48_to_yuv444 },
	{ CF_LW48, OUT_CSP_YUV444, BIT16, A,  8,  AVX|SSE41|SSSE3|SSE2, convert_lw48_to_yuv444_16bit_avx },
	{ CF_LW48, OUT_CSP_YUV444, BIT16, A,  8,  SSE41|SSSE3|SSE2,     convert_lw48_to_yuv444_16bit_sse41 },
	{ CF_LW48, OUT_CSP_YUV444, BIT16, A,  1,  NONE,                 convert_lw48_to_yuv444_16bit },

	//terminator (func == NULL)
	{ 0, 0, 0, A, 0, 0, NULL }
};

//Build the human readable SIMD suffix used in the log line.
//  simd  : combination of SIMD flags (NONE, SSE2, ... AVX2)
//  buf   : destination buffer, cleared first
//  nSize : size of buf in chars
//buf is left empty when simd is NONE; otherwise it becomes ", using" followed by
//one " NAME" token per flag that is set, in ascending instruction-set order.
static void build_simd_info(DWORD simd, char *buf, DWORD nSize) {
	static const struct {
		DWORD       flag;
		const char *label;
	} SIMD_LABELS[] = {
		{ SSE2,  " SSE2"   },
		{ SSE3,  " SSE3"   },
		{ SSSE3, " SSSE3"  },
		{ SSE41, " SSE4.1" },
		{ SSE42, " SSE4.2" },
		{ AVX,   " AVX"    },
		{ AVX2,  " AVX2"   },
	};

	ZeroMemory(buf, nSize);
	if (simd == NONE)
		return;

	strcpy_s(buf, nSize, ", using");
	for (size_t idx = 0; idx < _countof(SIMD_LABELS); idx++) {
		if (simd & SIMD_LABELS[idx].flag)
			strcat_s(buf, nSize, SIMD_LABELS[idx].label);
	}
}

//Write one LOG_INFO line describing the selected conversion function.
//  func_info : the table entry that was selected (must not be NULL)
//The line names the conversion and ends with the SIMD suffix produced by
//build_simd_info (empty when the entry requires no SIMD).
static void auo_write_func_info(const COVERT_FUNC_INFO *func_info) {
	char simd_buf[128];
	build_simd_info(func_info->SIMD, simd_buf, _countof(simd_buf));

	if (func_info->output_csp == OUT_CSP_YUY2) {
		//The format string previously had no "%s", so the SIMD suffix that is
		//passed as an argument was silently dropped for the YUY2 case.
		write_log_auo_line_fmt(LOG_INFO, "Passing YUY2%s", simd_buf);
		return;
	}

	if (func_info->output_csp == OUT_CSP_RGB) {
		write_log_auo_line_fmt(LOG_INFO, "Copying RGB%s", simd_buf);
		return;
	}

	//"p" / "i" suffix after the colorspace name; empty when the entry handles both.
	const char *interlaced = NULL;
	switch (func_info->for_interlaced) {
		case P: interlaced = "p"; break;
		case I: interlaced = "i"; break;
		case A: 
		default:interlaced = ""; break;
	}
	const char *use_16bit = (func_info->for_16bit == BIT16) ? "(16bit)" : "";

	write_log_auo_line_fmt(LOG_INFO, "converting %s -> %s%s%s%s",
		CF_NAME[func_info->input_from_aviutl],
		specify_csp[func_info->output_csp],
		interlaced,
		use_16bit,
		simd_buf);
}

//C4189 : local variable is initialized but not referenced.
#pragma warning( push )
#pragma warning( disable: 4189 )
//Select the conversion function to use.
//  width      : frame width; must be divisible by the entry's mod
//  input_csp  : input format, matched against input_from_aviutl
//  use16bit   : matched against the entry's for_16bit
//  interlaced : matched against for_interlaced unless the entry is marked A
//  output_csp : output colorspace, matched against output_csp
//Returns the function of the first FUNC_TABLE entry that satisfies every condition
//and whose required SIMD flags are all available, after logging the choice.
//Returns NULL when no entry matches.
func_convert_frame get_convert_func(int width, int input_csp, BOOL use16bit, BOOL interlaced, int output_csp) {
	const DWORD availableSIMD = get_availableSIMD();

	//The table ends with an entry whose func is NULL.
	for (const COVERT_FUNC_INFO *entry = FUNC_TABLE; entry->func; entry++) {
		if (entry->input_from_aviutl == input_csp
			&& entry->output_csp == output_csp
			&& entry->for_16bit == use16bit
			&& (entry->for_interlaced == A || entry->for_interlaced == (eInterlace)interlaced)
			&& (width % entry->mod) == 0
			&& (entry->SIMD & availableSIMD) == entry->SIMD) {
			auo_write_func_info(entry);
			return entry->func;
		}
	}

	return NULL;
}
#pragma warning( pop )

//Allocate the output buffer for one frame as a single aligned block and set up
//the plane pointers inside it.
//  pixel_data : receives the plane pointers in data[]; data[] is zeroed first
//  width      : frame width in pixels
//  height     : frame height in pixels
//  output_csp : OUT_CSP_xxx; decides the total size and how many plane pointers are set
//               (unknown values are handled like OUT_CSP_NV12)
//  use16bit   : non-zero -> sizeof(short) bytes per sample, zero -> one byte per sample
//Returns TRUE on success. Returns FALSE when the allocation fails, in which case
//every entry of data[] is left NULL.
//The block is allocated with _mm_malloc, so it has to be released with _mm_free.
BOOL malloc_pixel_data(CONVERT_CF_DATA * const pixel_data, int width, int height, int output_csp, BOOL use16bit) {
	const DWORD pixel_size = (use16bit) ? sizeof(short) : sizeof(BYTE);
	const DWORD simd_check = get_availableSIMD();
	const DWORD align_size = (simd_check & AUO_SIMD_SSE2) ? ((simd_check & AUO_SIMD_AVX2) ? 32 : 16) : 1;
	//Alignment expressed in pixels. It must never become 0: with align_size == 1 and
	//2-byte samples the integer division yields 0, which made ALIGN_NEXT(width, 0)
	//evaluate to 0, wrapped the padding term and produced a frame_size that was too small.
	const DWORD align_pixels = (align_size > pixel_size) ? (align_size / pixel_size) : 1;
#define ALIGN_NEXT(i, align) (((i) + ((align)-1)) & (~((align)-1))) //align must be a power of two (1,2,4,8,16,32...)
	//Size of one full-resolution plane plus padding for the width round-up, rounded up to align_size.
	const DWORD frame_size = ALIGN_NEXT(width * height * pixel_size + (ALIGN_NEXT(width, align_pixels) - width) * 2 * pixel_size, align_size);
#undef ALIGN_NEXT

	DWORD total_size  = 0; //bytes to allocate
	int   plane_count = 0; //number of data[] pointers to set, spaced frame_size apart
	switch (output_csp) {
		case OUT_CSP_YUY2: //even for YUY2 a copy is required in order to support the copy-frame feature
			total_size  = frame_size * 2;
			plane_count = 1;
			break;
		case OUT_CSP_NV16:
			total_size  = frame_size * 2;
			plane_count = 2;
			break;
		case OUT_CSP_YUV444:
			total_size  = frame_size * 3;
			plane_count = 3;
			break;
		case OUT_CSP_RGB:
			total_size  = frame_size * 3;
			plane_count = 1;
			break;
		case OUT_CSP_NV12:
		default:
			total_size  = frame_size * 3 / 2;
			plane_count = 2;
			break;
	}

	ZeroMemory(pixel_data->data, sizeof(pixel_data->data));

	BYTE *base = (BYTE *)_mm_malloc(total_size, max(align_size, 16));
	if (base == NULL)
		return FALSE; //do not derive plane pointers from a NULL base

	for (int plane = 0; plane < plane_count; plane++)
		pixel_data->data[plane] = base + frame_size * plane;
	return TRUE;
}

//Release the block referenced by data[0] (if any) with _mm_free and clear the
//whole CONVERT_CF_DATA structure to zero.
//  pixel_data : structure to release; must not be NULL
void free_pixel_data(CONVERT_CF_DATA *pixel_data) {
	if (pixel_data->data[0] != NULL) {
		_mm_free(pixel_data->data[0]);
	}
	ZeroMemory(pixel_data, sizeof(*pixel_data));
}
