//-------------------------------------------------------------------------------------
// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------

#pragma once

#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
#error SSE4 not supported on ARM platform
#endif

#include <string.h>

#include <smmintrin.h>

#include <DirectXMath.h>

namespace DirectX
{

namespace SSE4
{

// Queries CPUID to determine whether the executing processor reports SSE4.1.
// Returns false if CPUID leaf 1 is not available or the SSE4.1 feature bit is clear.
inline bool XMVerifySSE4Support()
{
    // Expected to report true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors

    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
    int regs[4] = { -1 };

    // Leaf 0: the first output is compared against 1 to confirm leaf 1 can be queried.
#if defined(__clang__) || defined(__GNUC__)
    __cpuid(0, regs[0], regs[1], regs[2], regs[3]);
#else
    __cpuid(regs, 0);
#endif
    const bool hasFeatureLeaf = ( regs[0] >= 1 );
    if ( !hasFeatureLeaf )
        return false;

    // Leaf 1: feature flags.
#if defined(__clang__) || defined(__GNUC__)
    __cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#else
    __cpuid(regs, 1);
#endif

    // We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
    const int sse41Flag = 0x80000;
    return ( (regs[2] & sse41Flag) != 0 );
}


//-------------------------------------------------------------------------------------
// Vector
//-------------------------------------------------------------------------------------

#ifdef __clang__
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif

// Writes the Y component (lane 1) of V to *y.
//   y - destination; must not be null (checked with assert).
//   V - source vector.
// _mm_extract_ps returns the lane's raw 32-bit pattern as an int. The bits are
// copied into the float with memcpy instead of being stored through a
// reinterpret_cast'ed int*, which would break the strict-aliasing rule.
inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
{
    assert( y != nullptr );
    const int bits = _mm_extract_ps( V, 1 );
    static_assert( sizeof(int) == sizeof(float), "lane bit pattern must be float-sized" );
    memcpy( y, &bits, sizeof(bits) );
}

// Writes the Z component (lane 2) of V to *z.
//   z - destination; must not be null (checked with assert).
//   V - source vector.
// _mm_extract_ps returns the lane's raw 32-bit pattern as an int. The bits are
// copied into the float with memcpy instead of being stored through a
// reinterpret_cast'ed int*, which would break the strict-aliasing rule.
inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
{
    assert( z != nullptr );
    const int bits = _mm_extract_ps( V, 2 );
    static_assert( sizeof(int) == sizeof(float), "lane bit pattern must be float-sized" );
    memcpy( z, &bits, sizeof(bits) );
}

// Writes the W component (lane 3) of V to *w.
//   w - destination; must not be null (checked with assert).
//   V - source vector.
// _mm_extract_ps returns the lane's raw 32-bit pattern as an int. The bits are
// copied into the float with memcpy instead of being stored through a
// reinterpret_cast'ed int*, which would break the strict-aliasing rule.
inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
{
    assert( w != nullptr );
    const int bits = _mm_extract_ps( V, 3 );
    static_assert( sizeof(int) == sizeof(float), "lane bit pattern must be float-sized" );
    memcpy( w, &bits, sizeof(bits) );
}

// Returns the raw 32-bit contents of lane 1 (Y) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers (no conversion), then pull out lane 1
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 1 );
    return static_cast<uint32_t>( lane );
}

// Returns the raw 32-bit contents of lane 2 (Z) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers (no conversion), then pull out lane 2
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 2 );
    return static_cast<uint32_t>( lane );
}

// Returns the raw 32-bit contents of lane 3 (W) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers (no conversion), then pull out lane 3
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 3 );
    return static_cast<uint32_t>( lane );
}

// Stores the raw 32-bit contents of lane 1 (Y) of V to *y.
//   y - destination; must not be null (checked with assert).
inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
{
    assert( y != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 1 );
    *y = static_cast<uint32_t>( lane );
}

// Stores the raw 32-bit contents of lane 2 (Z) of V to *z.
//   z - destination; must not be null (checked with assert).
inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
{
    assert( z != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 2 );
    *z = static_cast<uint32_t>( lane );
}

// Stores the raw 32-bit contents of lane 3 (W) of V to *w.
//   w - destination; must not be null (checked with assert).
inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
{
    assert( w != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 3 );
    *w = static_cast<uint32_t>( lane );
}

// Returns a copy of V with lane 1 (Y) replaced by y; the other lanes are unchanged.
inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
{
    // Place y in lane 0 of a scratch vector
    const XMVECTOR vScalar = _mm_set_ss(y);
    // 0x10: take source lane 0, write destination lane 1, zero nothing
    return _mm_insert_ps( V, vScalar, 0x10 );
}

// Returns a copy of V with lane 2 (Z) replaced by z; the other lanes are unchanged.
inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
{
    // Place z in lane 0 of a scratch vector
    const XMVECTOR vScalar = _mm_set_ss(z);
    // 0x20: take source lane 0, write destination lane 2, zero nothing
    return _mm_insert_ps( V, vScalar, 0x20 );
}

// Returns a copy of V with lane 3 (W) replaced by w; the other lanes are unchanged.
inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
{
    // Place w in lane 0 of a scratch vector
    const XMVECTOR vScalar = _mm_set_ss(w);
    // 0x30: take source lane 0, write destination lane 3, zero nothing
    return _mm_insert_ps( V, vScalar, 0x30 );
}

// Returns a copy of V whose lane 1 (Y) holds the raw 32-bit pattern y.
inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
{
    // Work on the integer view of the register so the bits are inserted unmodified
    const __m128i vBits = _mm_castps_si128( V );
    const __m128i vPatched = _mm_insert_epi32( vBits, static_cast<int>(y), 1 );
    return _mm_castsi128_ps( vPatched );
}

// Returns a copy of V whose lane 2 (Z) holds the raw 32-bit pattern z.
inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
{
    // Work on the integer view of the register so the bits are inserted unmodified
    const __m128i vBits = _mm_castps_si128( V );
    const __m128i vPatched = _mm_insert_epi32( vBits, static_cast<int>(z), 2 );
    return _mm_castsi128_ps( vPatched );
}

// Returns a copy of V whose lane 3 (W) holds the raw 32-bit pattern w.
inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
{
    // Work on the integer view of the register so the bits are inserted unmodified
    const __m128i vBits = _mm_castps_si128( V );
    const __m128i vPatched = _mm_insert_epi32( vBits, static_cast<int>(w), 3 );
    return _mm_castsi128_ps( vPatched );
}

// Rounds each lane of V using round-to-nearest mode with exceptions suppressed.
inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
{
    const XMVECTOR vRounded = _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
    return vRounded;
}

// Rounds each lane of V toward zero with exceptions suppressed.
inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
{
    const XMVECTOR vTruncated = _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
    return vTruncated;
}

// Applies _mm_floor_ps to V (per-lane floor).
inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
{
    const XMVECTOR vFloored = _mm_floor_ps( V );
    return vFloored;
}

// Applies _mm_ceil_ps to V (per-lane ceiling).
inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
{
    const XMVECTOR vCeiled = _mm_ceil_ps( V );
    return vCeiled;
}


//-------------------------------------------------------------------------------------
// Vector2
//-------------------------------------------------------------------------------------

// 2D dot product of V1 and V2 (x and y lanes only), replicated to all four lanes.
inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0x3f: multiply lanes x,y (high nibble 0x3), broadcast the sum to every lane (low nibble 0xf)
    const XMVECTOR vDot = _mm_dp_ps( V1, V2, 0x3f );
    return vDot;
}

// Squared 2D length of V (dot of V with itself over x,y), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
{
    // Same operation as the 2D dot product with both operands equal to V
    return _mm_dp_ps( V, V, 0x3f );
}

// Estimated 1/length of the 2D vector V, replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V )
{
    // Approximate reciprocal square root of the squared x,y length
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
}

// 1/length of the 2D vector V, replicated to all lanes (full-precision sqrt and divide).
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x3f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return _mm_div_ps( g_XMOne, vLen );
}

// Length of the 2D vector V, replicated to all lanes.
// This variant uses the same _mm_sqrt_ps path as the non-estimate version.
inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V )
{
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
}

// Length of the 2D vector V (sqrt of the x,y dot product), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x3f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return vLen;
}

// Estimated normalization of V by its 2D length: V scaled by an approximate 1/length.
// No zero- or infinite-length handling is performed.
inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V )
{
    const XMVECTOR vInvLen = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
    return _mm_mul_ps( vInvLen, V );
}

// Normalizes V by its 2D length (computed from x and y).
// Result per lane: V / length in the ordinary case, zero when the length
// compares equal to zero, and g_XMQNaN when the squared length equals g_XMInfinity.
inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V )
{
    // Squared length over x,y, replicated to every lane
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x3f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    // All-ones where the length is not equal to zero (FP compare, so -0.0 counts as zero)
    const XMVECTOR vNonZero = _mm_cmpneq_ps( _mm_setzero_ps(), vLen );
    // All-ones where the squared length is not equal to g_XMInfinity
    const XMVECTOR vNotInfinite = _mm_cmpneq_ps( vLenSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes of a zero-length input
    const XMVECTOR vQuotient = _mm_and_ps( _mm_div_ps( V, vLen ), vNonZero );
    // Select QNaN for an infinite length, otherwise the quotient
    const XMVECTOR vNaNPart = _mm_andnot_ps( vNotInfinite, g_XMQNaN );
    const XMVECTOR vValuePart = _mm_and_ps( vQuotient, vNotInfinite );
    return _mm_or_ps( vNaNPart, vValuePart );
}


//-------------------------------------------------------------------------------------
// Vector3
//-------------------------------------------------------------------------------------

// 3D dot product of V1 and V2 (x, y, z lanes only), replicated to all four lanes.
inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0x7f: multiply lanes x,y,z (high nibble 0x7), broadcast the sum to every lane (low nibble 0xf)
    const XMVECTOR vDot = _mm_dp_ps( V1, V2, 0x7f );
    return vDot;
}

// Squared 3D length of V (dot of V with itself over x,y,z), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
{
    // Same operation as the 3D dot product with both operands equal to V
    return _mm_dp_ps( V, V, 0x7f );
}

// Estimated 1/length of the 3D vector V, replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V )
{
    // Approximate reciprocal square root of the squared x,y,z length
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
}

// 1/length of the 3D vector V, replicated to all lanes (full-precision sqrt and divide).
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x7f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return _mm_div_ps( g_XMOne, vLen );
}

// Length of the 3D vector V, replicated to all lanes.
// This variant uses the same _mm_sqrt_ps path as the non-estimate version.
inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V )
{
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
}

// Length of the 3D vector V (sqrt of the x,y,z dot product), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x7f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return vLen;
}

// Estimated normalization of V by its 3D length: V scaled by an approximate 1/length.
// No zero- or infinite-length handling is performed.
inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V )
{
    const XMVECTOR vInvLen = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
    return _mm_mul_ps( vInvLen, V );
}

// Normalizes V by its 3D length (computed from x, y and z).
// Result per lane: V / length in the ordinary case, zero when the length
// compares equal to zero, and g_XMQNaN when the squared length equals g_XMInfinity.
inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V )
{
    // Squared length over x,y,z, replicated to every lane
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0x7f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    // All-ones where the length is not equal to zero (FP compare, so -0.0 counts as zero)
    const XMVECTOR vNonZero = _mm_cmpneq_ps( _mm_setzero_ps(), vLen );
    // All-ones where the squared length is not equal to g_XMInfinity
    const XMVECTOR vNotInfinite = _mm_cmpneq_ps( vLenSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes of a zero-length input
    const XMVECTOR vQuotient = _mm_and_ps( _mm_div_ps( V, vLen ), vNonZero );
    // Select QNaN for an infinite length, otherwise the quotient
    const XMVECTOR vNaNPart = _mm_andnot_ps( vNotInfinite, g_XMQNaN );
    const XMVECTOR vValuePart = _mm_and_ps( vQuotient, vNotInfinite );
    return _mm_or_ps( vNaNPart, vValuePart );
}


//-------------------------------------------------------------------------------------
// Vector4
//-------------------------------------------------------------------------------------

// 4D dot product of V1 and V2 over all lanes, replicated to all four lanes.
inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0xff: multiply all four lanes (high nibble 0xf), broadcast the sum to every lane (low nibble 0xf)
    const XMVECTOR vDot = _mm_dp_ps( V1, V2, 0xff );
    return vDot;
}

// Squared 4D length of V (dot of V with itself over all lanes), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
{
    // Same operation as the 4D dot product with both operands equal to V
    return _mm_dp_ps( V, V, 0xff );
}

// Estimated 1/length of the 4D vector V, replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V )
{
    // Approximate reciprocal square root of the squared 4D length
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0xff ) );
}

// 1/length of the 4D vector V, replicated to all lanes (full-precision sqrt and divide).
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0xff );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return _mm_div_ps( g_XMOne, vLen );
}

// Length of the 4D vector V, replicated to all lanes.
// This variant uses the same _mm_sqrt_ps path as the non-estimate version.
inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V )
{
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0xff ) );
}

// Length of the 4D vector V (sqrt of the four-lane dot product), replicated to all lanes.
inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V )
{
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0xff );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    return vLen;
}

// Estimated normalization of V by its 4D length: V scaled by an approximate 1/length.
// No zero- or infinite-length handling is performed.
inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V )
{
    const XMVECTOR vInvLen = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0xff ) );
    return _mm_mul_ps( vInvLen, V );
}

// Normalizes V by its 4D length (computed from all four lanes).
// Result per lane: V / length in the ordinary case, zero when the length
// compares equal to zero, and g_XMQNaN when the squared length equals g_XMInfinity.
inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V )
{
    // Squared length over all lanes, replicated to every lane
    const XMVECTOR vLenSq = _mm_dp_ps( V, V, 0xff );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    // All-ones where the length is not equal to zero (FP compare, so -0.0 counts as zero)
    const XMVECTOR vNonZero = _mm_cmpneq_ps( _mm_setzero_ps(), vLen );
    // All-ones where the squared length is not equal to g_XMInfinity
    const XMVECTOR vNotInfinite = _mm_cmpneq_ps( vLenSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes of a zero-length input
    const XMVECTOR vQuotient = _mm_and_ps( _mm_div_ps( V, vLen ), vNonZero );
    // Select QNaN for an infinite length, otherwise the quotient
    const XMVECTOR vNaNPart = _mm_andnot_ps( vNotInfinite, g_XMQNaN );
    const XMVECTOR vValuePart = _mm_and_ps( vQuotient, vNotInfinite );
    return _mm_or_ps( vNaNPart, vValuePart );
}


//-------------------------------------------------------------------------------------
// Plane
//-------------------------------------------------------------------------------------

// Estimated plane normalization: scales all four lanes of P by an approximate
// reciprocal of the length of its x,y,z part.
inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P )
{
    // 0x7f: the length uses x,y,z only; the result is broadcast so w is scaled too
    const XMVECTOR vInvLen = _mm_rsqrt_ps( _mm_dp_ps( P, P, 0x7f ) );
    return _mm_mul_ps( vInvLen, P );
}

// Normalizes plane P by dividing all four lanes by the length of its x,y,z part.
// When the squared length equals g_XMInfinity the result is all zero.
// No check is made for a zero length.
inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P )
{
    // Squared length over x,y,z, replicated to every lane
    const XMVECTOR vLenSq = _mm_dp_ps( P, P, 0x7f );
    const XMVECTOR vLen = _mm_sqrt_ps( vLenSq );
    // All-ones where the squared length is not equal to g_XMInfinity
    const XMVECTOR vNotInfinite = _mm_cmpneq_ps( vLenSq, g_XMInfinity );
    // Divide to perform the normalization
    const XMVECTOR vQuotient = _mm_div_ps( P, vLen );
    // Clear every lane when the length was infinite
    return _mm_and_ps( vQuotient, vNotInfinite );
}

} // namespace SSE4

} // namespace DirectX