各種SIMD拡張命令(SSE2、SSSE3、AVX2)に対応

This commit is contained in:
HaijinW 2019-02-14 19:29:23 +09:00
parent 767a55d293
commit 4544996900
17 changed files with 2351 additions and 27 deletions

7
MEMO.txt Normal file
View File

@ -0,0 +1,7 @@
勉強用に復号処理をSIMD拡張命令で実装。
既存のコードや資料などを参考に、SSE2、SSSE3、AVX2に対応した。
初期化時には、AVX2、SSSE3、SSE2、拡張命令なしの順で利用可能なものを選択する。
ラウンド関数のあと、最後のXOR演算はもっとよい方法があればよかったが、思いつかなかった。
Windows環境x86-64でのみ動作確認。開発環境は Visual Studio 2017 Community (15.9.7)。
あくまで勉強用なので、安定的な動作の保証はない。

View File

@ -1,10 +1,11 @@
// IB25Decoder.h: IB25Decoder クラスのインターフェイス
// IB25Decoder.h: IB25Decoder クラスのインターフェイス
//
//////////////////////////////////////////////////////////////////////
#pragma once
#include <stdint.h>
/////////////////////////////////////////////////////////////////////////////
// 定数定義
@ -46,7 +47,8 @@ public:
virtual void DiscardNullPacket(const bool bEnable = true) = 0;
virtual void DiscardScramblePacket(const bool bEnable = true) = 0;
virtual void EnableEmmProcess(const bool bEnable = true) = 0;
virtual void SetMulti2Round(const int32_t round = 4) = 0; // オリジナルに追加
virtual void SetMulti2Round(const int32_t round = 4) = 0; // オリジナルに追加
virtual void SetSimdMode(const int32_t instruction = 3) = 0; // オリジナルに追加
virtual const DWORD GetDescramblingState(const WORD wProgramID) = 0;

View File

@ -17,7 +17,7 @@ CFLAGS = -O2 -fPIC -Wall $(PCSC_CFLAGS) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=
LIBS = $(PCSC_LDLIBS)
LDFLAGS =
OBJS = arib_std_b25.o b_cas_card.o multi2.o ts_section_parser.o
OBJS = arib_std_b25.o b_cas_card.o multi2.o multi2_simd.o ts_section_parser.o
HEADERS = arib_std_b25.h arib_std_b25_error_code.h b_cas_card.h portable.h
TARGET_APP = b25
TARGET_LIB = libaribb25.so

View File

@ -5,6 +5,7 @@
#include "arib_std_b25.h"
#include "arib_std_b25_error_code.h"
#include "multi2.h"
#include "multi2_simd.h"
#include "ts_common_types.h"
#include "ts_section_parser.h"
@ -88,6 +89,9 @@ typedef struct {
int32_t multi2_round;
int32_t strip;
int32_t emm_proc_on;
#ifdef ENABLE_MULTI2_SIMD
int32_t simd_instruction;
#endif
int32_t unit_size;
@ -315,6 +319,7 @@ static void release_arib_std_b25(void *std_b25);
static int set_multi2_round_arib_std_b25(void *std_b25, int32_t round);
static int set_strip_arib_std_b25(void *std_b25, int32_t strip);
static int set_emm_proc_arib_std_b25(void *std_b25, int32_t on);
static int set_simd_mode_arib_std_b25(void *std_b25, int32_t instruction);
static int set_b_cas_card_arib_std_b25(void *std_b25, B_CAS_CARD *bcas);
static int set_unit_size_arib_std_b25(void *std_b25, int size);
static int reset_arib_std_b25(void *std_b25);
@ -344,6 +349,7 @@ ARIB_STD_B25 *create_arib_std_b25(void)
}
prv->multi2_round = 4;
#ifdef ENABLE_MULTI2_SIMD
/* simd_instruction is only a member of the private data when
   ENABLE_MULTI2_SIMD is defined, so its default is set under the same guard. */
prv->simd_instruction = (int32_t)get_supported_simd_instruction();
#endif
r = (ARIB_STD_B25 *)(prv+1);
r->private_data = prv;
@ -352,6 +358,7 @@ ARIB_STD_B25 *create_arib_std_b25(void)
r->set_multi2_round = set_multi2_round_arib_std_b25;
r->set_strip = set_strip_arib_std_b25;
r->set_emm_proc = set_emm_proc_arib_std_b25;
r->set_simd_mode = set_simd_mode_arib_std_b25;
r->set_b_cas_card = set_b_cas_card_arib_std_b25;
r->set_unit_size = set_unit_size_arib_std_b25;
r->reset = reset_arib_std_b25;
@ -380,7 +387,11 @@ static int32_t find_ca_descriptor_pid(uint8_t *head, uint8_t *tail, int32_t ca_s
static int32_t add_ecm_stream(ARIB_STD_B25_PRIVATE_DATA *prv, TS_STREAM_LIST *list, int32_t ecm_pid);
static int check_ecm_complete(ARIB_STD_B25_PRIVATE_DATA *prv);
static int find_ecm(ARIB_STD_B25_PRIVATE_DATA *prv);
#ifdef ENABLE_MULTI2_SIMD
static int proc_ecm(DECRYPTOR_ELEM *dec, B_CAS_CARD *bcas, int32_t multi2_round, int32_t simd_instruction);
#else
static int proc_ecm(DECRYPTOR_ELEM *dec, B_CAS_CARD *bcas, int32_t multi2_round);
#endif
static int proc_arib_std_b25(ARIB_STD_B25_PRIVATE_DATA *prv);
static int proc_cat(ARIB_STD_B25_PRIVATE_DATA *prv);
@ -471,6 +482,21 @@ static int set_emm_proc_arib_std_b25(void *std_b25, int32_t on)
return 0;
}
/*
 * Store the SIMD instruction set to be used for MULTI2 decoding.
 *
 * std_b25     : ARIB_STD_B25 interface pointer
 * instruction : requested instruction set, stored as given
 *
 * Returns 0 on success, ARIB_STD_B25_ERROR_INVALID_PARAM when the private
 * data cannot be resolved.  When ENABLE_MULTI2_SIMD is not defined the call
 * does nothing and returns 0.
 */
static int set_simd_mode_arib_std_b25(void * std_b25, int32_t instruction)
{
#ifdef ENABLE_MULTI2_SIMD
	ARIB_STD_B25_PRIVATE_DATA *self = private_data(std_b25);

	if(self != NULL){
		self->simd_instruction = instruction;
		return 0;
	}
	return ARIB_STD_B25_ERROR_INVALID_PARAM;
#else
	return 0;
#endif
}
static int set_b_cas_card_arib_std_b25(void *std_b25, B_CAS_CARD *bcas)
{
int n;
@ -694,7 +720,11 @@ static int flush_arib_std_b25(void *std_b25)
if(m == 0){
goto NEXT;
}
#ifdef ENABLE_MULTI2_SIMD
r = proc_ecm(dec, prv->bcas, prv->multi2_round, prv->simd_instruction);
#else
r = proc_ecm(dec, prv->bcas, prv->multi2_round);
#endif
if(r < 0){
if((curr+unit) <= tail)
l = unit;
@ -1140,19 +1170,19 @@ static void teardown(ARIB_STD_B25_PRIVATE_DATA *prv)
release_work_buffer(&(prv->dbuf));
}
/*
 * Set the unit size used by this decoder instance.
 *
 * std_b25 : ARIB_STD_B25 interface pointer
 * size    : unit size in bytes; accepted range is 188..320 inclusive
 *
 * Returns 0 on success, ARIB_STD_B25_ERROR_INVALID_PARAM when the private
 * data cannot be resolved or size is out of range.
 */
static int set_unit_size_arib_std_b25(void *std_b25, int size)
{
	ARIB_STD_B25_PRIVATE_DATA *self = private_data(std_b25);

	if(self == NULL){
		return ARIB_STD_B25_ERROR_INVALID_PARAM;
	}
	if(size < 188 || size > 320){
		return ARIB_STD_B25_ERROR_INVALID_PARAM;
	}

	self->unit_size = size;
	return 0;
}
/*
 * Set the unit size used by this decoder instance.
 *
 * std_b25 : ARIB_STD_B25 interface pointer
 * size    : unit size in bytes; accepted range is 188..320 inclusive
 *
 * Returns 0 on success, ARIB_STD_B25_ERROR_INVALID_PARAM when the private
 * data cannot be resolved or size is out of range.
 */
static int set_unit_size_arib_std_b25(void *std_b25, int size)
{
	ARIB_STD_B25_PRIVATE_DATA *self = private_data(std_b25);

	if(self == NULL){
		return ARIB_STD_B25_ERROR_INVALID_PARAM;
	}
	if(size < 188 || size > 320){
		return ARIB_STD_B25_ERROR_INVALID_PARAM;
	}

	self->unit_size = size;
	return 0;
}
static int select_unit_size(ARIB_STD_B25_PRIVATE_DATA *prv)
{
@ -1906,7 +1936,11 @@ static int find_ecm(ARIB_STD_B25_PRIVATE_DATA *prv)
goto NEXT;
}
#ifdef ENABLE_MULTI2_SIMD
r = proc_ecm(dec, prv->bcas, prv->multi2_round, prv->simd_instruction);
#else
r = proc_ecm(dec, prv->bcas, prv->multi2_round);
#endif
if(r < 0){
curr += unit;
goto LAST;
@ -1938,7 +1972,11 @@ LAST:
return r;
}
#ifdef ENABLE_MULTI2_SIMD
static int proc_ecm(DECRYPTOR_ELEM *dec, B_CAS_CARD *bcas, int32_t multi2_round, int32_t simd_instruction)
#else
static int proc_ecm(DECRYPTOR_ELEM *dec, B_CAS_CARD *bcas, int32_t multi2_round)
#endif
{
int r,n;
uint32_t len;
@ -2005,6 +2043,9 @@ static int proc_ecm(DECRYPTOR_ELEM *dec, B_CAS_CARD *bcas, int32_t multi2_round)
if(dec->m2 == NULL){
dec->m2 = create_multi2();
/* Check the allocation before using the object: set_simd is called through
   dec->m2, so it must not run when create_multi2() returned NULL. */
if(dec->m2 == NULL){
	return ARIB_STD_B25_ERROR_NO_ENOUGH_MEMORY;
}
#ifdef ENABLE_MULTI2_SIMD
/* The return value is not checked here, as before. */
dec->m2->set_simd(dec->m2, (enum INSTRUCTION_TYPE)simd_instruction);
#endif
@ -2156,7 +2197,11 @@ static int proc_arib_std_b25(ARIB_STD_B25_PRIVATE_DATA *prv)
if(m == 0){
goto NEXT;
}
#ifdef ENABLE_MULTI2_SIMD
r = proc_ecm(dec, prv->bcas, prv->multi2_round, prv->simd_instruction);
#else
r = proc_ecm(dec, prv->bcas, prv->multi2_round);
#endif
if(r < 0){
return r;
}
@ -2708,7 +2753,8 @@ static int reserve_work_buffer(TS_WORK_BUFFER *buf, intptr_t size)
n += n;
}
p = (uint8_t *)malloc(n);
//p = (uint8_t *)malloc(n);
p = (uint8_t *)mem_aligned_alloc(n);
if(p == NULL){
return 0;
}
@ -2719,7 +2765,8 @@ static int reserve_work_buffer(TS_WORK_BUFFER *buf, intptr_t size)
if(m > 0){
memcpy(p, buf->head, m);
}
free(buf->pool);
//free(buf->pool);
mem_aligned_free(buf->pool);
buf->pool = NULL;
}
@ -2763,7 +2810,8 @@ static void reset_work_buffer(TS_WORK_BUFFER *buf)
static void release_work_buffer(TS_WORK_BUFFER *buf)
{
if(buf->pool != NULL){
free(buf->pool);
//free(buf->pool);
mem_aligned_free(buf->pool);
}
buf->pool = NULL;
buf->head = NULL;

View File

@ -32,6 +32,7 @@ typedef struct {
int (* set_multi2_round)(void *std_b25, int32_t round);
int (* set_strip)(void *std_b25, int32_t strip);
int (* set_emm_proc)(void *std_b25, int32_t on);
int (* set_simd_mode)(void *std_b25, int32_t instruction);
int (* set_b_cas_card)(void *std_b25, B_CAS_CARD *bcas);

View File

@ -22,7 +22,7 @@
<ProjectGuid>{6E77C1AC-A31A-49B9-9A52-9FE1E03B8FEC}</ProjectGuid>
<RootNamespace>arib_std_b25</RootNamespace>
<Keyword>Win32Proj</Keyword>
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
@ -133,7 +133,7 @@
<PrecompiledHeader />
<WarningLevel>Level3</WarningLevel>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<Optimization>Full</Optimization>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
@ -161,7 +161,7 @@
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<Optimization>Full</Optimization>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
@ -184,6 +184,7 @@
<ClCompile Include="arib_std_b25.c" />
<ClCompile Include="b_cas_card.c" />
<ClCompile Include="multi2.c" />
<ClCompile Include="multi2_simd.c" />
<ClCompile Include="td.c" />
<ClCompile Include="ts_section_parser.c" />
</ItemGroup>
@ -194,7 +195,9 @@
<ClInclude Include="b_cas_card_error_code.h" />
<ClInclude Include="multi2.h" />
<ClInclude Include="multi2_error_code.h" />
<ClInclude Include="multi2_simd.h" />
<ClInclude Include="portable.h" />
<ClInclude Include="simd_instruction_type.h" />
<ClInclude Include="ts_common_types.h" />
<ClInclude Include="ts_section_parser.h" />
<ClInclude Include="ts_section_parser_error_code.h" />

View File

@ -30,6 +30,9 @@
<ClCompile Include="b_cas_card.c">
<Filter>ソース ファイル</Filter>
</ClCompile>
<ClCompile Include="multi2_simd.c">
<Filter>ソース ファイル</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="arib_std_b25.h">
@ -62,5 +65,11 @@
<ClInclude Include="ts_section_parser_error_code.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="simd_instruction_type.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="multi2_simd.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -1,4 +1,4 @@
// libaribb25.cpp: CB25Decoder クラスのインプリメンテーション
// libaribb25.cpp: CB25Decoder クラスのインプリメンテーション
//
//////////////////////////////////////////////////////////////////////
#include "libaribb25.h"
@ -248,6 +248,11 @@ void CB25Decoder::SetMulti2Round(const int32_t round)
_b25->set_multi2_round(_b25, round);
}
// Forwards the requested SIMD instruction set to the wrapped ARIB_STD_B25
// instance via its set_simd_mode entry. The status code returned by
// set_simd_mode is not inspected here.
void CB25Decoder::SetSimdMode(const int32_t instruction)
{
_b25->set_simd_mode(_b25, instruction);
}
const DWORD CB25Decoder::GetDescramblingState(const WORD wProgramID)
{
// 指定したプログラムIDの復号状態を返す

View File

@ -27,6 +27,7 @@ public:
virtual void DiscardScramblePacket(const bool bEnable = true);
virtual void EnableEmmProcess(const bool bEnable = true);
virtual void SetMulti2Round(const int32_t round = 4);
// Default matches IB25Decoder::SetSimdMode (3). Default arguments of virtual
// functions are taken from the static type of the call, so a different value
// here would make the effective default depend on how the object is referenced.
virtual void SetSimdMode(const int32_t instruction = 3);
virtual const DWORD GetDescramblingState(const WORD wProgramID);
virtual void ResetStatistics(void);
virtual const DWORD GetPacketStride(void);

View File

@ -21,7 +21,7 @@
<PropertyGroup Label="Globals">
<ProjectGuid>{32FCD075-2C1D-4796-926B-A0009ECCD1E8}</ProjectGuid>
<RootNamespace>libaribb25</RootNamespace>
<WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
<WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
@ -109,7 +109,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Full</Optimization>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>
@ -132,7 +132,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Full</Optimization>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>
@ -157,6 +157,7 @@
<ClCompile Include="b_cas_card.c" />
<ClCompile Include="libaribb25.cpp" />
<ClCompile Include="multi2.c" />
<ClCompile Include="multi2_simd.c" />
<ClCompile Include="ts_section_parser.c" />
</ItemGroup>
<ItemGroup>
@ -168,7 +169,9 @@
<ClInclude Include="libaribb25.h" />
<ClInclude Include="multi2.h" />
<ClInclude Include="multi2_error_code.h" />
<ClInclude Include="multi2_simd.h" />
<ClInclude Include="portable.h" />
<ClInclude Include="simd_instruction_type.h" />
<ClInclude Include="ts_common_types.h" />
<ClInclude Include="ts_section_parser.h" />
<ClInclude Include="ts_section_parser_error_code.h" />

View File

@ -30,6 +30,9 @@
<ClCompile Include="libaribb25.cpp">
<Filter>ソース ファイル</Filter>
</ClCompile>
<ClCompile Include="multi2_simd.c">
<Filter>ソース ファイル</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="multi2.h">
@ -68,5 +71,11 @@
<ClInclude Include="libaribb25.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="multi2_simd.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
<ClInclude Include="simd_instruction_type.h">
<Filter>ヘッダー ファイル</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -2,6 +2,7 @@
#include <string.h>
#include "multi2.h"
#include "multi2_simd.h"
#include "multi2_error_code.h"
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
@ -35,8 +36,9 @@ typedef struct {
} CORE_PARAM;
typedef struct {
uint32_t l;
// change for 64bit bswap
uint32_t r;
uint32_t l;
} CORE_DATA;
typedef struct {
@ -52,6 +54,8 @@ typedef struct {
uint32_t round;
uint32_t state;
MULTI2_SIMD_DATA *simd;
} MULTI2_PRIVATE_DATA;
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
@ -67,12 +71,14 @@ typedef struct {
static void release_multi2(void *m2);
static int add_ref_multi2(void *m2);
static int set_round_multi2(void *m2, int32_t val);
static int set_simd_multi2(void *m2, enum INSTRUCTION_TYPE);
static int set_system_key_multi2(void *m2, uint8_t *val);
static int set_init_cbc_multi2(void *m2, uint8_t *val);
static int set_scramble_key_multi2(void *m2, uint8_t *val);
static int clear_scramble_key_multi2(void *m2);
static int encrypt_multi2(void *m2, int32_t type, uint8_t *buf, int32_t size);
static int decrypt_multi2(void *m2, int32_t type, uint8_t *buf, intptr_t size);
static int decrypt_with_simd_multi2(void *m2, int32_t type, uint8_t *buf, intptr_t size);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
global function implementation
@ -97,10 +103,12 @@ MULTI2 *create_multi2(void)
prv->ref_count = 1;
prv->round = 4;
prv->simd = NULL;
r->release = release_multi2;
r->add_ref = add_ref_multi2;
r->set_round = set_round_multi2;
r->set_simd = set_simd_multi2;
r->set_system_key = set_system_key_multi2;
r->set_init_cbc = set_init_cbc_multi2;
r->set_scramble_key = set_scramble_key_multi2;
@ -126,6 +134,9 @@ static void core_pi2(CORE_DATA *dst, CORE_DATA *src, uint32_t a);
static void core_pi3(CORE_DATA *dst, CORE_DATA *src, uint32_t a, uint32_t b);
static void core_pi4(CORE_DATA *dst, CORE_DATA *src, uint32_t a);
static void alloc_data_for_simd(MULTI2_PRIVATE_DATA *prv);
static void release_data_for_simd(MULTI2_PRIVATE_DATA *prv);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
interface method implementation
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@ -141,6 +152,7 @@ static void release_multi2(void *m2)
prv->ref_count -= 1;
if(prv->ref_count == 0){
release_data_for_simd(prv);
free(prv);
}
}
@ -170,14 +182,57 @@ static int set_round_multi2(void *m2, int32_t val)
}
prv->round = val;
set_round_for_simd(val);
return 0;
}
/*
 * Select the SIMD implementation used by this MULTI2 instance.
 *
 * m2          : MULTI2 interface pointer
 * instruction : requested instruction set
 *
 * Returns 0 on success; the instance's decrypt entry then dispatches through
 * prv->simd->decrypt.  Returns MULTI2_ERROR_INVALID_PARAMETER when m2 cannot
 * be resolved, when initialize_multi2_simd() rejects the request, or when the
 * SIMD work area cannot be allocated.  In the latter two cases the instance
 * is left on the plain decrypt_multi2 path.
 */
static int set_simd_multi2(void *m2, enum INSTRUCTION_TYPE instruction)
{
	MULTI2_PRIVATE_DATA *prv;
	MULTI2 *r;

	prv = private_data(m2);
	if(prv == NULL){
		return MULTI2_ERROR_INVALID_PARAMETER;
	}

	/* Requested mode is already the active one: nothing to do if the work
	   area exists, or if no SIMD is wanted at all. */
	if( instruction == get_simd_instruction() ){
		if( (prv->simd != NULL) || (instruction == INSTRUCTION_NORMAL) ){
			return 0;
		}
	}

	r = (MULTI2 *)(prv+1);

	if( !initialize_multi2_simd(instruction, m2) ){
		r->decrypt = decrypt_multi2;
		release_data_for_simd(prv);
		return MULTI2_ERROR_INVALID_PARAMETER;
	}

	if(prv->simd == NULL){
		alloc_data_for_simd(prv);
		if(prv->simd == NULL){
			/* Allocation failed: keep the non-SIMD path so that decrypt
			   never dereferences a missing work area.  No dedicated
			   out-of-memory code is visible for MULTI2, so the generic
			   error is returned. */
			r->decrypt = decrypt_multi2;
			return MULTI2_ERROR_INVALID_PARAMETER;
		}
	}

	/* Dispatch on the mode reported after initialization rather than on the
	   raw request, as the original code did. */
	switch(get_simd_instruction()){
	case INSTRUCTION_AVX2:
		prv->simd->decrypt = decrypt_multi2_with_avx2;
		break;
	case INSTRUCTION_SSSE3:
		prv->simd->decrypt = decrypt_multi2_with_ssse3;
		break;
	case INSTRUCTION_SSE2:
		prv->simd->decrypt = decrypt_multi2_with_sse2;
		break;
	default:
		prv->simd->decrypt = decrypt_multi2_without_simd;
		break;
	}

	/* Install the SIMD entry point only once its dispatch target is set. */
	r->decrypt = decrypt_with_simd_multi2;

	return 0;
}
static int set_system_key_multi2(void *m2, uint8_t *val)
{
#ifndef USE_MULTI2_INTRINSIC
int i;
uint8_t *p;
#endif
MULTI2_PRIVATE_DATA *prv;
@ -186,10 +241,14 @@ static int set_system_key_multi2(void *m2, uint8_t *val)
return MULTI2_ERROR_INVALID_PARAMETER;
}
#ifdef USE_MULTI2_INTRINSIC
set_system_key_with_bswap((MULTI2_SIMD_SYS_KEY *)&(prv->sys), val);
#else
p = val;
for(i=0;i<8;i++){
p = load_be_uint32(prv->sys.key+i, p);
}
#endif
prv->state |= MULTI2_STATE_SYSTEM_KEY_SET;
@ -209,8 +268,12 @@ static int set_init_cbc_multi2(void *m2, uint8_t *val)
p = val;
#ifdef USE_MULTI2_INTRINSIC
set_data_key_with_bswap((MULTI2_SIMD_DATA_KEY *)&(prv->cbc_init), p);
#else
p = load_be_uint32(&(prv->cbc_init.l), p);
p = load_be_uint32(&(prv->cbc_init.r), p);
#endif
prv->state |= MULTI2_STATE_CBC_INIT_SET;
@ -222,6 +285,9 @@ static int set_scramble_key_multi2(void *m2, uint8_t *val)
uint8_t *p;
MULTI2_PRIVATE_DATA *prv;
#ifdef ENABLE_MULTI2_SIMD
MULTI2_SIMD_DATA *simd;
#endif
prv = private_data(m2);
if( (prv == NULL) || (val == NULL) ){
@ -230,14 +296,32 @@ static int set_scramble_key_multi2(void *m2, uint8_t *val)
p = val;
#ifdef USE_MULTI2_INTRINSIC
set_data_key_with_bswap((MULTI2_SIMD_DATA_KEY *)&(prv->scr[0]), p);
set_data_key_with_bswap((MULTI2_SIMD_DATA_KEY *)&(prv->scr[1]), p+8);
#else
p = load_be_uint32(&(prv->scr[0].l), p);
p = load_be_uint32(&(prv->scr[0].r), p);
p = load_be_uint32(&(prv->scr[1].l), p);
p = load_be_uint32(&(prv->scr[1].r), p);
#endif
core_schedule(prv->wrk+0, &(prv->sys), prv->scr+0);
core_schedule(prv->wrk+1, &(prv->sys), prv->scr+1);
#ifdef ENABLE_MULTI2_SIMD
simd = prv->simd;
if(simd != NULL){
if(get_simd_instruction() == INSTRUCTION_AVX2){
set_work_key_for_avx2(simd->wrk+0, (MULTI2_SIMD_SYS_KEY *)(prv->wrk+0));
set_work_key_for_avx2(simd->wrk+1, (MULTI2_SIMD_SYS_KEY *)(prv->wrk+1));
}else{
set_work_key_for_simd(simd->wrk+0, (MULTI2_SIMD_SYS_KEY *)(prv->wrk+0));
set_work_key_for_simd(simd->wrk+1, (MULTI2_SIMD_SYS_KEY *)(prv->wrk+1));
}
}
#endif
prv->state |= MULTI2_STATE_SCRAMBLE_KEY_SET;
return 0;
@ -390,6 +474,45 @@ static int decrypt_multi2(void *m2, int32_t type, uint8_t *buf, intptr_t size)
return 0;
}
/*
 * Decrypt entry point that dispatches to the routine stored in
 * prv->simd->decrypt.
 *
 * m2   : MULTI2 interface pointer
 * type : 0x02 selects work key slot 1, any other value selects slot 0
 * buf  : data to decrypt in place
 * size : number of bytes in buf (must be >= 1)
 *
 * Returns 0 on success, MULTI2_ERROR_INVALID_PARAMETER for bad arguments or
 * a missing SIMD work area, or the matching MULTI2_ERROR_UNSET_* code when
 * the CBC init value, system key or scramble key has not been set.
 */
static int decrypt_with_simd_multi2(void *m2, int32_t type, uint8_t *buf, intptr_t size)
{
	MULTI2_SIMD_DATA *simd;
	MULTI2_SIMD_SYS_KEY *prm;
	MULTI2_SIMD_WORK_KEY *pck_wrk_key;
	MULTI2_PRIVATE_DATA *prv;

	prv = private_data(m2);
	if( (prv == NULL) || (buf == NULL) || (size < 1) ){
		return MULTI2_ERROR_INVALID_PARAMETER;
	}

	if(prv->state != (MULTI2_STATE_CBC_INIT_SET|MULTI2_STATE_SYSTEM_KEY_SET|MULTI2_STATE_SCRAMBLE_KEY_SET)){
		if( (prv->state & MULTI2_STATE_CBC_INIT_SET) == 0 ){
			return MULTI2_ERROR_UNSET_CBC_INIT;
		}
		if( (prv->state & MULTI2_STATE_SYSTEM_KEY_SET) == 0 ){
			return MULTI2_ERROR_UNSET_SYSTEM_KEY;
		}
		if( (prv->state & MULTI2_STATE_SCRAMBLE_KEY_SET) == 0 ){
			return MULTI2_ERROR_UNSET_SCRAMBLE_KEY;
		}
	}

	/* prv->simd is NULL when the work area was never allocated or has been
	   released; refuse instead of dereferencing it. */
	simd = prv->simd;
	if(simd == NULL){
		return MULTI2_ERROR_INVALID_PARAMETER;
	}

	if(type == 0x02){
		prm = (MULTI2_SIMD_SYS_KEY *)(prv->wrk+1);
		pck_wrk_key = simd->wrk+1;
	}else{
		prm = (MULTI2_SIMD_SYS_KEY *)(prv->wrk+0);
		pck_wrk_key = simd->wrk+0;
	}

	simd->decrypt(buf, (uint32_t)size, prm, pck_wrk_key, (MULTI2_SIMD_DATA_KEY *)(&prv->cbc_init));

	return 0;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
private method implementation
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@ -525,3 +648,17 @@ static void core_pi4(CORE_DATA *dst, CORE_DATA *src, uint32_t a)
dst->l = src->l ^ t1;
dst->r = src->r;
}
/*
 * (Re)allocate the SIMD work area of a MULTI2 instance.
 *
 * Any existing area is released first.  The new area comes from
 * mem_aligned_alloc() and its contents are not initialized here.  If the
 * allocation fails prv->simd is left NULL, so callers must check it before
 * use.  Declared static to match the forward declaration.
 */
static void alloc_data_for_simd(MULTI2_PRIVATE_DATA *prv)
{
	release_data_for_simd(prv);
	prv->simd = (MULTI2_SIMD_DATA *)mem_aligned_alloc(sizeof(MULTI2_SIMD_DATA));
}
/*
 * Free the SIMD work area of a MULTI2 instance, if any, and reset the
 * pointer to NULL so that a second call is harmless.  Declared static to
 * match the forward declaration.
 */
static void release_data_for_simd(MULTI2_PRIVATE_DATA *prv)
{
	if(prv->simd == NULL){
		return;
	}
	mem_aligned_free(prv->simd);
	prv->simd = NULL;
}

View File

@ -2,6 +2,7 @@
#define MULTI2_H
#include "portable.h"
#include "simd_instruction_type.h"
typedef struct {
@ -11,6 +12,7 @@ typedef struct {
int (* add_ref)(void *m2);
int (* set_round)(void *m2, int32_t val);
int (* set_simd)(void *m2, enum INSTRUCTION_TYPE);
int (* set_system_key)(void *m2, uint8_t *val);
int (* set_init_cbc)(void *m2, uint8_t *val);

1879
aribb25/multi2_simd.c Normal file

File diff suppressed because it is too large Load Diff

136
aribb25/multi2_simd.h Normal file
View File

@ -0,0 +1,136 @@
#ifndef MULTI2_SIMD_H
#define MULTI2_SIMD_H
#include <stdint.h>
#include <stdbool.h>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "portable.h"
#include "simd_instruction_type.h"
#define USE_MULTI2_INTRINSIC // use intrinsic functions
#define ENABLE_MULTI2_SIMD // enable SIMD instructions
#ifdef ENABLE_MULTI2_SIMD
#define ENABLE_MULTI2_SSE2 // enable SSE2 instructions
#define ENABLE_MULTI2_SSSE3 // enable SSSE3 instructions
#ifdef ENABLE_MULTI2_SSSE3
#define ENABLE_MULTI2_AVX2 // enable AVX2 instructions
#endif
//#define USE_MULTI2_SIMD_ICC // use Intel C++ Compiler
#endif // ENABLE_MULTI2_SIMD
/* Work key storage handed to the SIMD decrypt routines (the packed_work_key
   argument). With ENABLE_MULTI2_AVX2 the union exposes both a 256-bit view
   (key256) and a 128-bit view (key) over the same storage; otherwise only
   the 128-bit array exists. */
#ifdef ENABLE_MULTI2_AVX2
typedef union {
__m256i key256[8];
__m128i key[8];
} MULTI2_SIMD_WORK_KEY;
#else
typedef struct {
__m128i key[8];
} MULTI2_SIMD_WORK_KEY;
#endif
/* 256-bit key block, accessible either as eight 32-bit words or as raw bytes. */
typedef struct {
union {
/* Previous condition, kept for reference: */
//#if !defined(USE_MULTI2_INTRINSIC) || !defined(_M_X64)
/* NOTE(review): the condition below contains both defined(_M_X64) and
   !defined(_M_X64), so it is always true and the #else branch (pair-swapped
   word order plus data64) is never compiled. Confirm this is intentional. */
#if defined(_M_X64) || !defined(USE_MULTI2_INTRINSIC) || !defined(_M_X64)
struct {
uint32_t key1, key2, key3, key4, key5, key6, key7, key8;
};
#else
struct {
uint32_t key2, key1, key4, key3, key6, key5, key8, key7;
};
uint64_t data64[4];
#endif
uint8_t data[32];
};
} MULTI2_SIMD_SYS_KEY /* system key(Sk), expanded key(Wk) 256bit */;
/* 64-bit data block, accessible as two 32-bit halves, one 64-bit value, or
   raw bytes. Note that 'right' is stored before 'left'.
   NOTE(review): presumably this order lets a single 64-bit byte swap of the
   input produce both halves (cf. set_data_key_with_bswap) - confirm. */
typedef struct {
union {
struct {
uint32_t right, left;
};
uint64_t data64;
uint8_t data[8];
};
} MULTI2_SIMD_DATA_KEY /* data key(Dk) 64bit */;
/* Per-instance SIMD state: the two packed work keys and the decrypt routine
   selected for the active instruction set. */
typedef struct {
MULTI2_SIMD_WORK_KEY wrk[2]; /* 0: odd, 1: even */
/* Decrypt routine; receives the buffer and its size, the work key, the
   packed work key from wrk[], and the CBC initial value. */
void (* decrypt)(uint8_t * __restrict data, const uint32_t size,
const MULTI2_SIMD_SYS_KEY * __restrict work_key,
const MULTI2_SIMD_WORK_KEY * __restrict packed_work_key,
const MULTI2_SIMD_DATA_KEY * __restrict cbc_init);
} MULTI2_SIMD_DATA /* data set for SIMD */;
#ifdef __cplusplus
extern "C" {
#endif
/* Capability queries and instruction-set selection.
   Parameterless functions are declared with (void): in C an empty parameter
   list leaves the arguments unchecked, whereas (void) is a real prototype.
   In C++ both spellings mean the same thing. */
extern bool is_simd_enabled(void);
extern bool is_sse2_available(void);
extern bool is_ssse3_available(void);
extern bool is_avx2_available(void);
extern bool initialize_multi2_simd(enum INSTRUCTION_TYPE instruction, void* m2);
extern void set_simd_instruction(enum INSTRUCTION_TYPE instruction);
extern enum INSTRUCTION_TYPE get_simd_instruction(void);
extern enum INSTRUCTION_TYPE get_supported_simd_instruction(void);
extern void alloc_work_key_for_simd(MULTI2_SIMD_WORK_KEY **work_key_odd, MULTI2_SIMD_WORK_KEY **work_key_even);
extern void free_work_key_for_simd(MULTI2_SIMD_WORK_KEY **work_key_odd, MULTI2_SIMD_WORK_KEY **work_key_even);
extern void set_work_key_for_simd(MULTI2_SIMD_WORK_KEY *work_key, const MULTI2_SIMD_SYS_KEY *src_key);
extern void set_work_key_for_avx2(MULTI2_SIMD_WORK_KEY *work_key, const MULTI2_SIMD_SYS_KEY *src_key);
extern void set_round_for_simd(const uint32_t round);
extern void set_system_key_with_bswap(MULTI2_SIMD_SYS_KEY *sys_key, const uint8_t *hex_data);
extern void get_system_key_with_bswap(const MULTI2_SIMD_SYS_KEY *sys_key, uint8_t *hex_data);
extern void set_data_key_with_bswap(MULTI2_SIMD_DATA_KEY *data_key, const uint8_t *hex_data);
extern void get_data_key_with_bswap(const MULTI2_SIMD_DATA_KEY *data_key, uint8_t *hex_data);
extern void decrypt_multi2_without_simd(uint8_t * __restrict data, const uint32_t size,
const MULTI2_SIMD_SYS_KEY * __restrict work_key,
const MULTI2_SIMD_WORK_KEY * __restrict packed_work_key,
const MULTI2_SIMD_DATA_KEY * __restrict cbc_init);
#ifdef ENABLE_MULTI2_SSE2
extern void decrypt_multi2_with_sse2(uint8_t * __restrict data, const uint32_t size,
const MULTI2_SIMD_SYS_KEY * __restrict work_key,
const MULTI2_SIMD_WORK_KEY * __restrict packed_work_key,
const MULTI2_SIMD_DATA_KEY * __restrict cbc_init);
#endif
#ifdef ENABLE_MULTI2_SSSE3
extern void decrypt_multi2_with_ssse3(uint8_t * __restrict data, const uint32_t size,
const MULTI2_SIMD_SYS_KEY * __restrict work_key,
const MULTI2_SIMD_WORK_KEY * __restrict packed_work_key,
const MULTI2_SIMD_DATA_KEY * __restrict cbc_init);
#endif
#ifdef ENABLE_MULTI2_AVX2
extern void decrypt_multi2_with_avx2(uint8_t * __restrict data, const uint32_t size,
const MULTI2_SIMD_SYS_KEY * __restrict work_key,
const MULTI2_SIMD_WORK_KEY * __restrict packed_work_key,
const MULTI2_SIMD_DATA_KEY * __restrict cbc_init);
#endif
#ifdef __cplusplus
}
#endif
#endif /* MULTI2_SIMD_H */

View File

@ -20,6 +20,76 @@
#define _S_IREAD (S_IRUSR|S_IRGRP|S_IROTH)
#define _S_IWRITE (S_IWUSR|S_IWGRP|S_IWOTH)
#ifndef __forceinline
#define __forceinline __attribute__((always_inline))
#endif
#ifndef __restrict
#define __restrict __restrict__
#endif
#ifdef __i386__
#define _M_IX86 __i386__
#endif
#ifdef __x86_64__
#define _M_X64 __x86_64__
#define _M_AMD64 __x86_64__
#endif
#if defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _byteswap_ulong(x) OSSwapInt32(x)
#define _byteswap_uint64(x) OSSwapInt64(x)
#elif defined(__sun) || defined(sun)
#include <sys/byteorder.h>
#define _byteswap_ulong(x) BSWAP_32(x)
#define _byteswap_uint64(x) BSWAP_64(x)
#elif defined(__FreeBSD__)
#include <sys/endian.h>
#define _byteswap_ulong(x) bswap32(x)
#define _byteswap_uint64(x) bswap64(x)
#elif defined(__OpenBSD__)
#include <sys/types.h>
#define _byteswap_ulong(x) swap32(x)
#define _byteswap_uint64(x) swap64(x)
#elif defined(__NetBSD__)
#include <sys/types.h>
#include <machine/bswap.h>
#if defined(__BSWAP_RENAME) && !defined(_byteswap_ulong)
#define _byteswap_ulong(x) bswap32(x)
#define _byteswap_uint64(x) bswap64(x)
#endif
#else
#include <byteswap.h>
#define _byteswap_ulong(x) bswap_32(x)
#define _byteswap_uint64(x) bswap_64(x)
#endif /* defined(__APPLE__) */
/* 32-byte aligned allocation for non-Windows builds.
   C11 aligned_alloc() takes (alignment, size), and the size should be a
   multiple of the alignment, so the request is rounded up to the next
   multiple of 32. Memory is released with free(). */
#define mem_aligned_alloc(s) aligned_alloc(32, (((size_t)(s)) + 31u) & ~(size_t)31u)
#define mem_aligned_free free
#define ALIGNAS(s) __attribute__((aligned(s)))
#else /* !defined(_WIN32) */
#define mem_aligned_alloc(s) _aligned_malloc(s, 32)
#define mem_aligned_free _aligned_free
#define ALIGNAS(s) __declspec(align(s))
#endif /* !defined(_WIN32) */
#endif /* PORTABLE_H */

View File

@ -0,0 +1,12 @@
#ifndef SIMD_INSTRUCTION_TYPE_H
#define SIMD_INSTRUCTION_TYPE_H
/* Instruction sets selectable for MULTI2 decoding.
   The values cross the public API as plain int32_t (set_simd_mode), so each
   enumerator's number is written out explicitly. */
enum INSTRUCTION_TYPE
{
	INSTRUCTION_NORMAL = 0,	/* no SIMD extension */
	INSTRUCTION_SSE2   = 1,
	INSTRUCTION_SSSE3  = 2,
	INSTRUCTION_AVX2   = 3
};
#endif /* SIMD_INSTRUCTION_TYPE_H */