\doxysection{conv\+\_\+acc\+\_\+neon\+\_\+impl.\+h} \label{conv__acc__neon__impl_8h_source}\index{src/core/conv\_acc\_neon\_impl.h@{src/core/conv\_acc\_neon\_impl.h}} \textbf{ Go to the documentation of this file.} \begin{DoxyCode}{0} \DoxyCodeLine{1 } \DoxyCodeLine{4 \textcolor{comment}{/*}} \DoxyCodeLine{5 \textcolor{comment}{ * (C) 2020 by sysmocom -\/ s.f.m.c. GmbH}} \DoxyCodeLine{6 \textcolor{comment}{ * Author: Eric Wild}} \DoxyCodeLine{7 \textcolor{comment}{ *}} \DoxyCodeLine{8 \textcolor{comment}{ * All Rights Reserved}} \DoxyCodeLine{9 \textcolor{comment}{ *}} \DoxyCodeLine{10 \textcolor{comment}{ * SPDX-\/License-\/Identifier: GPL-\/2.0+}} \DoxyCodeLine{11 \textcolor{comment}{ *}} \DoxyCodeLine{12 \textcolor{comment}{ * This program is free software; you can redistribute it and/or modify}} \DoxyCodeLine{13 \textcolor{comment}{ * it under the terms of the GNU General Public License as published by}} \DoxyCodeLine{14 \textcolor{comment}{ * the Free Software Foundation; either version 2 of the License, or}} \DoxyCodeLine{15 \textcolor{comment}{ * (at your option) any later version.}} \DoxyCodeLine{16 \textcolor{comment}{ *}} \DoxyCodeLine{17 \textcolor{comment}{ * This program is distributed in the hope that it will be useful,}} \DoxyCodeLine{18 \textcolor{comment}{ * but WITHOUT ANY WARRANTY; without even the implied warranty of}} \DoxyCodeLine{19 \textcolor{comment}{ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the}} \DoxyCodeLine{20 \textcolor{comment}{ * GNU General Public License for more details.}} \DoxyCodeLine{21 \textcolor{comment}{ */}} \DoxyCodeLine{22 } \DoxyCodeLine{23 \textcolor{comment}{/* Some distributions (notably Alpine Linux) for some strange reason}} \DoxyCodeLine{24 \textcolor{comment}{ * don't have this \#define */}} \DoxyCodeLine{25 \textcolor{preprocessor}{\#ifndef \_\_always\_inline}} \DoxyCodeLine{26 \textcolor{preprocessor}{\#define \_\_always\_inline inline \_\_attribute\_\_((always\_inline))}} \DoxyCodeLine{27 \textcolor{preprocessor}{\#endif}} \DoxyCodeLine{28 } \DoxyCodeLine{29 \textcolor{preprocessor}{\#define NEON\_BUTTERFLY(M0,M1,M2,M3,M4) \(\backslash\)}} \DoxyCodeLine{30 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{31 \textcolor{preprocessor}{ M3 = vqaddq\_s16(M0, M2); \(\backslash\)}} \DoxyCodeLine{32 \textcolor{preprocessor}{ M4 = vqsubq\_s16(M1, M2); \(\backslash\)}} \DoxyCodeLine{33 \textcolor{preprocessor}{ M0 = vqsubq\_s16(M0, M2); \(\backslash\)}} \DoxyCodeLine{34 \textcolor{preprocessor}{ M1 = vqaddq\_s16(M1, M2); \(\backslash\)}} \DoxyCodeLine{35 \textcolor{preprocessor}{ M2 = vmaxq\_s16(M3, M4); \(\backslash\)}} \DoxyCodeLine{36 \textcolor{preprocessor}{ M3 = vreinterpretq\_s16\_u16(vcgtq\_s16(M3, M4)); \(\backslash\)}} \DoxyCodeLine{37 \textcolor{preprocessor}{ M4 = vmaxq\_s16(M0, M1); \(\backslash\)}} \DoxyCodeLine{38 \textcolor{preprocessor}{ M1 = vreinterpretq\_s16\_u16(vcgtq\_s16(M0, M1)); \(\backslash\)}} \DoxyCodeLine{39 \textcolor{preprocessor}{\}}} \DoxyCodeLine{40 } \DoxyCodeLine{41 \textcolor{preprocessor}{\#define NEON\_DEINTERLEAVE\_K5(M0,M1,M2,M3) \(\backslash\)}} \DoxyCodeLine{42 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{43 \textcolor{preprocessor}{ int16x8x2\_t tmp; \(\backslash\)}} \DoxyCodeLine{44 \textcolor{preprocessor}{ tmp = vuzpq\_s16(M0, M1); \(\backslash\)}} \DoxyCodeLine{45 \textcolor{preprocessor}{ M2 = tmp.val[0]; \(\backslash\)}} \DoxyCodeLine{46 
\textcolor{preprocessor}{ M3 = tmp.val[1]; \(\backslash\)}} \DoxyCodeLine{47 \textcolor{preprocessor}{\}}} \DoxyCodeLine{48 } \DoxyCodeLine{49 \textcolor{preprocessor}{\#define NEON\_DEINTERLEAVE\_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \(\backslash\)}} \DoxyCodeLine{50 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{51 \textcolor{preprocessor}{ int16x8x2\_t tmp; \(\backslash\)}} \DoxyCodeLine{52 \textcolor{preprocessor}{ tmp = vuzpq\_s16(M0, M1); \(\backslash\)}} \DoxyCodeLine{53 \textcolor{preprocessor}{ M8 = tmp.val[0]; M9 = tmp.val[1]; \(\backslash\)}} \DoxyCodeLine{54 \textcolor{preprocessor}{ tmp = vuzpq\_s16(M2, M3); \(\backslash\)}} \DoxyCodeLine{55 \textcolor{preprocessor}{ M10 = tmp.val[0]; M11 = tmp.val[1]; \(\backslash\)}} \DoxyCodeLine{56 \textcolor{preprocessor}{ tmp = vuzpq\_s16(M4, M5); \(\backslash\)}} \DoxyCodeLine{57 \textcolor{preprocessor}{ M12 = tmp.val[0]; M13 = tmp.val[1]; \(\backslash\)}} \DoxyCodeLine{58 \textcolor{preprocessor}{ tmp = vuzpq\_s16(M6, M7); \(\backslash\)}} \DoxyCodeLine{59 \textcolor{preprocessor}{ M14 = tmp.val[0]; M15 = tmp.val[1]; \(\backslash\)}} \DoxyCodeLine{60 \textcolor{preprocessor}{\}}} \DoxyCodeLine{61 } \DoxyCodeLine{62 \textcolor{preprocessor}{\#define NEON\_BRANCH\_METRIC\_N2(M0,M1,M2,M3,M4,M6,M7) \(\backslash\)}} \DoxyCodeLine{63 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{64 \textcolor{preprocessor}{ M0 = vmulq\_s16(M4, M0); \(\backslash\)}} \DoxyCodeLine{65 \textcolor{preprocessor}{ M1 = vmulq\_s16(M4, M1); \(\backslash\)}} \DoxyCodeLine{66 \textcolor{preprocessor}{ M2 = vmulq\_s16(M4, M2); \(\backslash\)}} \DoxyCodeLine{67 \textcolor{preprocessor}{ M3 = vmulq\_s16(M4, M3); \(\backslash\)}} \DoxyCodeLine{68 \textcolor{preprocessor}{ M6 = vcombine\_s16(vpadd\_s16(vget\_low\_s16(M0), vget\_high\_s16(M0)), vpadd\_s16(vget\_low\_s16(M1), vget\_high\_s16(M1))); \(\backslash\)}} \DoxyCodeLine{69 \textcolor{preprocessor}{ M7 = vcombine\_s16(vpadd\_s16(vget\_low\_s16(M2), 
vget\_high\_s16(M2)), vpadd\_s16(vget\_low\_s16(M3), vget\_high\_s16(M3))); \(\backslash\)}} \DoxyCodeLine{70 \textcolor{preprocessor}{\}}} \DoxyCodeLine{71 } \DoxyCodeLine{72 \textcolor{preprocessor}{\#define NEON\_BRANCH\_METRIC\_N4(M0,M1,M2,M3,M4,M5) \(\backslash\)}} \DoxyCodeLine{73 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{74 \textcolor{preprocessor}{ M0 = vmulq\_s16(M4, M0); \(\backslash\)}} \DoxyCodeLine{75 \textcolor{preprocessor}{ M1 = vmulq\_s16(M4, M1); \(\backslash\)}} \DoxyCodeLine{76 \textcolor{preprocessor}{ M2 = vmulq\_s16(M4, M2); \(\backslash\)}} \DoxyCodeLine{77 \textcolor{preprocessor}{ M3 = vmulq\_s16(M4, M3); \(\backslash\)}} \DoxyCodeLine{78 \textcolor{preprocessor}{ int16x4\_t t1 = vpadd\_s16(vpadd\_s16(vget\_low\_s16(M0), vget\_high\_s16(M0)), vpadd\_s16(vget\_low\_s16(M1), vget\_high\_s16(M1))); \(\backslash\)}} \DoxyCodeLine{79 \textcolor{preprocessor}{ int16x4\_t t2 = vpadd\_s16(vpadd\_s16(vget\_low\_s16(M2), vget\_high\_s16(M2)), vpadd\_s16(vget\_low\_s16(M3), vget\_high\_s16(M3))); \(\backslash\)}} \DoxyCodeLine{80 \textcolor{preprocessor}{ M5 = vcombine\_s16(t1, t2); \(\backslash\)}} \DoxyCodeLine{81 \textcolor{preprocessor}{\}}} \DoxyCodeLine{82 } \DoxyCodeLine{83 \textcolor{preprocessor}{\#define NEON\_NORMALIZE\_K5(M0,M1,M2,M3) \(\backslash\)}} \DoxyCodeLine{84 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{85 \textcolor{preprocessor}{ M2 = vminq\_s16(M0, M1); \(\backslash\)}} \DoxyCodeLine{86 \textcolor{preprocessor}{ int16x4\_t t = vpmin\_s16(vget\_low\_s16(M2), vget\_high\_s16(M2)); \(\backslash\)}} \DoxyCodeLine{87 \textcolor{preprocessor}{ t = vpmin\_s16(t, t); \(\backslash\)}} \DoxyCodeLine{88 \textcolor{preprocessor}{ t = vpmin\_s16(t, t); \(\backslash\)}} \DoxyCodeLine{89 \textcolor{preprocessor}{ M2 = vdupq\_lane\_s16(t, 0); \(\backslash\)}} \DoxyCodeLine{90 \textcolor{preprocessor}{ M0 = vqsubq\_s16(M0, M2); \(\backslash\)}} \DoxyCodeLine{91 \textcolor{preprocessor}{ M1 = vqsubq\_s16(M1, M2); 
\(\backslash\)}} \DoxyCodeLine{92 \textcolor{preprocessor}{\}}} \DoxyCodeLine{93 } \DoxyCodeLine{94 \textcolor{preprocessor}{\#define NEON\_NORMALIZE\_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \(\backslash\)}} \DoxyCodeLine{95 \textcolor{preprocessor}{\{ \(\backslash\)}} \DoxyCodeLine{96 \textcolor{preprocessor}{ M8 = vminq\_s16(M0, M1); \(\backslash\)}} \DoxyCodeLine{97 \textcolor{preprocessor}{ M9 = vminq\_s16(M2, M3); \(\backslash\)}} \DoxyCodeLine{98 \textcolor{preprocessor}{ M10 = vminq\_s16(M4, M5); \(\backslash\)}} \DoxyCodeLine{99 \textcolor{preprocessor}{ M11 = vminq\_s16(M6, M7); \(\backslash\)}} \DoxyCodeLine{100 \textcolor{preprocessor}{ M8 = vminq\_s16(M8, M9); \(\backslash\)}} \DoxyCodeLine{101 \textcolor{preprocessor}{ M10 = vminq\_s16(M10, M11); \(\backslash\)}} \DoxyCodeLine{102 \textcolor{preprocessor}{ M8 = vminq\_s16(M8, M10); \(\backslash\)}} \DoxyCodeLine{103 \textcolor{preprocessor}{ int16x4\_t t = vpmin\_s16(vget\_low\_s16(M8), vget\_high\_s16(M8)); \(\backslash\)}} \DoxyCodeLine{104 \textcolor{preprocessor}{ t = vpmin\_s16(t, t); \(\backslash\)}} \DoxyCodeLine{105 \textcolor{preprocessor}{ t = vpmin\_s16(t, t); \(\backslash\)}} \DoxyCodeLine{106 \textcolor{preprocessor}{ M8 = vdupq\_lane\_s16(t, 0); \(\backslash\)}} \DoxyCodeLine{107 \textcolor{preprocessor}{ M0 = vqsubq\_s16(M0, M8); \(\backslash\)}} \DoxyCodeLine{108 \textcolor{preprocessor}{ M1 = vqsubq\_s16(M1, M8); \(\backslash\)}} \DoxyCodeLine{109 \textcolor{preprocessor}{ M2 = vqsubq\_s16(M2, M8); \(\backslash\)}} \DoxyCodeLine{110 \textcolor{preprocessor}{ M3 = vqsubq\_s16(M3, M8); \(\backslash\)}} \DoxyCodeLine{111 \textcolor{preprocessor}{ M4 = vqsubq\_s16(M4, M8); \(\backslash\)}} \DoxyCodeLine{112 \textcolor{preprocessor}{ M5 = vqsubq\_s16(M5, M8); \(\backslash\)}} \DoxyCodeLine{113 \textcolor{preprocessor}{ M6 = vqsubq\_s16(M6, M8); \(\backslash\)}} \DoxyCodeLine{114 \textcolor{preprocessor}{ M7 = vqsubq\_s16(M7, M8); \(\backslash\)}} \DoxyCodeLine{115 
\textcolor{preprocessor}{\}}} \DoxyCodeLine{116 } \DoxyCodeLine{117 \_\_always\_inline \textcolor{keywordtype}{void} \_neon\_metrics\_k5\_n2(\textcolor{keyword}{const} int16\_t *val, \textcolor{keyword}{const} int16\_t *outa, int16\_t *sumsa, int16\_t *paths,} \DoxyCodeLine{118 \textcolor{keywordtype}{int} norm)} \DoxyCodeLine{119 \{} \DoxyCodeLine{120 int16\_t *\_\_restrict out = \_\_builtin\_assume\_aligned(outa, 8);} \DoxyCodeLine{121 int16\_t *\_\_restrict sums = \_\_builtin\_assume\_aligned(sumsa, 8);} \DoxyCodeLine{122 int16x8\_t m0, m1, m2, m3, m4, m5, m6;} \DoxyCodeLine{123 int16x4\_t input;} \DoxyCodeLine{124 } \DoxyCodeLine{125 \textcolor{comment}{/* (BMU) Load four 16-\/bit input values and duplicate them into both halves of the vector */}} \DoxyCodeLine{126 input = vld1\_s16(val);} \DoxyCodeLine{127 m2 = vcombine\_s16(input, input);} \DoxyCodeLine{128 } \DoxyCodeLine{129 \textcolor{comment}{/* (BMU) Load and compute branch metrics */}} \DoxyCodeLine{130 m0 = vld1q\_s16(\&out[0]);} \DoxyCodeLine{131 m1 = vld1q\_s16(\&out[8]);} \DoxyCodeLine{132 } \DoxyCodeLine{133 m0 = vmulq\_s16(m2, m0);} \DoxyCodeLine{134 m1 = vmulq\_s16(m2, m1);} \DoxyCodeLine{135 m2 = vcombine\_s16(vpadd\_s16(vget\_low\_s16(m0), vget\_high\_s16(m0)),} \DoxyCodeLine{136 vpadd\_s16(vget\_low\_s16(m1), vget\_high\_s16(m1)));} \DoxyCodeLine{137 } \DoxyCodeLine{138 \textcolor{comment}{/* (PMU) Load accumulated path metrics */}} \DoxyCodeLine{139 m0 = vld1q\_s16(\&sums[0]);} \DoxyCodeLine{140 m1 = vld1q\_s16(\&sums[8]);} \DoxyCodeLine{141 } \DoxyCodeLine{142 NEON\_DEINTERLEAVE\_K5(m0, m1, m3, m4)} \DoxyCodeLine{143 } \DoxyCodeLine{144 \textcolor{comment}{/* (PMU) Butterflies: 0-\/7 */}} \DoxyCodeLine{145 NEON\_BUTTERFLY(m3, m4, m2, m5, m6)} \DoxyCodeLine{146 } \DoxyCodeLine{147 \textcolor{keywordflow}{if} (norm)} \DoxyCodeLine{148 NEON\_NORMALIZE\_K5(m2, m6, m0, m1)} \DoxyCodeLine{149 } \DoxyCodeLine{150 vst1q\_s16(\&sums[0], m2);} \DoxyCodeLine{151 vst1q\_s16(\&sums[8], m6);} \DoxyCodeLine{152 vst1q\_s16(\&paths[0], m5);}
\DoxyCodeLine{153 vst1q\_s16(\&paths[8], m4);} \DoxyCodeLine{154 \}} \DoxyCodeLine{155 } \DoxyCodeLine{156 \_\_always\_inline \textcolor{keywordtype}{void} \_neon\_metrics\_k5\_n4(\textcolor{keyword}{const} int16\_t *val, \textcolor{keyword}{const} int16\_t *outa, int16\_t *sumsa, int16\_t *paths,} \DoxyCodeLine{157 \textcolor{keywordtype}{int} norm)} \DoxyCodeLine{158 \{} \DoxyCodeLine{159 int16\_t *\_\_restrict out = \_\_builtin\_assume\_aligned(outa, 8);} \DoxyCodeLine{160 int16\_t *\_\_restrict sums = \_\_builtin\_assume\_aligned(sumsa, 8);} \DoxyCodeLine{161 int16x8\_t m0, m1, m2, m3, m4, m5, m6;} \DoxyCodeLine{162 int16x4\_t input;} \DoxyCodeLine{163 } \DoxyCodeLine{164 \textcolor{comment}{/* (BMU) Load four 16-\/bit input values and duplicate them into both halves of the vector */}} \DoxyCodeLine{165 input = vld1\_s16(val);} \DoxyCodeLine{166 m4 = vcombine\_s16(input, input);} \DoxyCodeLine{167 } \DoxyCodeLine{168 \textcolor{comment}{/* (BMU) Load and compute branch metrics */}} \DoxyCodeLine{169 m0 = vld1q\_s16(\&out[0]);} \DoxyCodeLine{170 m1 = vld1q\_s16(\&out[8]);} \DoxyCodeLine{171 m2 = vld1q\_s16(\&out[16]);} \DoxyCodeLine{172 m3 = vld1q\_s16(\&out[24]);} \DoxyCodeLine{173 } \DoxyCodeLine{174 NEON\_BRANCH\_METRIC\_N4(m0, m1, m2, m3, m4, m2)} \DoxyCodeLine{175 } \DoxyCodeLine{176 \textcolor{comment}{/* (PMU) Load accumulated path metrics */}} \DoxyCodeLine{177 m0 = vld1q\_s16(\&sums[0]);} \DoxyCodeLine{178 m1 = vld1q\_s16(\&sums[8]);} \DoxyCodeLine{179 } \DoxyCodeLine{180 NEON\_DEINTERLEAVE\_K5(m0, m1, m3, m4)} \DoxyCodeLine{181 } \DoxyCodeLine{182 \textcolor{comment}{/* (PMU) Butterflies: 0-\/7 */}} \DoxyCodeLine{183 NEON\_BUTTERFLY(m3, m4, m2, m5, m6)} \DoxyCodeLine{184 } \DoxyCodeLine{185 \textcolor{keywordflow}{if} (norm)} \DoxyCodeLine{186 NEON\_NORMALIZE\_K5(m2, m6, m0, m1)} \DoxyCodeLine{187 } \DoxyCodeLine{188 vst1q\_s16(\&sums[0], m2);} \DoxyCodeLine{189 vst1q\_s16(\&sums[8], m6);} \DoxyCodeLine{190 vst1q\_s16(\&paths[0], m5);} \DoxyCodeLine{191 vst1q\_s16(\&paths[8], m4);}
\DoxyCodeLine{192 \}} \DoxyCodeLine{193 } \DoxyCodeLine{194 \_\_always\_inline \textcolor{keyword}{static} \textcolor{keywordtype}{void} \_neon\_metrics\_k7\_n2(\textcolor{keyword}{const} int16\_t *val, \textcolor{keyword}{const} int16\_t *outa, int16\_t *sumsa, int16\_t *paths,} \DoxyCodeLine{195 \textcolor{keywordtype}{int} norm)} \DoxyCodeLine{196 \{} \DoxyCodeLine{197 int16\_t *\_\_restrict out = \_\_builtin\_assume\_aligned(outa, 8);} \DoxyCodeLine{198 int16\_t *\_\_restrict sums = \_\_builtin\_assume\_aligned(sumsa, 8);} \DoxyCodeLine{199 int16x8\_t m0, m1, m2, m3, m4, m5, m6, m7;} \DoxyCodeLine{200 int16x8\_t m8, m9, m10, m11, m12, m13, m14, m15;} \DoxyCodeLine{201 int16x4\_t input;} \DoxyCodeLine{202 } \DoxyCodeLine{203 \textcolor{comment}{/* (PMU) Load accumulated path metrics */}} \DoxyCodeLine{204 m0 = vld1q\_s16(\&sums[0]);} \DoxyCodeLine{205 m1 = vld1q\_s16(\&sums[8]);} \DoxyCodeLine{206 m2 = vld1q\_s16(\&sums[16]);} \DoxyCodeLine{207 m3 = vld1q\_s16(\&sums[24]);} \DoxyCodeLine{208 m4 = vld1q\_s16(\&sums[32]);} \DoxyCodeLine{209 m5 = vld1q\_s16(\&sums[40]);} \DoxyCodeLine{210 m6 = vld1q\_s16(\&sums[48]);} \DoxyCodeLine{211 m7 = vld1q\_s16(\&sums[56]);} \DoxyCodeLine{212 } \DoxyCodeLine{213 \textcolor{comment}{/* (PMU) Deinterleave into even and odd packed registers */}} \DoxyCodeLine{214 NEON\_DEINTERLEAVE\_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)} \DoxyCodeLine{215 } \DoxyCodeLine{216 \textcolor{comment}{/* (BMU) Load four 16-\/bit input values and duplicate them into both halves of the vector */}} \DoxyCodeLine{217 input = vld1\_s16(val);} \DoxyCodeLine{218 m7 = vcombine\_s16(input, input);} \DoxyCodeLine{219 } \DoxyCodeLine{220 \textcolor{comment}{/* (BMU) Load and compute branch metrics */}} \DoxyCodeLine{221 m0 = vld1q\_s16(\&out[0]);} \DoxyCodeLine{222 m1 = vld1q\_s16(\&out[8]);} \DoxyCodeLine{223 m2 = vld1q\_s16(\&out[16]);} \DoxyCodeLine{224 m3 = vld1q\_s16(\&out[24]);} \DoxyCodeLine{225 } \DoxyCodeLine{226 NEON\_BRANCH\_METRIC\_N2(m0, m1, m2, m3,
m7, m4, m5)} \DoxyCodeLine{227 } \DoxyCodeLine{228 m0 = vld1q\_s16(\&out[32]);} \DoxyCodeLine{229 m1 = vld1q\_s16(\&out[40]);} \DoxyCodeLine{230 m2 = vld1q\_s16(\&out[48]);} \DoxyCodeLine{231 m3 = vld1q\_s16(\&out[56]);} \DoxyCodeLine{232 } \DoxyCodeLine{233 NEON\_BRANCH\_METRIC\_N2(m0, m1, m2, m3, m7, m6, m7)} \DoxyCodeLine{234 } \DoxyCodeLine{235 \textcolor{comment}{/* (PMU) Butterflies: 0-\/15 */}} \DoxyCodeLine{236 NEON\_BUTTERFLY(m8, m9, m4, m0, m1)} \DoxyCodeLine{237 NEON\_BUTTERFLY(m10, m11, m5, m2, m3)} \DoxyCodeLine{238 } \DoxyCodeLine{239 vst1q\_s16(\&paths[0], m0);} \DoxyCodeLine{240 vst1q\_s16(\&paths[8], m2);} \DoxyCodeLine{241 vst1q\_s16(\&paths[32], m9);} \DoxyCodeLine{242 vst1q\_s16(\&paths[40], m11);} \DoxyCodeLine{243 } \DoxyCodeLine{244 \textcolor{comment}{/* (PMU) Butterflies: 16-\/31 */}} \DoxyCodeLine{245 NEON\_BUTTERFLY(m12, m13, m6, m0, m2)} \DoxyCodeLine{246 NEON\_BUTTERFLY(m14, m15, m7, m9, m11)} \DoxyCodeLine{247 } \DoxyCodeLine{248 vst1q\_s16(\&paths[16], m0);} \DoxyCodeLine{249 vst1q\_s16(\&paths[24], m9);} \DoxyCodeLine{250 vst1q\_s16(\&paths[48], m13);} \DoxyCodeLine{251 vst1q\_s16(\&paths[56], m15);} \DoxyCodeLine{252 } \DoxyCodeLine{253 \textcolor{keywordflow}{if} (norm)} \DoxyCodeLine{254 NEON\_NORMALIZE\_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)} \DoxyCodeLine{255 } \DoxyCodeLine{256 vst1q\_s16(\&sums[0], m4);} \DoxyCodeLine{257 vst1q\_s16(\&sums[8], m5);} \DoxyCodeLine{258 vst1q\_s16(\&sums[16], m6);} \DoxyCodeLine{259 vst1q\_s16(\&sums[24], m7);} \DoxyCodeLine{260 vst1q\_s16(\&sums[32], m1);} \DoxyCodeLine{261 vst1q\_s16(\&sums[40], m3);} \DoxyCodeLine{262 vst1q\_s16(\&sums[48], m2);} \DoxyCodeLine{263 vst1q\_s16(\&sums[56], m11);} \DoxyCodeLine{264 \}} \DoxyCodeLine{265 } \DoxyCodeLine{266 \_\_always\_inline \textcolor{keyword}{static} \textcolor{keywordtype}{void} \_neon\_metrics\_k7\_n4(\textcolor{keyword}{const} int16\_t *val, \textcolor{keyword}{const} int16\_t *outa, int16\_t *sumsa, int16\_t *paths,}
\DoxyCodeLine{267 \textcolor{keywordtype}{int} norm)} \DoxyCodeLine{268 \{} \DoxyCodeLine{269 int16\_t *\_\_restrict out = \_\_builtin\_assume\_aligned(outa, 8);} \DoxyCodeLine{270 int16\_t *\_\_restrict sums = \_\_builtin\_assume\_aligned(sumsa, 8);} \DoxyCodeLine{271 int16x8\_t m0, m1, m2, m3, m4, m5, m6, m7;} \DoxyCodeLine{272 int16x8\_t m8, m9, m10, m11, m12, m13, m14, m15;} \DoxyCodeLine{273 int16x4\_t input;} \DoxyCodeLine{274 } \DoxyCodeLine{275 \textcolor{comment}{/* (PMU) Load accumulated path metrics */}} \DoxyCodeLine{276 m0 = vld1q\_s16(\&sums[0]);} \DoxyCodeLine{277 m1 = vld1q\_s16(\&sums[8]);} \DoxyCodeLine{278 m2 = vld1q\_s16(\&sums[16]);} \DoxyCodeLine{279 m3 = vld1q\_s16(\&sums[24]);} \DoxyCodeLine{280 m4 = vld1q\_s16(\&sums[32]);} \DoxyCodeLine{281 m5 = vld1q\_s16(\&sums[40]);} \DoxyCodeLine{282 m6 = vld1q\_s16(\&sums[48]);} \DoxyCodeLine{283 m7 = vld1q\_s16(\&sums[56]);} \DoxyCodeLine{284 } \DoxyCodeLine{285 \textcolor{comment}{/* (PMU) Deinterleave into even and odd packed registers */}} \DoxyCodeLine{286 NEON\_DEINTERLEAVE\_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)} \DoxyCodeLine{287 } \DoxyCodeLine{288 \textcolor{comment}{/* (BMU) Load four 16-\/bit input values and duplicate them into both halves of the vector */}} \DoxyCodeLine{289 input = vld1\_s16(val);} \DoxyCodeLine{290 m7 = vcombine\_s16(input, input);} \DoxyCodeLine{291 } \DoxyCodeLine{292 \textcolor{comment}{/* (BMU) Load and compute branch metrics */}} \DoxyCodeLine{293 m0 = vld1q\_s16(\&out[0]);} \DoxyCodeLine{294 m1 = vld1q\_s16(\&out[8]);} \DoxyCodeLine{295 m2 = vld1q\_s16(\&out[16]);} \DoxyCodeLine{296 m3 = vld1q\_s16(\&out[24]);} \DoxyCodeLine{297 } \DoxyCodeLine{298 NEON\_BRANCH\_METRIC\_N4(m0, m1, m2, m3, m7, m4)} \DoxyCodeLine{299 } \DoxyCodeLine{300 m0 = vld1q\_s16(\&out[32]);} \DoxyCodeLine{301 m1 = vld1q\_s16(\&out[40]);} \DoxyCodeLine{302 m2 = vld1q\_s16(\&out[48]);} \DoxyCodeLine{303 m3 = vld1q\_s16(\&out[56]);} \DoxyCodeLine{304 } \DoxyCodeLine{305
NEON\_BRANCH\_METRIC\_N4(m0, m1, m2, m3, m7, m5)} \DoxyCodeLine{306 } \DoxyCodeLine{307 m0 = vld1q\_s16(\&out[64]);} \DoxyCodeLine{308 m1 = vld1q\_s16(\&out[72]);} \DoxyCodeLine{309 m2 = vld1q\_s16(\&out[80]);} \DoxyCodeLine{310 m3 = vld1q\_s16(\&out[88]);} \DoxyCodeLine{311 } \DoxyCodeLine{312 NEON\_BRANCH\_METRIC\_N4(m0, m1, m2, m3, m7, m6)} \DoxyCodeLine{313 } \DoxyCodeLine{314 m0 = vld1q\_s16(\&out[96]);} \DoxyCodeLine{315 m1 = vld1q\_s16(\&out[104]);} \DoxyCodeLine{316 m2 = vld1q\_s16(\&out[112]);} \DoxyCodeLine{317 m3 = vld1q\_s16(\&out[120]);} \DoxyCodeLine{318 } \DoxyCodeLine{319 NEON\_BRANCH\_METRIC\_N4(m0, m1, m2, m3, m7, m7)} \DoxyCodeLine{320 } \DoxyCodeLine{321 \textcolor{comment}{/* (PMU) Butterflies: 0-\/15 */}} \DoxyCodeLine{322 NEON\_BUTTERFLY(m8, m9, m4, m0, m1)} \DoxyCodeLine{323 NEON\_BUTTERFLY(m10, m11, m5, m2, m3)} \DoxyCodeLine{324 } \DoxyCodeLine{325 vst1q\_s16(\&paths[0], m0);} \DoxyCodeLine{326 vst1q\_s16(\&paths[8], m2);} \DoxyCodeLine{327 vst1q\_s16(\&paths[32], m9);} \DoxyCodeLine{328 vst1q\_s16(\&paths[40], m11);} \DoxyCodeLine{329 } \DoxyCodeLine{330 \textcolor{comment}{/* (PMU) Butterflies: 16-\/31 */}} \DoxyCodeLine{331 NEON\_BUTTERFLY(m12, m13, m6, m0, m2)} \DoxyCodeLine{332 NEON\_BUTTERFLY(m14, m15, m7, m9, m11)} \DoxyCodeLine{333 } \DoxyCodeLine{334 vst1q\_s16(\&paths[16], m0);} \DoxyCodeLine{335 vst1q\_s16(\&paths[24], m9);} \DoxyCodeLine{336 vst1q\_s16(\&paths[48], m13);} \DoxyCodeLine{337 vst1q\_s16(\&paths[56], m15);} \DoxyCodeLine{338 } \DoxyCodeLine{339 \textcolor{keywordflow}{if} (norm)} \DoxyCodeLine{340 NEON\_NORMALIZE\_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)} \DoxyCodeLine{341 } \DoxyCodeLine{342 vst1q\_s16(\&sums[0], m4);} \DoxyCodeLine{343 vst1q\_s16(\&sums[8], m5);} \DoxyCodeLine{344 vst1q\_s16(\&sums[16], m6);} \DoxyCodeLine{345 vst1q\_s16(\&sums[24], m7);} \DoxyCodeLine{346 vst1q\_s16(\&sums[32], m1);} \DoxyCodeLine{347 vst1q\_s16(\&sums[40], m3);} \DoxyCodeLine{348 vst1q\_s16(\&sums[48], m2);}
\DoxyCodeLine{349 vst1q\_s16(\&sums[56], m11);} \DoxyCodeLine{350 \}} \end{DoxyCode}