Lines Matching +full:- +full:j

16 …rror SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding 
18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evict…
34 * Revision: 4.0-BETA, October 24, 1995
40 * the cost of floating-point operations relative to memory accesses.
46 * at least 20 clock-ticks. This will provide rate estimates
54 #define HLINE "-------------------------------------------------------------\n"
76 int BytesPerWord, j, k, size; in main() local
77 PetscInt node = -1; in main()
85 PetscCall(PetscOptionsGetInt(NULL, NULL, "-node", &node, NULL)); in main()
86 /* --- SETUP --- determine precision and check timing --- */ in main()
101 if (node == -1) { in main()
105 } else if (node == -2) { in main()
118 for (j = 0; j < N; j++) { in main()
119 a[j] = 1.0; in main()
120 b[j] = 2.0; in main()
121 c[j] = 0.0; in main()
124 for (j = 0; j < N; j++) a[j] = 1.0; in main()
125 for (j = 0; j < N; j++) b[j] = 2.0; in main()
126 for (j = 0; j < N; j++) c[j] = 0.0; in main()
135 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; in main()
136 t = 1.0E6 * (Second() - t); in main()
145 PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n"); in main()
150 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ in main()
155 /* ### COPY: c <- a ### */ in main()
161 for (j = 0; j < N; j += 8) { in main()
162 _mm_stream_pd(c + j + 0, _mm_load_pd(a + j + 0)); in main()
163 _mm_stream_pd(c + j + 2, _mm_load_pd(a + j + 2)); in main()
164 _mm_stream_pd(c + j + 4, _mm_load_pd(a + j + 4)); in main()
165 _mm_stream_pd(c + j + 6, _mm_load_pd(a + j + 6)); in main()
167 _mm_prefetch(a + j + 64, _MM_HINT_NTA); in main()
171 for (j = 0; j < N; j++) c[j] = a[j]; in main()
174 times[0][k] = Second() - times[0][k]; in main()
176 /* ### SCALE: b <- scalar * c ### */ in main()
182 for (j = 0; j < N; j += 8) { in main()
183 _mm_stream_pd(b + j + 0, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0))); in main()
184 _mm_stream_pd(b + j + 2, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2))); in main()
185 _mm_stream_pd(b + j + 4, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4))); in main()
186 _mm_stream_pd(b + j + 6, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6))); in main()
188 _mm_prefetch(c + j + 64, _MM_HINT_NTA); in main()
193 for (j = 0; j < N; j++) b[j] = scalar * c[j]; in main()
196 times[1][k] = Second() - times[1][k]; in main()
198 /* ### ADD: c <- a + b ### */ in main()
203 for (j = 0; j < N; j += 8) { in main()
204 _mm_stream_pd(c + j + 0, _mm_add_pd(_mm_load_pd(a + j + 0), _mm_load_pd(b + j + 0))); in main()
205 _mm_stream_pd(c + j + 2, _mm_add_pd(_mm_load_pd(a + j + 2), _mm_load_pd(b + j + 2))); in main()
206 _mm_stream_pd(c + j + 4, _mm_add_pd(_mm_load_pd(a + j + 4), _mm_load_pd(b + j + 4))); in main()
207 _mm_stream_pd(c + j + 6, _mm_add_pd(_mm_load_pd(a + j + 6), _mm_load_pd(b + j + 6))); in main()
209 _mm_prefetch(a + j + 64, _MM_HINT_NTA); in main()
210 _mm_prefetch(b + j + 64, _MM_HINT_NTA); in main()
215 for (j = 0; j < N; j++) c[j] = a[j] + b[j]; in main()
218 times[2][k] = Second() - times[2][k]; in main()
220 /* ### TRIAD: a <- b + scalar * c ### */ in main()
226 for (j = 0; j < N; j += 8) { in main()
227 …_mm_stream_pd(a + j + 0, _mm_add_pd(_mm_load_pd(b + j + 0), _mm_mul_pd(scalar2, _mm_load_pd(c + j in main()
228 …_mm_stream_pd(a + j + 2, _mm_add_pd(_mm_load_pd(b + j + 2), _mm_mul_pd(scalar2, _mm_load_pd(c + j in main()
229 …_mm_stream_pd(a + j + 4, _mm_add_pd(_mm_load_pd(b + j + 4), _mm_mul_pd(scalar2, _mm_load_pd(c + j in main()
230 …_mm_stream_pd(a + j + 6, _mm_add_pd(_mm_load_pd(b + j + 6), _mm_mul_pd(scalar2, _mm_load_pd(c + j in main()
232 _mm_prefetch(b + j + 64, _MM_HINT_NTA); in main()
233 _mm_prefetch(c + j + 64, _MM_HINT_NTA); in main()
238 for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j]; in main()
241 times[3][k] = Second() - times[3][k]; in main()
244 /* --- SUMMARY --- */ in main()
247 for (j = 0; j < 4; j++) { in main()
248 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); in main()
249 mintime[j] = MIN(mintime[j], times[j][k]); in main()
250 maxtime[j] = MAX(maxtime[j], times[j][k]); in main()
254 for (j = 0; j < 4; j++) { in main()
255 rmstime[j] = sqrt(rmstime[j] / (double)NTIMES); in main()
256 …4f %11.4f\n", label[j], 1.0e-06 * bytes[j] / mintime[j], size * 1.0e-06 * bytes[j] / mintime[j], … in main()
279 while ((t2 = Second()) - t1 < 1.0E-6) { } in checktick()
291 Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1])); in checktick()