2 .target sm_10, map_f64_to_f32
3 // compiled with /usr/local/cuda/bin/../open64/lib//be
4 // nvopencc 3.0 built on 2010-03-11
6 //-----------------------------------------------------------
7 // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
8 //-----------------------------------------------------------
10 //-----------------------------------------------------------
12 //-----------------------------------------------------------
13 // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
14 // -O3 (Optimization level)
16 // -m2 (Report advisories)
17 //-----------------------------------------------------------
19 .file 1 "<command-line>"
20 .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
21 .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
22 .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
23 .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
24 .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
25 .file 7 "/usr/local/cuda/bin/../include/device_types.h"
26 .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
27 .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
28 .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
29 .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
30 .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
31 .file 13 "/usr/include/i386/_types.h"
32 .file 14 "/usr/include/time.h"
33 .file 15 "prefix-sum.cu"
34 .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
35 .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
36 .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
37 .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
38 .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
39 .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
40 .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
41 .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
42 .file 24 "/usr/local/cuda/bin/../include/common_types.h"
43 .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
44 .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
45 .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
46 .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
// Shared-state-space scratch buffer used by the kernel below: an extern byte
// array (.b8) with 4-byte alignment. No size is given in this declaration.
48 .extern .shared .align 4 .b8 temp[];
// Kernel entry point. The mangled name demangles (Itanium C++ ABI) to
//   void prefix_sum_block<unsigned int>(unsigned int*, unsigned int*, unsigned int)
// and the three 32-bit parameters are named in, out and n by their symbols.
//
// What the visible instructions do:
//   1. Each thread loads in[tid.x] and in[tid.x + ntid.x] from global memory and
//      stores them into shared memory.
//   2. A first loop adds pairs of shared words (second slot += first slot).
//   3. One thread stores %r46 into the slot for index 2*ntid.x - 1.
//   4. A second loop swaps/adds pairs of shared words.
//   5. Each thread writes its two shared words to out[], guarded by the
//      bounds predicates %p1 and %p2.
//
// NOTE(review): the embedded listing numbers have gaps (e.g. 54-57, 62-63,
// 68-73, 139-143). The opening brace, the .reg declarations, most branch-target
// labels, several branches, and the instructions that define %r7, %r8, %r22,
// %r23, %r24, %r43, %r44, %r46 and %r52 are not visible here. No bar.sync is
// visible either. The comments below describe only the instructions shown.
// NOTE(review): the overall shape resembles an up-sweep/down-sweep scan over
// 2*ntid.x elements - confirm against prefix-sum.cu.
50 .entry _Z16prefix_sum_blockIjEvPT_S1_j (
51 .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
52 .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
53 .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
58 $LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
// %r1 = n, %r2 = tid.x, %p1 = (tid.x < n) using an unsigned compare.
59 ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
60 cvt.s32.u16 %r2, %tid.x;
61 setp.lt.u32 %p1, %r2, %r1;
// %r6 = in[tid.x]: global load from address in + tid.x*4.
// NOTE(review): no branch on %p1 is visible before this load (listing lines
// 62-63 are absent) - confirm in the full PTX whether the load is guarded.
64 ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
65 mul24.lo.u32 %r4, %r2, 4;
66 add.u32 %r5, %r3, %r4;
67 ld.global.u32 %r6, [%r5+0];
// Store %r6 to shared memory at byte address %r11 = %r7 + (tid.x + %r8)*4.
// %r7 and %r8 are defined outside the visible lines; presumably %r7 is the
// address of temp and %r8 = tid.x >> 4, by analogy with the second store
// below - TODO confirm.
74 add.u32 %r9, %r2, %r8;
75 mul.lo.u32 %r10, %r9, 4;
76 add.u32 %r11, %r10, %r7;
77 st.shared.u32 [%r11+0], %r6;
// %r12 = ntid.x; %r13 = tid.x + ntid.x is the index of this thread's second element.
78 cvt.s32.u16 %r12, %ntid.x;
79 add.s32 %r13, %r12, %r2;
// Reload n and compute %p2 = (%r13 < n), unsigned.
81 ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
83 setp.lt.u32 %p2, %r13, %r1;
// %r17 = in[%r13]: global load from address in + %r13*4.
86 ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
87 mul.lo.u32 %r15, %r13, 4;
88 add.u32 %r16, %r14, %r15;
89 ld.global.u32 %r17, [%r16+0];
// Store %r17 to shared memory at %r21 = %r7 + (%r13 + (%r13 >> 4))*4, i.e. the
// word index is offset by one extra word per 16 elements. Presumably this is
// padding against shared-memory bank conflicts - confirm against the source.
94 shr.u32 %r18, %r13, 4;
95 add.u32 %r19, %r13, %r18;
96 mul.lo.u32 %r20, %r19, 4;
97 add.u32 %r21, %r20, %r7;
98 st.shared.u32 [%r21+0], %r17;
// Branch to $Lt_0_13314 (label not visible) when ntid.x <= %r23, signed.
// %r23 is not defined in the visible lines.
102 setp.le.s32 %p3, %r12, %r23;
104 @%p3 bra $Lt_0_13314;
106 //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
// %p4 = (%r22 <= tid.x); the branch that consumes %p4 is not visible.
109 setp.le.s32 %p4, %r22, %r2;
111 //<loop> Part of loop body line 39, head labeled $Lt_0_9218
// First-loop body. With s = %r24:
//   %r28 = s*(2*tid.x + 1)   and   %r29 = s*(2*tid.x + 2)
113 mul24.lo.u32 %r25, %r2, 2;
114 add.u32 %r26, %r25, 1;
115 add.u32 %r27, %r25, 2;
116 mul.lo.u32 %r28, %r24, %r26;
117 mul.lo.u32 %r29, %r24, %r27;
// %r35 = shared word at index (%r29 - 1) + ((%r29 - 1) >> 4); the "- 1" on the
// index is folded into the -4 byte offset of the load.
118 sub.u32 %r30, %r29, 1;
119 shr.u32 %r31, %r30, 4;
120 add.u32 %r32, %r29, %r31;
121 mul.lo.u32 %r33, %r32, 4;
122 add.u32 %r34, %r33, %r7;
123 ld.shared.u32 %r35, [%r34+-4];
// %r41 = shared word at index (%r28 - 1) + ((%r28 - 1) >> 4), same addressing scheme.
124 sub.u32 %r36, %r28, 1;
125 shr.u32 %r37, %r36, 4;
126 add.u32 %r38, %r28, %r37;
127 mul.lo.u32 %r39, %r38, 4;
128 add.u32 %r40, %r7, %r39;
129 ld.shared.u32 %r41, [%r40+-4];
// Write the sum %r35 + %r41 back to the %r29-based slot.
130 add.u32 %r42, %r35, %r41;
131 st.shared.u32 [%r34+-4], %r42;
133 //<loop> Part of loop body line 39, head labeled $Lt_0_9218
// Loop step: halve %r22 (arithmetic shift) and double %r24, then compute
// %p5 = (%r22 > %r43). %r43 and the branch using %p5 are not visible.
135 shr.s32 %r22, %r22, 1;
136 shl.b32 %r24, %r24, 1;
138 setp.gt.s32 %p5, %r22, %r43;
// Only the thread with tid.x == %r44 falls through; all others branch to
// $Lt_0_10242 (label not visible). %r44 is not defined in the visible lines.
144 setp.ne.s32 %p6, %r2, %r44;
145 @%p6 bra $Lt_0_10242;
// Store %r46 into the shared word at index (2*ntid.x - 1) + ((2*ntid.x - 1) >> 4).
// %r46 is not defined in the visible lines (listing line 148 is absent).
147 mul24.lo.s32 %r45, %r12, 2;
149 sub.u32 %r47, %r45, 1;
150 shr.u32 %r48, %r47, 4;
151 add.u32 %r49, %r45, %r48;
152 mul.lo.u32 %r50, %r49, 4;
153 add.u32 %r51, %r7, %r50;
154 st.shared.u32 [%r51+-4], %r46;
// Branch to $Lt_0_10754 (label not visible) when ntid.x < %r52, signed.
// %r52 is not defined in the visible lines.
157 setp.lt.s32 %p7, %r12, %r52;
158 @%p7 bra $Lt_0_10754;
161 //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
// Second loop: halve %r24 at the top of each iteration.
163 shr.s32 %r24, %r24, 1;
// Threads with %r22 <= tid.x skip the body by branching to $Lt_0_11522 (label not visible).
166 setp.le.s32 %p8, %r22, %r2;
167 @%p8 bra $Lt_0_11522;
168 //<loop> Part of loop body line 47, head labeled $Lt_0_11266
// %r61 = shared word at index (a - 1) + ((a - 1) >> 4), where a = %r24*(2*tid.x + 1).
170 mul24.lo.u32 %r53, %r2, 2;
171 add.u32 %r54, %r53, 1;
172 mul.lo.u32 %r55, %r24, %r54;
173 sub.u32 %r56, %r55, 1;
174 shr.u32 %r57, %r56, 4;
175 add.u32 %r58, %r55, %r57;
176 mul.lo.u32 %r59, %r58, 4;
177 add.u32 %r60, %r59, %r7;
178 ld.shared.u32 %r61, [%r60+-4];
// %r69 = shared word at index (b - 1) + ((b - 1) >> 4), where b = %r24*(2*tid.x + 2).
180 add.u32 %r62, %r53, 2;
181 mul.lo.u32 %r63, %r24, %r62;
182 sub.u32 %r64, %r63, 1;
183 shr.u32 %r65, %r64, 4;
184 add.u32 %r66, %r63, %r65;
185 mul.lo.u32 %r67, %r66, 4;
186 add.u32 %r68, %r67, %r7;
187 ld.shared.u32 %r69, [%r68+-4];
// The first slot receives the second slot's value ...
188 st.shared.u32 [%r60+-4], %r69;
// ... and the second slot (re-read from shared memory) is incremented by the
// first slot's old value, held in %r61.
190 ld.shared.u32 %r70, [%r68+-4];
191 add.u32 %r71, %r70, %r61;
192 st.shared.u32 [%r68+-4], %r71;
194 //<loop> Part of loop body line 47, head labeled $Lt_0_11266
// Loop step: double %r22 and branch back to $Lt_0_11266 while %r22 <= ntid.x (signed).
196 shl.b32 %r22, %r22, 1;
197 setp.le.s32 %p9, %r22, %r12;
198 @%p9 bra $Lt_0_11266;
// If tid.x < n (%p1): out[tid.x] = shared word at %r11, this thread's first slot.
202 @!%p1 bra $Lt_0_12290;
204 ld.shared.u32 %r72, [%r11+0];
205 ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
206 mul24.lo.u32 %r74, %r2, 4;
207 add.u32 %r75, %r73, %r74;
208 st.global.u32 [%r75+0], %r72;
// If tid.x + ntid.x < n (%p2): out[%r13] = shared word at %r21, the second slot.
210 @!%p2 bra $Lt_0_12802;
212 ld.shared.u32 %r76, [%r21+0];
213 ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
214 mul.lo.u32 %r78, %r13, 4;
215 add.u32 %r79, %r77, %r78;
216 st.global.u32 [%r79+0], %r76;
// End-of-kernel label and closing brace.
220 $LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
221 } // _Z16prefix_sum_blockIjEvPT_S1_j