| log in |
Message boards : Application Code Discussion : Source Code of ATi app
| Author | Message |
|---|---|
|
now that the ATi app is a stock app (and no more only a gift from Gipsel - thanks man!), could we have the source code, owing to your GPL promise? | |
| ID: 35460 | Rating: 0 | rate:
| |
now that the ATi app is a stock app (and no more only a gift from Gipsel - thanks man!), could we have the source code, owing to your GPL promise? Actually, they don't have it right now. So far, I've given them only some parts to speed up the CUDA version as it was considered work in progress. But it looks like it stabilized quite a bit, especially as the severe issues with the WinXP driver versions between 9.3 and 9.10/9.11 are solved now with Catalyst 9.12. First, I would have to clean it up a bit I guess. Probably I should talk to Travis and Anthony after the defense of my PhD thesis about it as Anthony mentioned that there are some code changes coming that may need to be implemented before putting a half done version up. We will know more in a few weeks ;) | |
| ID: 35493 | Rating: 0 | rate:
| |
|
ok! | |
| ID: 35526 | Rating: 0 | rate:
| |
I asked it because I want to see your magic! There's no magic involved. I just tuned some small pieces of code where the app does 99.9% of all the calculations. Just as an example, the integrals (background and stream) for a single stream WU are calculated by the following kernel: il_ps_2_0
// Literal pool. A double is formed from two 32-bit components (low word, high word)
// selected by swizzle. According to the comments in func 38 and func 39:
//   l0: .wx = 1.0, .wy = 0.75, .wz = -3.0
//   l1: .wx = -0.5, .wy = -0.0625 (l1.z is not referenced in the code shown here)
//   l2: .xy = sqrt2, .zw = log2e
//   l3: .xy = p0, .zw = p1   l4: .xy = p2, .zw = q1   l5: .xy = q2, .zw = q3
//   (p* and q* are the polynomial coefficients used by func 39)
dcl_literal l0,0x3FF00000,0x3FE80000,0xC0080000,0x00000000
dcl_literal l1,0xBFE00000,0xBFB00000,0x42700000,0x00000000
dcl_literal l2,0x667F3BCD,0x3FF6A09E,0x652B82FE,0x3FF71547
dcl_literal l3,0xFEFA39EF,0x3FE62E42,0xB649A98F,0x3F84AA4E
dcl_literal l4,0x4D1F00B9,0x3EF52B5C,0x389CEFF9,0x3FABF3E7
dcl_literal l5,0x478FF1EB,0x3F33185F,0xD06316DC,0x3E8657CD
// Two four-component outputs; both are written at the end of func 47.
dcl_output_usage(generic) o0.xyzw
dcl_output_usage(generic) o1.xyzw
// Six 2D resources with four float channels each, declared unnorm.
// func 47 samples them as follows (names taken from its comments):
//   0 -> r395 (trilv), 1 -> r380 (tribv), 2 -> r403 (rc),
//   3 -> r397 (rj), 4 -> r363 (prob_in), 5 -> r365 (stream1_in)
dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(2)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(3)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(4)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(5)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
// Position input; only v0.x and v0.y are used, as sampling coordinates in func 47.
dcl_input_position_interp(linear_noperspective) v0.xy__
// Constant buffers. func 47 reads cb0[0..2], cb1[0] and cb2[0..4].
dcl_cb cb0[3]
dcl_cb cb1[1]
dcl_cb cb2[5]
// The main body only calls the kernel function.
call 47
endmain
func 37 // div r279 = r280/r281
// Double-precision divide.
// In:  r280.xy = numerator n, r281.xy = denominator d.
// Out: r279.xy = n/d.
// Scratch: r282-r285. The routine contains no branches and no special-case
// handling of its operands.
d2f r282.x___,r281.xy00 // d converted to single precision
rcp_zeroop(zero) r282.x___,r282.000x // single-precision reciprocal of d
f2d r282.__zw,r282.x000 // r282 22bit reciprocal estimate x0, widened to double
dmad r283.xy__,r281_neg(y).xy00,r282.zw00,l0.wx00 // e = 1.0 - d*x0 (l0.wx = 1.0)
dmad r282.xy__,r282.zw00,r283.xy00,r282.zw00 // x1 = x0 + x0*e, refined reciprocal
dmul r284.xy__,r280.xy00,r282.xy00 // q0 = n*x1, first quotient
dmad r285.xy__,r281_neg(y).xy00,r284.xy00,r280.xy00 // rem = n - d*q0, residual of q0
dmad r279.xy__,r285.xy00,r282.xy00,r284.xy00 // result = q0 + rem*x1
ret
func 38 // r298 = sqrt(r299)
// Double-precision square root.
// In:  r299.xy = argument y.
// Out: r298.xy = sqrt(y).
// Scratch: r300-r302. The routine contains no branches and no range checks.
// Starting from a single-precision rsq estimate x, the intermediate values are
// scaled: if x is close to 1/sqrt(y), then -x*(y*x*x-3) is about 2/sqrt(y),
// res is about 2*sqrt(y), res*x is about 4, and the final factor
// 0.75 - 0.0625*res*x is about 0.5.
d2f r300.x___,r299.xy00 // y converted to single precision
rsq_zeroop(zero) r300.x___,r300.000x // single-precision reciprocal square root
f2d r300.xy__,r300_neg(x).x000 // -x
dmul r301.xy__,r300.xy00,r300.xy00 // x*x
dmad r301.xy__,r299.xy00,r301.xy00,l0.wz00 // y*(x*x)-3 (l0.wz = -3.0)
dmul r300.xy__,r300.xy00,r301.xy00 // x = -x*(y*(x*x)-3)
dmul r302.xy__,r299.xy00,r300.xy00 // res = x*y
dmul r300.xy__,r300.xy00,r302.xy00 // res*x
dmad r300.xy__,r300.xy00,l1.wy00,l0.wy00 // 0.75 - 0.0625*res*x (l1.wy = -0.0625, l0.wy = 0.75)
dmul r298.xy__,r300.xy00,r302.xy00 // res*(0.75 - 0.0625*res*x)
ret
func 39 // r316 = exp(r317)
// Double-precision exponential.
// In:  r317.xy = argument.
// Out: r316.xy = exp(argument).
// Scratch: r318-r322. Also overwrites r280 and r281 (arguments for func 37)
// and r279 (its result), because the quotient is computed by calling func 37.
// Method: t = log2e*arg is split into an integer n and a fraction; the fraction
// minus 0.5 is fed to a quotient of two polynomials, the result is scaled by
// sqrt2 to undo the 0.5 shift, and dldexp applies 2^n.
dmul r318.xy__,r317.xy00,l2.zw00 // x=log2e*arg; log2e = l2.zw
dfrac r319.__zw,r318.xy00 // px=dfrac(x);
dadd r320.__zw,r319.zw00,r318_neg(y).xy00 // px-x = -n
d2f r320.x___,r320.zw00 // -n as single precision
ftoi r320.x___,r320_neg(x).x000 // (int)n
dadd r318.xy__,r319.zw00,l1.wx00 // x = px-0.5; l1.wx = -0.5
dmul r319.xy__,r318.xy00,r318.xy00 // xx = x*x
dmad r321.xy__,r319.xy00,l4.xy00,l3.zw00 // xx*p2+p1
dmad r321.xy__,r319.xy00,r321.xy00,l3.xy00 // p0+xx*(p1+xx*p2)
dmul r280.xy__,r318.xy00,r321.xy00 // px = x*(p0+xx*(p1+xx*p2)); numerator for func 37
dmad r322.xy__,r319.xy00,l5.zw00,l5.xy00 // xx*q3+q2
dmad r322.xy__,r319.xy00,r322.xy00,l4.zw00 // q1+xx*(q2+xx*q3)
dmad r322.xy__,r319.xy00,r322.xy00,l0.wx00 // Q = 1.0+xx*(q1+xx*(q2+xx*q3)); l0.wx = 1.0
dmad r281.xy__,r280.xy00,l1.wx00,r322.xy00 // -0.5*px + Q; l1.wx = -0.5; denominator for func 37
call 37 // r279 = r280/r281 x=px/(Q(x)-0.5px)
dmad r318.xy__,r279.xy00,l2.xy00,l2.xy00 // x=sqrt2*x+sqrt2; sqrt2 = l2.xy
dldexp r316.xy__,r318.xy00,r320.x000 // return(x*2^n)
ret
func 47
// Kernel body, called from main. It accumulates two sums over n iterations
// (n = cb2[4].x): prob.x in r393.zw and prob.y in r394.xy. Both are scaled by V
// and added to the values sampled from resources 4 and 5, and the results are
// written to o0 and o1.
// Helper calls: func 38 (sqrt), func 37 (divide), func 39 (exp, which itself
// calls func 37 and therefore overwrites r279-r281).
// NOTE(review): several comments below describe subtractions or negated values
// while the instructions add the constant-buffer operands unmodified, so cb0,
// cb1[0] and cb2[3] presumably hold pre-negated values - confirm against the
// host code.
sample_resource(4)_sampler(0) r363.xyzw,v0.xy00 // prob_in, previous value for output o0
sample_resource(5)_sampler(0) r365.xyzw,v0.xy00 // stream1_in, previous value for output o1
sample_resource(0)_sampler(0) r395.xyzw,v0.y000 // trilv: .xy and .zw are used as two doubles below
sample_resource(1)_sampler(0) r380.xy__,v0.y000 // tribv
sample_resource(2)_sampler(0) r403.xy__,v0.x000 // rc[inst.x]
dmul r390.xy__,r403.xy00,cb2[0].xy00 // V = rc[inst.x].x * rc[inst.x].y * vid;
mov r377.xy__,v0.0x00 // i = 0; r377 = [instance.x][i]
itof r377.___w,cb2[4].000x // n, number of iterations as float
mov r393.__zw,r393.0000 // prob.x = 0.0;
mov r394.xy__,r394.0000 // prob.y = 0.0;
whileloop
breakc_relop(ge) r377.x000,r377.wwww // while(i<n)
sample_resource(3)_sampler(0) r397.xyzw,r377.xy00 // r = rj[inst.x][i];
add r377.x___,r377.x000,r377.1000 // i=i+1
dmad r389.xy__,r397.xy00,r395.xy00,cb2[3].xy00 //x = r.x * trilv.x - lbr_r;
dmul r388.xy__,r397.xy00,r395.zw00 //y = r.x * trilv.y;
dmul r387.xy__,r397.xy00,r380.xy00 //z = r.x * tribv;
dmul r411.xy__,r389.xy00,r389.xy00 // x*x
dmad r413.xy__,r388.xy00,r388.xy00,r411.xy00 // y*y + x*x
dmul r414.xy__,r387.xy00,r387.xy00 // z*z
dmad r299.xy__,r414.xy00,cb2[2].xy00,r413.xy00 // z*z*q2_inv + (x*x + y*y); argument for func 38
call 38 // rg = fsqrtd(x*x + y*y + z*z*q2_inv);
dadd r383.xy__,r298.xy00,cb2[1].xy00 // rs = rg + cb2[1]
dmul r281.xy__,r298.xy00,r383.xy00 // rg*rs
dmul r281.xy__,r281.xy00,r383.xy00 // rg*rs*rs
dmul r281.xy__,r281.xy00,r383.xy00 // rg*rs*rs*rs; denominator for func 37
mov r280.xy__,r397.zw00 // r.y; numerator for func 37
call 37 // divd(r.y,rg*rs*rs*rs)
dadd r393.__zw,r393.zw00,r279.xy00 // prob.x += r.y/(rg*rs*rs*rs)
dadd r386.__zw,r389.xy00,cb0[0].zw00 // xs = x - ac[0][0].y;
dadd r385.xy__,r388.xy00,cb0[1].zw00 // ys = y - ac[0][1].y;
dadd r384.__zw,r387.xy00,cb0[2].zw00 // zs = z - ac[0][2].y;
dmul r381.xy__,cb0[0].xy00,r386.zw00 // -xs * ac[0][0].x
dmad r381.xy__,cb0[1].xy00,r385.xy00,r381.xy00 // -ys * ac[0][1].x - xs * ac[0][0].x
dmad r381.xy__,cb0[2].xy00,r384.zw00,r381.xy00 // -dotted = - zs*ac[0][2].x - ys*ac[0][1].x - xs*ac[0][0].x;
dmad r386.xy__,r381_neg(y).xy00,cb0[0].xy00,r386.zw00 // xs -= dotted * ac[0][0].x;
dmad r385.xy__,r381_neg(y).xy00,cb0[1].xy00,r385.xy00 // ys -= dotted * ac[0][1].x;
dmad r384.xy__,r381_neg(y).xy00,cb0[2].xy00,r384.zw00 // zs -= dotted * ac[0][2].x;
dmul r317.xy__,r386.xy00,r386.xy00 // xs*xs
dmad r317.xy__,r385.xy00,r385.xy00,r317.xy00 // ys*ys + xs*xs
dmad r317.xy__,r384.xy00,r384.xy00,r317.xy00 // zs*zs + ys*ys + xs*xs
dmul r317.xy__,r317.xy00,cb1[0].xy00 // times cb1[0]; argument for func 39
call 39 // fexpd( -(xs*xs + ys*ys + zs*zs) * sigma2_inv[0] ) * r.y
dmad r394.xy__,r316.xy00,r397.zw00,r394.xy00 // prob.y += fexpd( -(xs*xs + ys*ys + zs*zs) * sigma2_inv[0] ) * r.y;
endloop
dmul r393.__zw,r393.zw00,r390.xy00 // prob.x*=V;
dmul r394.xy__,r394.xy00,r390.xy00 // prob.y*=V;
// Add prob.x to the incoming value. The xy pair receives the plain sum; the zw
// pair receives prob_in.y plus whatever of prob.x is not represented in that sum.
// NOTE(review): this looks like a compensated (error-carrying) summation - confirm.
dadd r920.xy__,r363.xy00,r393.zw00 // temp.x = prob_in.x + prob.x
dadd r920.__zw,r920_neg(y).xy00,r363.xy00 // temp.y = prob.in.x - temp.x
dadd r920.__zw,r920.zw00,r393.zw00 // temp.y = prob.x + (prob_in.x - temp.x)
dadd r920.__zw,r363.zw00,r920.zw00 // temp.y = prob_in.y + (prob.x + (prob_in.x - temp.x));
mov o0.xyzw,r920.xyzw
// switch components to enable concurrent calculation of 2 adds
// Same scheme for prob.y, with the roles of the xy and zw pairs exchanged.
dadd r921.__zw,r365.xy00,r394.xy00 // temp.y = stream1_in.x + prob.y
dadd r921.xy__,r921_neg(y).zw00,r365.xy00 // temp.x = stream1_in.x - temp.y
dadd r921.xy__,r921.xy00,r394.xy00 // temp.x = prob.y + (stream1_in.x - temp.y)
dadd r921.xy__,r365.zw00,r921.xy00 // temp.x = stream1_in.y + (prob.y + (stream1_in.x - temp.y));
mov o1.xyzw,r921.zwxy // bring it to right order
ret
end
The code for WUs with more streams is slightly longer. I've put some comments to the IL assembly code and one should be able to recognize what it is doing if one compares it with the C code, even when I named some things a bit differently. The code skips some checks and safety catches for the division and square root operations compared to the native (compiler generated) solutions to speed it up a bit. The latest CUDA version uses the same techniques. The exponential is a typical implementation using a quotient of two polynomials. There are faster solutions around (even using the same general layout), but I opted for this as it is a very compact coded version known to deliver very precise results and doesn't need a lot of lookup values or something like this. Does somebody know what algorithm nvidia uses for its double precision exp? ATI offered no math library function for the double precision exp, so one had to implement it on its own. Would be interesting to see how it compares to nvidia's solution. Has anybody looked at the PTX code (with decuda or a similar tool) and can shed some light on this? | |
| ID: 35548 | Rating: 0 | rate:
| |
|
The PTX code can be generated from the source, but I'm not well versed in how exp() would be calculated, what method was used to calculate it in the ATI version above? | |
| ID: 35577 | Rating: 0 | rate:
| |
The PTX code can be generated from the source, but I'm not well versed in how exp() would be calculated, what method was used to calculate it in the ATI version above? I just thought it may be interesting how nvidia does it. My implementation is the function 39 above (maybe a bit hard to read in the assembly, but the comments may help). It's actually somehow similar (but not identical) to the implementation one finds also in the Cephes library, a specialized mathematical library (also for extended precision data types) offering often better precision than the compiler supplied libraries. But there are other examples which work roughly the same. As I said generally it calculates two polynomials of the argument (or its square) and divides them (*). The precision is quite good. It is correct up to the last bit of the mantissa. For the implementation above I tried to limit the effect of the roundoff errors when using a non fused multiply-add on older GPUs. The same is true for the square root, that's why it may look a bit strange when comparing it to other versions one can find. But effectively it does the same. (*): Actually, one uses the fact that e^x = 2^(log2(e) * x). After calculating that, one splits the log2e*x into an integer n and a fractional part dx. Calculating 2^n for the integer part is easy. What is left is the fractional part dx in the range -0.5 to +0.5 (depending on the implementation, it may be also between 0 and 1 or between -ln(2) and +ln(2) or whatever). As 2^(log2(e) * x) = 2^n * 2^dx, one has now only to calculate 2^dx for a small range of dx. As said, this is done as the quotient of two low order polynomials of dx. There are other (potentially faster) methods but they are either less precise or require a lot more parameters than the few one needs for this method. | |
| ID: 35579 | Rating: 0 | rate:
| |
|
and after this you still say there's not magic? | |
| ID: 35589 | Rating: 0 | rate:
| |
|
For a detailed discussion about exponentiation algorithms, it might be interesting to look for papers on Pippenger's exponentiation algorithm. Of course, you won't be able to affect ATI's implementation of the exp() function, but perhaps you can further improve how it is used. | |
| ID: 35595 | Rating: 0 | rate:
| |
|
now that they're releasing the new app, maybe it would be useful if you Gipsel release the whole source code of your old app... So that they can implement the best way of doing some calculations! | |
| ID: 47215 | Rating: 0 | rate:
| |
now that they're releasing the new app, maybe it would be useful if you Gipsel release the whole source code of your old app... So that they can implement the best way of doing some calculations!This won't happen and it won't be useful. He never gave us the source, so I had to rewrite it. I'm pretty sure there's a GPL violation here also. I am not at all happy about it. I've spent a great many hours of my life that could have been spent on something new. | |
| ID: 47247 | Rating: 0 | rate:
| |
now that they're releasing the new app, maybe it would be useful if you Gipsel release the whole source code of your old app... So that they can implement the best way of doing some calculations!This won't happen and it won't be useful. He never gave us the source, so I had to rewrite it. I'm pretty sure there's a GPL violation here also. I am not at all happy about it. I've spent a great many hours of my life that could have been spent on something new. please don't be harsh on gipsel. He didn't use any of your code, and on top of it at the beginning Travis code was not GPL if I remember well. On the other hand, we both are sure that his code would have been useful, but you got much back in all these days with so many wus done... | |
| ID: 47488 | Rating: 0 | rate:
| |
He didn't use any of your code, and on top of it at the beginning Travis code was not GPL if I remember well.That's not true; there is most definitely code from milkyway@home in there. If it wasn't GPL, and if the code didn't have a license he couldn't use it at all. He even mentions in this thread http://milkyway.cs.rpi.edu/milkyway/forum_thread.php?id=576 that he's violating it. | |
| ID: 47514 | Rating: 0 | rate:
| |
He didn't use any of your code, and on top of it at the beginning Travis code was not GPL if I remember well.That's not true; there is most definitely code from milkyway@home in there. If it wasn't GPL, and if the code didn't have a license he couldn't use it at all. He even mentions in this thread http://milkyway.cs.rpi.edu/milkyway/forum_thread.php?id=576 that he's violating it. Sorry, I don't want to be pedantic, but I don't see anywhere in that thread an admission that his code is stolen from milkyway's code. I don't know when did you start to work in this project, but the first GPL release benefitted SO MUCH from Andreas Przystawik's work that maybe you're not enough thankful. Original Travis code (not GPL) was so bad that it wasn't even useful to optimize it, he just rewrote it from scratch and for CAL/IL ATi Hardware structures. But, I'm with you that maybe he could have released his source. Just please don't be so harsh. He worked a lot for YOUR project, worked also better than an insider, wrote an application that never got published (Travis misterious project of two years ago) and just opened your project to these levels of computation. | |
| ID: 47520 | Rating: 0 | rate:
| |
Sorry, I don't want to be pedantic, but I don't see anywhere in that thread an admission that his code is stolen from milkyway's code. Here http://milkyway.cs.rpi.edu/milkyway/forum_thread.php?id=576&nowrap=true#14639: "I know. "There is still the copyright issue. From a legal standpoint the MW project could demand to stop the distribution." I interpreted this as violating the GPL. In any case, if the original code wasn't GPL, and there wasn't a license, then it couldn't be used. On a second reading I guess he could mean using licenseless code. I don't know when did you start to work in this project, but the first GPL release benefitted SO MUCH from Andreas Przystawik's work that maybe you're not enough thankful. Original Travis code (not GPL) was so bad that it wasn't even useful to optimize it, he just rewrote it from scratch and for CAL/IL ATi Hardware structures.I started at the very end of last May. Other people were trying for a while to get the source from him, but after various ignored requests (after various promises for it) it gradually became one of my jobs to rewrite it. But, I'm with you that maybe he could have released his source. Just please don't be so harsh. He worked a lot for YOUR project, worked also better than an insider, wrote an application that never got published (Travis misterious project of two years ago) and just opened your project to these levels of computation. | |
| ID: 47527 | Rating: 0 | rate:
| |
Sorry, I don't want to be pedantic, but I don't see anywhere in that thread an admission that his code is stolen from milkyway's code. But then there's this: Of course, the only one with a clear right to complain (or file suit :P) is Travis, and he obviously won't do that, since he (now) agrees with the work being done, which I guess counts as having given permission to Physik to do what he's doing. http://milkyway.cs.rpi.edu/milkyway/forum_thread.php?id=576&nowrap=true#14840 I would say that the MW project would not have progressed to the point it has without the code optimized by Andreas. It seems that somewhere along the line hard feelings developed. That said, your latest ATI version is working beautifully. Kudos :) | |
| ID: 47899 | Rating: 0 | rate:
| |
|
It's probably a bit late to answer here. But I want to explain a bit the course of events from my point of view. | |
| ID: 50545 | Rating: 0 | rate:
| |
|
thanks a lot, Gipsel! | |
| ID: 50611 | Rating: 0 | rate:
| |
|
Is there someone with licensed ICC (Intel C++ Compiler) who could compile CPU app MW_0.91_intel_sse3 into an ATOM-optimized code? It would be 31 percent faster on ATOM, because it is old fashioned "in-order-processor"... | |
| ID: 51667 | Rating: 0 | rate:
| |
|
you can download intel compilers for free for non-commercial and academic uses. | |
| ID: 51679 | Rating: 0 | rate:
| |
Message boards :
Application Code Discussion :
Source Code of ATi app