Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members   Related Pages  

ccvt_i386.S

00001 /*
00002 Colour conversion routines (RGB <-> YUV) in x86 assembly
00003  
00004 (C) 2000 Nemosoft Unv.    nemosoft@smcc.demon.nl
00005    
00006 This program is free software; you can redistribute it and/or
00007 modify it under the terms of the GNU General Public License
00008 as published by the Free Software Foundation; either version 2
00009 of the License, or (at your option) any later version.
00010 
00011 This program is distributed in the hope that it will be useful,
00012 but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 GNU General Public License for more details.
00015 
00016 You should have received a copy of the GNU General Public License
00017 along with this program; if not, write to the Free Software
00018 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.   
00019 
00020 */
00021 
00022 
00023 /* The ccvt_* functions always start with width and height, so these
00024    parameters are in 8(%ebp) and 12(%ebp). They are followed by either
00025    2 or 4 pointers, in one of these combinations:
00026    *src, *dst
00027    *srcy, *srcu, *srcv, *dst
00028    *src, *dsty, *dstu, *dstv
00029  */
00030 
00031 #define __ASSEMBLY__
00032 #include <linux/linkage.h>
00033 
00034 #define Width   8(%ebp)
00035 #define Height 12(%ebp)
00036 
00037 /* 2 pointer parameters: 1 source, 1 destination */
00038 #define Src2 16(%ebp)
00039 #define Dst2 20(%ebp)
00040 
00041 /* 4 pointer parameters: 3 sources (Y, U, V), 1 destination */
00042 #define SrcY 16(%ebp)
00043 #define SrcU 20(%ebp)
00044 #define SrcV 24(%ebp)
00045 #define Dst4 28(%ebp)
00046 
00047 /* 4 pointer parameters: 1 source, 3 destinations (Y, U, V) */
00048 #define Src4 16(%ebp)
00049 #define DstY 20(%ebp)
00050 #define DstU 24(%ebp)
00051 #define DstV 28(%ebp)
00052 
00053 /* This buffer space used to be statically allocated, but that is going to
00054    give problems with multiple cams (though I have yet to see it).
00055    Therefore, we reserve at least 64 + 8 = 72 bytes on the stack with
00056    `enter': 64 bytes of PixelBuffer plus the 4-byte Uptr and Vptr slots.
00057  */
00058 
00059 #define PixelBuffer -64(%ebp)
00060 #define Uptr        -68(%ebp)
00061 #define Vptr        -72(%ebp)
00062 
00063         .text
00064 
00065 /* Parameter checks: load the source and destination pointers (storing
00066    Uptr/Vptr for the 4-pointer variants) and validate width/height.
00067    - %esi will be set to Src or SrcY
00068    - %edi will be set to Dst or DstY
00069    On return the carry flag is set if any check failed, clear otherwise.
00070    %ebp must already point at the caller's frame.
00071  */
00072 /* 2 pointers: src and dst */
00073 test_param_2:
00074         mov Src2, %esi          # %esi = source
00075         mov Dst2, %edi          # %edi = destination
00076         
00077         test %esi, %esi         # NULL source?
00078         jz param_fail
00079         test %edi, %edi         # NULL destination?
00080         jz param_fail
00081 
00082         jmp test_width_height   # tail call; its ret returns to our caller
00083 
00084 /* 4 pointers: 3 inputs (Y, U, V), 1 output */
00085 test_param_31:
00086         mov Dst4, %edi          # %edi = destination
00087         test %edi, %edi
00088         jz param_fail           # NULL destination
00089         
00090         mov SrcV, %esi
00091         test %esi, %esi
00092         jz param_fail           # NULL V pointer
00093         mov %esi, Vptr          # remember the V pointer
00094 
00095         mov SrcU, %esi
00096         test %esi, %esi
00097         jz param_fail           # NULL U pointer
00098         mov %esi, Uptr          # remember the U pointer
00099         
00100         mov SrcY, %esi          # %esi is left pointing at the Y data
00101         test %esi, %esi
00102         jz param_fail           # NULL Y pointer
00103         
00104         jmp test_width_height   # tail call; its ret returns to our caller
00105 
00106 /* 4 pointers: 1 input, 3 outputs (Y, U, V) */
00107 test_param_13:
00108         mov Src4, %esi          # %esi = source
00109         test %esi, %esi
00110         jz param_fail           # NULL source
00111         
00112         mov DstV, %edi
00113         test %edi, %edi
00114         jz param_fail           # NULL V pointer
00115         mov %edi, Vptr          # remember the V pointer
00116         
00117         mov DstU, %edi
00118         test %edi, %edi
00119         jz param_fail           # NULL U pointer
00120         mov %edi, Uptr          # remember the U pointer
00121         
00122         mov DstY, %edi          # %edi is left pointing at the Y output
00123         test %edi, %edi
00124         jz param_fail           # NULL Y pointer
00125         
00126         jmp test_width_height   # tail call; its ret returns to our caller
00127         
00128         nop                     # never reached (sits behind the jmp)
00129 
00130 test_width_height:
00131         cmpl $0, Width
00132         jbe param_fail
00133         testl $3, Width         # multiple of 4?
00134         jnz param_fail          # Nope...
00135 
00136         cmp $0, Height          # check illegal height
00137         jbe param_fail
00138         testl $1, Height        # Odd no. of lines?
00139         jnz param_fail          # Aye
00140 
00141         /* fall through */
00142 
00143 /* exit points */
00144 param_ok:
00145         clc                     # Success: clear carry
00146         ret
00147 
00148 param_fail:
00149         stc                     # Fail: set carry
00150         ret
00151 
00152 
00153 
00154 # Fill PixelBuffer with 4 grey scale pixels: R = G = B = Y << 8
00155 # In:           %eax = Value (Y3Y2Y1Y0), %ecx = pixel counter
00156 # Out:          PixelBuffer: 4 x (R, G, B, unused) longs; unused slot not written
00157 # Modifies:     %ecx (-4, provided it was a multiple of 4 on entry)
00158 # Destroys:     %eax, %edx, %edi
00159 expand_4_y:
00160         mov %eax, %edx          # Keep in edx (we need eax)
00161         lea PixelBuffer, %edi   # %edi = start of PixelBuffer
00162         
00163 0:      # Repeats until the low 2 bits of %ecx are zero again (4 times)
00164         movzbl %dl, %eax        # move, zero extending byte-to-long
00165         shl $8, %eax            # 8.8 fixed point (8 fractional bits)
00166         
00167         stosl                   # Red slot
00168         stosl                   # Green slot
00169         stosl                   # Blue slot
00170         add $4, %edi            # Skip the unused 4th slot
00171 
00172         shr $8, %edx            # next Y
00173 
00174         dec %ecx                # one more pixel done
00175         test $3, %ecx           # stop once %ecx is a multiple of 4
00176         jnz 0b
00177 
00178         ret                     # from expand_4_y
00179         
00180 # Add the colour terms to the (grey) values in PixelBuffer, 8.8 fixed point
00181 # In:           %ebx (U1U0V1V0); U0/V0 go to pixels 0-1, U1/V1 to pixels 2-3
00182 # Out:          R += 359*Vs, G -= 183*Vs + 88*Us, B += 454*Us (Xs = X - 128)
00183 # Modifies:
00184 # Destroys:     %edi, %ebx, %eax, %edx
00185 expand_4_uv:
00186         lea PixelBuffer, %edi   # reset pointer
00187 
00188         # V0
00189         sub $128, %bl           # remove the 128 offset; byte is now signed
00190         movsbl %bl, %eax        # sign extend to 32 bits
00191         mov $359, %edx          # Vr (359/256 = 1.402)
00192         mul %edx                # low 32 bits are correct for signed values too
00193         add %eax, 0x00(%edi)    # red of pixel 0
00194         add %eax, 0x10(%edi)    # red of pixel 1
00195         
00196         movsbl %bl, %eax
00197         mov $183, %edx          # Vg (183/256 = 0.715)
00198         mul %edx
00199         sub %eax, 0x04(%edi)    # green of pixel 0
00200         sub %eax, 0x14(%edi)    # green of pixel 1
00201         
00202         # V1
00203         sub $128, %bh
00204         movsbl %bh, %eax
00205         mov $359, %edx          # Vr
00206         mul %edx
00207         add %eax, 0x20(%edi)    # red of pixel 2
00208         add %eax, 0x30(%edi)    # red of pixel 3
00209         
00210         movsbl %bh, %eax
00211         mov $183, %edx          # Vg
00212         mul %edx
00213         sub %eax, 0x24(%edi)    # green of pixel 2
00214         sub %eax, 0x34(%edi)    # green of pixel 3
00215         
00216         # U0
00217         bswap %ebx              # Get U values in lower half: %bh = U0, %bl = U1
00218         sub $128, %bh
00219         movsbl %bh, %eax
00220         mov $88, %edx           # Ug (88/256 = 0.344)
00221         mul %edx
00222         sub %eax, 0x04(%edi)    # green of pixel 0
00223         sub %eax, 0x14(%edi)    # green of pixel 1
00224 
00225         movsbl %bh, %eax
00226         mov $454, %edx          # Ub (454/256 = 1.773)
00227         mul %edx
00228         add %eax, 0x08(%edi)    # blue of pixel 0
00229         add %eax, 0x18(%edi)    # blue of pixel 1
00230         
00231         # U1
00232         sub $128, %bl
00233         movsbl %bl, %eax
00234         mov $88, %edx           # Ug
00235         mul %edx
00236         sub %eax, 0x24(%edi)    # green of pixel 2
00237         sub %eax, 0x34(%edi)    # green of pixel 3
00238         
00239         movsbl %bl, %eax
00240         mov $454, %edx          # Ub
00241         mul %edx
00242         add %eax, 0x28(%edi)    # blue of pixel 2
00243         add %eax, 0x38(%edi)    # blue of pixel 3
00244         ret                     # expand_4_uv
00245 
00246 
00247 /* This function expands 4 420i pixels into PixelBuffer
         In:       %esi = source (4 Y bytes followed by 2 U or V bytes),
                   %ecx = pixel counter; Height parity selects the line type
         Out:      PixelBuffer holds 4 unclamped RGB pixels; %esi += 6; %ecx -= 4
         Destroys: %eax, %ebx, %edx (%edi is preserved) */
00248 do_four_yuvi:
00249         push %edi               # expand_4_y/expand_4_uv overwrite %edi
00250 
00251         lodsl                   # 4 bytes at a time
00252         
00253         call expand_4_y
00254         
00255         # now do UV values. on even lines, Y is followed by U values; on 
00256         # odd lines V values follow. The U and V values are always pushed
00257         # on the stack in this order:
00258         # U V
00259         
00260         # First, calculate offset per line (1.5 * width)
00261         mov Width, %ebx # width
00262         shl %ebx                # 2 *
00263         add Width, %ebx # 3 * 
00264         shr %ebx                # 1.5 *
00265 
00266         # even or odd lines     
00267         testl $1, Height        # Height is the remaining line count
00268         jz 2f
00269 
00270         # odd line; we are at V data, but do U data first
00271         neg %ebx                # make ebx offset negative
00272         mov (%esi,%ebx),%ax     # U (same position, one line up)
00273         push %ax
00274         lodsw                   # V
00275         push %ax
00276         jmp 3f  
00277         
00278 2:      # even line
00279         lodsw                   # U
00280         push %ax
00281         sub $2, %ebx            # lodsw already moved %esi 2 bytes on
00282         mov (%esi,%ebx), %ax    # V (same position, one line down)
00283         push %ax
00284 
00285 3:      # Okay, so we now have the U and V values... expand into PixelBuffer
00286 
00287         pop %ebx                # both 16 bit pushes at once: %ebx = U1U0V1V0
00288         call expand_4_uv
00289 
00290         pop %edi                # restore the destination pointer
00291         ret                     # from do_four_yuvi
00292 
00293 
00294 # Do four pixels, in planar format
      # In:           %esi = Y source, %ecx = pixel counter, Uptr/Vptr = U/V sources
      # Out:          PixelBuffer holds 4 unclamped RGB pixels
      # Modifies:     %esi (+4), %ecx (-4), Uptr (+2), Vptr (+2)
      # Destroys:     %eax, %ebx, %edx (%edi is preserved)
00295 do_four_yuvp:
00296         push %edi               # expand_4_y/expand_4_uv overwrite %edi
00297 
00298         # The first part is the same as for interlaced (4 bytes Y)
00299         lodsl                   # 4 bytes at a time
00300         call expand_4_y
00301         
00302         # now gather U and V values... 
00303         mov Uptr, %ebx          # Use Uptr/Vptr
00304         mov (%ebx), %ax         # U1U0
00305         push %ax
00306         add $2, %ebx
00307         mov %ebx, Uptr          # advance the U pointer by 2 bytes
00308 
00309         mov Vptr, %ebx
00310         mov (%ebx), %ax         # V1V0
00311         push %ax
00312         add $2, %ebx
00313         mov %ebx, Vptr          # advance the V pointer by 2 bytes
00314         
00315         pop %ebx                # both 16 bit pushes at once: %ebx = U1U0V1V0
00316         call expand_4_uv
00317         
00318         pop %edi                # restore the destination pointer
00319         ret
00320 
00321 
00322 # Do four pixels, in yuyv interlaced format
      # In:           %esi = source (bytes y0 u0 y1 v0 y2 u1 y3 v1), %ecx = pixel counter
      # Out:          PixelBuffer holds 4 unclamped RGB pixels
      # Modifies:     %esi (+8), %ecx (-4)
      # Destroys:     %eax, %ebx, %edx (%edi is preserved)
      # The register pictures in the comments list bytes from high to low.
00323 do_four_yuyv:
00324         push %edi               # expand_4_y/expand_4_uv overwrite %edi
00325 
00326         lodsl                   # v0y1u0y0
00327         mov %eax, %ebx
00328         bswap %ebx              # y0u0y1v0
00329         mov %bh, %ah            # v0y1y1y0
00330         and $0x00ff00ff, %ebx   # __u0__v0
00331         push %ax                # y1y0
00332 
00333         lodsl                   # v1y3u1y2      # mix register instructions
00334         mov %eax, %edx                          # so CPU pipeline doesnt stall
00335         rol $16, %eax           # u1y2v1y3      
00336         mov %dl, %dh            # v1y3y2y2
00337         and $0xff00ff00, %eax   # u1__v1__
00338         mov $0, %dl             # v1y3y2__
00339         or %eax, %ebx           # u1u0v1v0
00340         shl $8, %edx            # y3y2____
00341         pop %dx                 # y3y2y1y0
00342         mov %edx, %eax          # expand_4_y takes the Y bytes in %eax
00343         call expand_4_y         # leaves %ebx alone
00344         call expand_4_uv        # takes u1u0v1v0 in %ebx
00345         
00346         pop %edi                # restore the destination pointer
00347         ret
00348 
00349 limit_pixels:
00350         # Clamp all 16 longs in PixelBuffer to the range 0..0xff00
00351         push %esi
00352         push %edi
00353         push %ecx
00354         lea PixelBuffer, %esi   # read cursor
00355         mov %esi, %edi          # write cursor, kept in step with the reads
00356         mov $16, %ecx           # 4 pixels x 4 longs
00357 0:      lodsl
00358         cmp $0, %eax
00359         jl 2f                   # negative: clamp to 0
00360         cmp $0xff00, %eax
00361         jg 3f                   # too large: clamp to 0xff00
00362         add $4, %edi            # in range: leave the value, step over it
00363 1:      loop 0b                 # common loop tail
00364 
00365         pop %ecx
00366         pop %edi
00367         pop %esi
00368         ret                     # from limit_pixels
00369 
00370 2:      mov $0, %eax            # underflow
00371         jmp 4f
00372 3:      mov $0xff00, %eax       # overflow
00373 4:      stosl                   # overwrite with the clamped value
00374         jmp 1b                  # rejoin the common loop tail
00378 
00379 /* The push_* routines copy the 4 pixels held in PixelBuffer to the
00380    destination at %edi, taking bits 8..15 of each component, and advance
00381    %edi past the pixels written. %ecx and %esi are preserved. */
00382 
00383 /* Push 4 pixels (12 bytes), red first */
00384 push_rgb24:
00385         push %ecx
00386         push %esi
00387         lea PixelBuffer, %esi
00388         mov $4, %ecx            # 4 pixels
00389 0:      lodsl
00390         mov %ah, (%edi)         # Red (%ah = bits 8..15 of the value)
00391         lodsl
00392         mov %ah, 1(%edi)        # Green
00393         lodsl
00394         mov %ah, 2(%edi)        # Blue
00395         lodsl                   # step over the unused 4th slot
00396         add $3, %edi            # 3 bytes written per pixel
00397         loop 0b
00398         pop %esi
00399         pop %ecx
00400         ret
00404 
00405 /* Push 4 pixels (12 bytes), blue first */
00406 push_bgr24:
00407         push %ecx
00408         push %esi
00409         lea PixelBuffer, %esi
00410         mov $4, %ecx            # 4 pixels
00411 0:      lodsl
00412         mov %ah, 2(%edi)        # Red goes last (%ah = bits 8..15)
00413         lodsl
00414         mov %ah, 1(%edi)        # Green
00415         lodsl
00416         mov %ah, (%edi)         # Blue goes first
00417         lodsl                   # step over the unused 4th slot
00418         add $3, %edi            # 3 bytes written per pixel
00419         loop 0b
00420         pop %esi
00421         pop %ecx
00422         ret
00426 
00427 /* The simplest format: 4 pixels of 4 bytes each (R, G, B, 4th slot) */
00428 push_rgb32:
00429         push %ecx
00430         push %esi
00431         mov $16, %ecx           # 4 pixels x 4 components
00432         lea PixelBuffer, %esi
00433 0:      lodsl                   # next component: red, green, blue, then the 4th slot
00434         shr $8, %eax            # 8 bit precision
00435         stosb                   # store one byte, %edi moves on by 1
00436         loop 0b
00437         pop %esi
00438         pop %ecx
00439         ret
00440 
00441 
00442 /* Push 4 pixels as B, G, R plus one byte that is skipped, not written */
00443 push_bgr32:
00444         # %edi moves on by 4 per pixel, 16 bytes in total
00445         push %ecx
00446         push %esi
00447         mov $4, %ecx            # 4 pixels
00448         lea PixelBuffer, %esi
00449 0:      lodsl                   # red
00450         mov %ah, 2(%edi)        # %ah = bits 8..15 of the value
00451         lodsl                   # green
00452         mov %ah, 1(%edi)
00453         lodsl                   # blue
00454         mov %ah, (%edi)
00455         lodsl                   # step over the unused 4th slot
00456         add $4, %edi            # next 4 byte pixel
00457         loop 0b 
00458         pop %esi
00459         pop %ecx
00460         ret
00464 
00465 /*************************************/
00466 
00467 /* Functions to go from YUV interlaced formats to RGB */
00468 
00469 /* Go from interlaced to RGB, red first.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00470 
00471 ENTRY(ccvt_420i_rgb24)
00472         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00473         push %ebx               # saved here, restored at 9:
00474         push %esi
00475         push %edi
00476 
00477         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00478         jc 9f
00479         
00480 0:      mov Width, %ecx         # width
00481 1:      call do_four_yuvi       # 4 pixels into PixelBuffer, %ecx -= 4
00482         call limit_pixels       # clamp to 0..0xff00
00483         call push_rgb24         # write 12 bytes at %edi
00484                         
00485         cmp $0, %ecx
00486         jnz 1b                  # end of line?
00487         decl Height             # yes; decrement line counter
00488         jnz 0b
00489 
00490 9:      pop %edi
00491         pop %esi
00492         pop %ebx
00493         leave
00494         ret
00495 
00496 /* Go from interlaced to BGR, blue first.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00497 
00498 ENTRY(ccvt_420i_bgr24)
00499         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00500         push %ebx               # saved here, restored at 9:
00501         push %esi
00502         push %edi
00503 
00504         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00505         jc 9f
00506         
00507 0:      mov Width, %ecx # width
00508 1:      call do_four_yuvi       # 4 pixels into PixelBuffer, %ecx -= 4
00509         call limit_pixels       # clamp to 0..0xff00
00510         call push_bgr24         # write 12 bytes at %edi
00511                         
00512         cmp $0, %ecx
00513         jnz 1b                  # end of line?
00514         decl Height             # yes; decrement line counter
00515         jnz 0b
00516 
00517 9:      pop %edi
00518         pop %esi
00519         pop %ebx
00520         leave
00521         ret
00522 
00523 
00524 /* From interlaced to RGBa.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00525 
00526 ENTRY(ccvt_420i_rgb32)
00527         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00528         push %ebx               # saved here, restored at 9:
00529         push %esi
00530         push %edi
00531 
00532         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00533         jc 9f
00534 
00535 0:      mov Width, %ecx         # width
00536 1:      call do_four_yuvi       # 4 pixels into PixelBuffer, %ecx -= 4
00537         call limit_pixels       # clamp to 0..0xff00
00538         call push_rgb32         # write 16 bytes at %edi
00539                 
00540         cmp $0, %ecx            # end of line?
00541         jnz 1b
00542         decl Height             # yes; decrement line counter
00543         jnz 0b
00544 
00545 9:      pop %edi
00546         pop %esi
00547         pop %ebx
00548         leave
00549         ret
00550 
00551 /* Guess what? Go from interlaced to BGRa.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00552 
00553 ENTRY(ccvt_420i_bgr32)
00554         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00555         push %ebx               # saved here, restored at 9:
00556         push %esi
00557         push %edi
00558 
00559         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00560         jc 9f
00561 
00562 0:      mov Width, %ecx         # width
00563 1:      call do_four_yuvi       # 4 pixels into PixelBuffer, %ecx -= 4
00564         call limit_pixels       # clamp to 0..0xff00
00565         call push_bgr32         # 3 bytes written per pixel, %edi += 16
00566                 
00567         cmp $0, %ecx            # end of line?
00568         jnz 1b
00569         decl Height             # yes; decrement line counter
00570         jnz 0b
00571 
00572 9:      pop %edi
00573         pop %esi
00574         pop %ebx
00575         leave
00576         ret
00577 
00578 
00579 
00580 
00581 /* From YUYV to RGBa.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00582 
00583 ENTRY(ccvt_yuyv_rgb32)
00584         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00585         push %ebx               # saved here, restored at 9:
00586         push %esi
00587         push %edi
00588 
00589         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00590         jc 9f
00591         
00592 0:      mov Width, %ecx         # width
00593 1:      call do_four_yuyv       # 8 source bytes into PixelBuffer, %ecx -= 4
00594         call limit_pixels       # clamp to 0..0xff00
00595         call push_rgb32         # write 16 bytes at %edi
00596                 
00597         cmp $0, %ecx            # end of line?
00598         jnz 1b
00599 
00600 8:      decl Height             # yes; decrement line counter
00601         jnz 0b
00602 
00603 9:      pop %edi
00604         pop %esi
00605         pop %ebx
00606         leave
00607         ret
00608 
00609 /* From YUYV to BGRa.
         Stack arguments: width, height, src, dst. Returns without converting
         anything when test_param_2 rejects the arguments. */
00610 ENTRY(ccvt_yuyv_bgr32)
00611         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00612         push %ebx               # saved here, restored at 9:
00613         push %esi
00614         push %edi
00615 
00616         call test_param_2       # %esi = src, %edi = dst, carry set on bad arguments
00617         jc 9f
00618         
00619         # YUYV -> BGRa BGRa
00620 
00621 0:      mov Width, %ecx         # width
00622 1:      call do_four_yuyv       # 8 source bytes into PixelBuffer, %ecx -= 4
00623         call limit_pixels       # clamp to 0..0xff00
00624         call push_bgr32         # 3 bytes written per pixel, %edi += 16
00625                 
00626         cmp $0, %ecx            # end of line?
00627         jnz 1b
00628 
00629 8:      decl Height             # yes; decrement line counter
00630         jnz 0b
00631 
00632 9:      pop %edi
00633         pop %esi
00634         pop %ebx
00635         leave
00636         ret
00637 
00638 
00639 
00640 
00641 /* Planar to RGBa.
         Stack arguments: width, height, srcy, srcu, srcv, dst. Returns without
         converting anything when test_param_31 rejects the arguments.
         Each U/V line is used for two Y lines: after the first line of a pair
         the U/V pointers are moved back by width / 2. */
00642 
00643 ENTRY(ccvt_420p_rgb32)
00644         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00645         push %ebx               # saved here, restored at 9:
00646         push %esi
00647         push %edi
00648         
00649         call test_param_31      # %esi = srcy, %edi = dst, carry set on bad arguments
00650         jc 9f
00651 
00654         mov SrcU, %eax          # Copy U/V pointers
00655         mov %eax, Uptr
00656         mov SrcV, %eax
00657         mov %eax, Vptr
00658 
00659 0:      mov Width, %ecx         # width
00660 1:      call do_four_yuvp       # 4 pixels into PixelBuffer, %ecx -= 4
00661         call limit_pixels       # clamp to 0..0xff00
00662         call push_rgb32         # write 16 bytes at %edi
00663                 
00664         cmp $0, %ecx            # end of line?
00665         jnz 1b
00666 
00667         testl $1, Height        # odd/even line
00668         jnz 8f
00669         
00670         mov Width, %eax         # Even: rewind U/V pointers
00671         shr %eax                # one line of U or V is width / 2 bytes
00672         sub %eax, Uptr
00673         sub %eax, Vptr
00674 
00675 8:      decl Height             # yes; decrement line counter
00676         jnz 0b
00677 
00678 9:      pop %edi
00679         pop %esi
00680         pop %ebx
00681         leave
00682         ret
00683 
00684 /* Planar to BGRa. Okay... eventually, you end up with a very complete set
00685    of conversion routines. I just wished things were a bit simpler.
         Stack arguments: width, height, srcy, srcu, srcv, dst. Returns without
         converting anything when test_param_31 rejects the arguments.
         Each U/V line is used for two Y lines: after the first line of a pair
         the U/V pointers are moved back by width / 2. */
00686 
00687 ENTRY(ccvt_420p_bgr32)
00688         enter $72, $0           # 72 bytes of locals (PixelBuffer, Uptr, Vptr), nesting level 0
00689         push %ebx               # saved here, restored at 9:
00690         push %esi
00691         push %edi
00692         
00693         call test_param_31      # %esi = srcy, %edi = dst, carry set on bad arguments
00694         jc 9f
00695         
00698         mov SrcU, %eax          # Copy U/V pointers
00699         mov %eax, Uptr
00700         mov SrcV, %eax
00701         mov %eax, Vptr
00702 
00703 0:      mov Width, %ecx         # width
00704 1:      call do_four_yuvp       # 4 pixels into PixelBuffer, %ecx -= 4
00705         call limit_pixels       # clamp to 0..0xff00
00706         call push_bgr32         # 3 bytes written per pixel, %edi += 16
00707                 
00708         cmp $0, %ecx            # end of line?
00709         jnz 1b
00710 
00711         testl $1, Height        # odd/even line
00712         jnz 8f
00713         
00714         mov Width, %eax         # Even: rewind U/V pointers
00715         shr %eax                # one line of U or V is width / 2 bytes
00716         sub %eax, Uptr
00717         sub %eax, Vptr
00718 
00719 8:      decl Height             # yes; decrement line counter
00720         jnz 0b
00721 
00722 9:      pop %edi
00723         pop %esi
00724         pop %ebx
00725         leave
00726         ret
00727 
00728 
00729 
00730 
00731 /* Go from RGB (red first) to 4:2:0 planar.
00732  * Note: this requires decimation of the U/V space by 2 in both directions 
00733  * Also, a matrix multiply would be QUITE convenient...
00734 
00735    This is the matrix (coefficients scaled by 256):
00736      (Y )   ( 77  150   29)   (R)
00737      (Cb) = (-43  -85  128) * (G)
00738      (Cr)   (128 -107  -21)   (B)
         Stack arguments: width, height, src, dsty, dstu, dstv. Returns without
         converting anything when test_param_13 rejects the arguments.
00739  */
00740 
00741 ENTRY(ccvt_rgb24_420p)
00742         enter $96, $0           # 96 bytes of locals: the usual 72 plus 24 for the slots below
00743         push %ebx               #  -76: line width in bytes
00744         push %esi               #  -80: height (copy)
00745         push %edi               #  -84: width (copy)
00746                                 #  -88: red factor
00747                                 #  -92: green factor
00748                                 #  -96: blue factor
00749         call test_param_13      # %esi = src, %edi = dsty, carry set on bad arguments
00750         jc 9f
00751 
00752         mov Width, %eax
00753         shl %eax
00754         add Width, %eax         # 3 * width = line increment
00755         mov %eax, -76(%ebp)
00756 
00757         mov Height, %eax
00758         mov %eax, -80(%ebp)     # copy height into stackframe
00759         
00760         /*
00761           This is a bit complicated... since U/V decimation is taking 
00762           place both in horizontal and vertical direction, we have to
00763           process 2 lines in parallel. Also, 2 adjacent pixels are
00764           considered. We average the U/V values over these 4 pixels
00765           (of course, we could have just taken the U/V value of the first
00766           pixel and be done with it, but that's not how we do things around
00767           here)
00768          */
00769         
00770         # 1st pass: Y values. Set factors       
00771         movl $77 , -88(%ebp)    # 0.299
00772         movl $150, -92(%ebp)    # 0.587
00773         movl $29 , -96(%ebp)    # 0.114
00774 
00775 0:      mov Width, %ecx         # width
00776 1:      xor %ebx, %ebx          # 0
00777         call rgb_multiply       # %ebx = weighted sum of one pixel, %esi += 3
00778         shr $8, %ebx            # divide by 256 (no need for limitor, since 77 + 150 + 29 = 256)
00779         mov %bl, %al
00780         stosb                   # store it into Y buffer
00781         
00782         dec %ecx                # end of line?
00783         jnz 1b
00784         decl -80(%ebp)          # end of image?
00785         jnz 0b
00786 
00787         # Okay, now the U/V pointers... 
00788         # The following code is passed twice, with different factors
00789         # Note that the %esi pointer jumps around quite a bit
00790 
00791         # factors for U
00792         movl $-43, -88(%ebp)    # -0.1687
00793         movl $-85, -92(%ebp)    # -0.3313
00794         movl $128, -96(%ebp)    # 0.5
00795         mov DstU, %edi          # Set %edi register now
00796                 
00797 7:      mov Src4, %esi          # Rewind source pointer
00798 
00799         mov Height, %eax        # height
00800         shr %eax                #  / 2
00801         mov %eax, -80(%ebp)     #   copy
00802 
00803 2:      mov Width, %eax         # width
00804         shr %eax                #  / 2
00805         mov %eax, -84(%ebp)     #   copy
00806 
00807 3:      xor %ebx, %ebx          # 0
00808         mov $4, %ecx            # average over 4 pixels
00809 
00810 4:      call rgb_multiply       # add one pixel to %ebx, %esi += 3
00811 
00812         dec %ecx
00813         jz 5f                   # done?
00814         cmp $2, %ecx            # 3rd pixel.. move %esi to next line, with offset
00815         jne 4b
00816         sub $6, %esi            # backup to where we started
00817         add -76(%ebp), %esi     # add line increment
00818         jmp 4b
00819 
00820 5:      # okay, 4 pixels done... 
00821         sub -76(%ebp), %esi     # Get %esi back to its proper place
00822 
00823         add $0x20000, %ebx      # 128 << 10: becomes the +128 offset after the shift
00824         shr $10, %ebx           # Divide by 4 * 256
00825         mov %bl, %al
00826         stosb                   # store it!
00827 
00828         decl -84(%ebp)          # end of line?
00829         jnz 3b
00830         add -76(%ebp), %esi     # %esi to next line (actually, 2 lines further)
00831         decl -80(%ebp)          # end of image?
00832         jnz 2b
00833 
00834         # check if 3rd pass has been done
00835         cmpl $128, -88(%ebp)    # the red factor is 128 only during the V pass
00836         je 9f                   # Done!
00837         # Set factors for V pass
00838         movl $128 , -88(%ebp)   # 0.5
00839         movl $-107, -92(%ebp)   # -0.4187
00840         movl $-21 , -96(%ebp)   # -0.0813
00841         mov DstV, %edi          # %edi to V buffer
00842         jmp 7b                  # "Do it to me one more time..."
00843 
00844 9:      pop %edi
00845         pop %esi
00846         pop %ebx
00847         leave
00848         ret
00849 
00850 
00851 
00852 
00853 ENTRY(ccvt_bgr24_420p)
00854         enter $96, $0           # 24 bytes extra stack, no stackframes
00855         push %ebx               #   -4: line width in bytes
00856         push %esi               #   -8: height (copy)
00857         push %edi               #  -12: width (copy)
00858                                 #  -16: red factor
00859                                 #  -20: green factor
00860                                 #  -24: blue factor
00861         call test_param_13
00862         jc 9f
00863 
00864         /* No surprise, this code looks just like rgb24_420p, but with swapped factors */
00865          
00866         mov Width, %eax
00867         shl %eax
00868         add Width, %eax         # 3 * width = line increment
00869         mov %eax, -76(%ebp)
00870 
00871         mov Height, %eax
00872         mov %eax, -80(%ebp)     # copy height into stackframe
00873         
00874         # 1st pass: Y values. Set factors       
00875         movl $29 , -88(%ebp)    # 0.114
00876         movl $150, -92(%ebp)    # 0.587
00877         movl $77 , -96(%ebp)    # 0.299
00878 
00879 0:      mov Width, %ecx         # width
00880 1:      xor %ebx, %ebx          # 0
00881         call rgb_multiply
00882         shr $8, %ebx            # divide by 256 (no need for limitor, since 77 + 150 + 29 = 256)
00883         mov %bl, %al
00884         stosb                   # store it into Y buffer
00885         
00886         dec %ecx                # end of line?
00887         jnz 1b
00888         decl -80(%ebp)          # end of image?
00889         jnz 0b
00890 
00891         # Okay, now the U/V pointers... 
00892         # The following code is passed twice, with different factors
00893         # Note that the %esi pointer jumps around quite a bit
00894 
00895         # factors for U
00896         movl $123, -88(%ebp)    #  0.5
00897         movl $-85, -92(%ebp)    # -0.3313
00898         movl $-43, -96(%ebp)    # -0.1687
00899         mov DstU, %edi          # Set %edi register now
00900                 
00901 7:      mov Src4, %esi          # Rewind source pointer
00902 
00903         mov Height, %eax        # height
00904         shr %eax                #  / 2
00905         mov %eax, -80(%ebp)     #   copy
00906 
00907 2:      mov Width, %eax         # width
00908         shr %eax                #  / 2
00909         mov %eax, -84(%ebp)     #   copy
00910 
00911 3:      xor %ebx, %ebx          # 0
00912         mov $4, %ecx            # average over 4 pixels
00913 
00914 4:      call rgb_multiply
00915 
00916         dec %ecx
00917         jz 5f                   # done?
00918         cmp $2, %ecx            # 3rd pixel.. move %esi to next line, with offset
00919         jne 4b
00920         sub $6, %esi            # backup to where we started
00921         add -76(%ebp), %esi     # add line increment
00922         jmp 4b
00923 
00924 5:      # okay, 4 pixels done... 
00925         sub -76(%ebp), %esi     # Get %esi back to its proper place
00926 
00927         add $0x20000, %ebx      # add 0.5 factor
00928         shr $10, %ebx           # Divide by 4 * 256
00929         mov %bl, %al
00930         stosb                   # store it!
00931 
00932         decl -84(%ebp)          # end of line?
00933         jnz 3b
00934         add -76(%ebp), %esi     # %esi to next line (actually, 2 lines further)
00935         decl -80(%ebp)          # end of image?
00936         jnz 2b
00937 
00938         # check if 3rd pass has been done
00939         cmpl $-21, -88(%ebp)
00940         je 9f                   # Done!
00941         # Set factors for V pass
00942         movl $-21 , -88(%ebp)   # -0.0813
00943         movl $-107, -92(%ebp)   # -0.4187
00944         movl $128 , -96(%ebp)   #  0.5
00945         mov DstV, %edi          # %edi to V buffer
00946         jmp 7b                  # "Do it to me one more time..."
00947 
00948 9:      pop %edi
00949         pop %esi
00950         pop %ebx
00951         leave
00952         ret
00953 
00954 
00955 /* RGB-to-YUV helper functions */
00956 
00957 rgb_multiply:
00958         # do one RGB vector multiplication; its assumed the RGB factors
00959         # are set on the stack. The data is accumulated in ebx.
      # In:           %esi = 3 source bytes, %ebx = running sum,
      #               factors for byte 1/2/3 at -88/-92/-96(%ebp)
      # Out:          %ebx += b1*f1 + b2*f2 + b3*f3, %esi += 3
      # Destroys:     %eax, %edx
      # The byte names below follow the RGB callers; the BGR caller stores its
      # factors in the matching (blue, green, red) order instead.
00960         lodsb                   # red byte
00961         and $0xff, %eax         # lodsb only wrote %al; clear the rest
00962         mov -88(%ebp), %edx     # red factor
00963         mul %edx                # low 32 bits are also right for negative factors
00964         add %eax, %ebx
00965         lodsb                   # green byte
00966         and $0xff, %eax
00967         mov -92(%ebp), %edx     # green factor
00968         mul %edx
00969         add %eax, %ebx
00970         lodsb                   # blue byte
00971         and $0xff, %eax
00972         mov -96(%ebp), %edx     # blue factor
00973         mul %edx
00974         add %eax, %ebx          # ebx now contains sum
00975         ret
00976 
00977 
00978 
00979 /**************************************************************************/
00980 
00981 
00982 /* Go from 'interlaced' (YYYY UU/VV) format to planar */
00983 
00984 ENTRY(ccvt_420i_420p)
00985         enter $76, $0           # 76 bytes of locals, nesting level 0; the code below only touches -76(%ebp)
00986         push %ebx               # save %ebx/%esi/%edi; restored at label 9 (width / 4 lives at -76(%ebp), not at -4)
00987         push %esi
00988         push %edi
00989 
00990         call test_param_13      # defined outside this chunk; presumably validates the arguments and loads %esi (source) and %edi (Y destination) - confirm
00991         jc 9f                   # carry set on return: skip the conversion and just return
00992 
00993         # Okay, this is fairly easy... we first grab the Y values (4 bytes
00994         #  at a time), then rewind and do the U values, and repeat for V.
00995         #  This leaves us with a nice planar format
00996         #  Source layout used below: groups of 4 Y bytes + 2 chroma bytes, so one line is Width * 1.5 bytes; U on even lines, V on odd lines.
00997         mov Width, %eax
00998         shr %eax
00999         shr %eax                # width / 4
01000         mov %eax, -76(%ebp)     # Store: number of 6-byte groups per source line
01001 
01002         # Y pass: every source line contributes Width bytes to the Y plane
01003         mov Height, %edx        # line counter
01004 0:      mov -76(%ebp), %ecx     # groups left on this line; NOTE(review): a zero count would make 'loop' wrap - assumes Width of at least 4 was checked; confirm
01005 1:      lodsl                   # get 4 bytes...
01006         stosl                   # ...push 4 bytes
01007         add $2, %esi            # Skip U or V
01008         loop 1b                 # %ecx--, repeat until the line is done
01009         dec %edx
01010         jnz 0b                  # next line
01011 
01012         # U pass: one output row per pair of source lines
01013         mov Src4, %esi          # rewind source pointer
01014         mov DstU, %edi
01015         add $4, %esi            # set to U (skip the first 4 Y bytes)
01016         mov Height, %edx
01017         shr %edx                # height / 2; NOTE(review): would be 0 for Height == 1 and the dec/jnz below would wrap - assumes that was rejected earlier; confirm
01018         mov Width, %ebx
01019         shl %ebx
01020         add Width, %ebx
01021         shr %ebx                # Width * 1.5 (line offset)
01022 
01023 2:      mov -76(%ebp), %ecx     # width / 4
01024 3:      lodsw                   # 2 bytes at a time
01025         stosw
01026         add $4, %esi            # skip Y
01027         loop 3b                 # after the last group %esi is exactly one line further
01028         add %ebx, %esi          # Skip line (U is on even lines)
01029         dec %edx
01030         jnz 2b
01031         
01032         # V pass: same walk as U, started one source line down
01033         mov Src4, %esi          # rewind, set to V in first odd line
01034         add $4, %esi
01035         add %ebx, %esi          # register re-use; no compiler can beat that :) (%ebx still holds Width * 1.5)
01036         mov DstV, %edi          # V ptr
01037         mov Height, %edx
01038         shr %edx                # height / 2
01039         
01040 4:      mov -76(%ebp), %ecx     # Get width/4
01041 5:      lodsw                   # 2 V bytes
01042         stosw
01043         add $4, %esi            # Skip Y
01044         loop 5b
01045         add %ebx, %esi          # Skip line (V is on odd lines)
01046         dec %edx
01047         jnz 4b
01048         
01049         /* That's it! */
01050         
01051 9:      pop %edi                # common exit, also reached when the parameter test fails
01052         pop %esi
01053         pop %ebx
01054         leave
01055         ret
01056 
01057 
01058 /* Go from 4:2:0 interlaced to 'normal' YUYV */
01059 
01060 ENTRY(ccvt_420i_yuyv)
01061         enter $80, $0           # 80 bytes of locals, nesting level 0; the code below only touches -76(%ebp) and -80(%ebp)
01062         push %ebx               # save %ebx/%esi/%edi; restored at label 9
01063         push %esi
01064         push %edi
01065 
01066         call test_param_2       # defined outside this chunk; presumably validates the arguments and loads %esi/%edi from Src2/Dst2 - confirm
01067         jc 9f                   # carry set on return: skip the conversion and just return
01068         
01069         mov Width, %ecx         # -76(%ebp): width / 4 = no. loops per line
01070         shr %ecx
01071         shr %ecx
01072         mov %ecx, -76(%ebp)
01073 
01074         mov Width, %ebx         # -80(%ebp): width * 1.5 = line offset
01075         shl %ebx
01076         add Width, %ebx
01077         shr %ebx
01078         mov %ebx, -80(%ebp)
01079         
01080         # Okay, this requires a bit of byte shuffling... we go from
01081         #  YYYY UU
01082         #  YYYY VV
01083         # to
01084         #  YUYV YUYV
01085         #  YUYV YUYV
01086         # which indeed takes up more space
01087 
01088         # The Height argument slot doubles as the line down-counter and as the even/odd selector.
01089         
01090 0:      mov -76(%ebp), %ecx     # groups of 4 pixels on this line
01091 
01092 1:      lodsl                   # 4 Y in eax; %esi now points at this group's 2 chroma bytes
01093         testl $1, Height        # even or odd line? Parity of the lines still to do; NOTE(review): matches the line index only if Height started even - confirm it is checked
01094         jnz 2f
01095         
01096         # Even: U is here, V sits at the same position one source line below
01097         mov -80(%ebp), %ebx
01098         mov (%ebx, %esi), %dx   # 16 bits V 
01099         shl $16, %edx           # store in high word
01100         mov (%esi), %dx         # 16 bits U 
01101         add $2, %esi            # step over the chroma pair
01102         jmp 3f
01103         
01104 2:      # Odd: V is here, U sits at the same position one source line above
01105         mov -80(%ebp), %ebx
01106         neg %ebx                # negative offset
01107         mov (%esi), %dx         # 16 bits V
01108         shl $16, %edx           # store in high word
01109         mov (%ebx, %esi), %dx   # 16 bits U
01110         add $2, %esi            # step over the chroma pair
01111 
01112 3:      # eax = Y3Y2Y1Y0, edx = V1V0U1U0, ebx is free
01113         push %eax               # keep the 4 Y bytes for the second half
01114 
01115         movzbl %al, %ebx        # ______y0
01116         and $0xFF00, %eax       # ____y1__
01117         shl $8, %eax            # __y1____
01118         or %ebx, %eax           # __y1__y0
01119         mov %edx, %ebx          # v1v0u1u0
01120         shl $8, %ebx            # v0u1u0__
01121         and $0xff00ff00, %ebx   # v0__u0__
01122         or %ebx, %eax           # v0y1u0y0
01123         stosl                   # stored low byte first: y0 u0 y1 v0
01124 
01125         pop %eax                # y3y2y1y0
01126         # Second half
01127         shr $8, %eax            # __y3y2y1
01128         shr $8, %ax             # __y3__y2
01129         and $0xff00ff00, %edx   # v1__u1__
01130         or %edx, %eax           # v1y3u1y2
01131         stosl                   # stored low byte first: y2 u1 y3 v1
01132         
01133         loop 1b                 # %ecx--, next group of 4 pixels on this line
01134 
01135 
01136         decl Height             # height-- (counts down in the argument slot at 12(%ebp) itself)
01137         jnz 0b
01138         # Done
01139 
01140 9:      pop %edi                # common exit, also reached when the parameter test fails
01141         pop %esi
01142         pop %ebx
01143         leave
01144         ret

Generated at Fri Aug 13 17:29:20 2004 for libRTImage by doxygen1.2.8.1 written by Dimitri van Heesch, © 1997-2001