From 1d00d9947067da76ac4d8d0a6b9ef2c28e73349e Mon Sep 17 00:00:00 2001
From: Dana Jansens <danakj@orodu.net>
Date: Sun, 10 Feb 2008 16:49:16 -0500
Subject: use memcpy's to make splitvertical gradient much faster - using log n
 memcpy's is much quicker than setting a pointer value n times

Here are some profiling results.  splitvertical1 is the original code, splitvertical2 is some slight improvements in locality for it, and splitvertical3 is the new O(log n) memcpy code

  %   cumulative   self              self     total
 time   seconds   seconds    calls  ms/call  ms/call  name
 49.44      0.88     0.88     1063     0.83     0.83  gradient_splitvertical1
 47.19      1.72     0.84     1063     0.79     0.79  gradient_splitvertical2
  2.81      1.77     0.05     1063     0.05     0.05  gradient_splitvertical3

i also tested this with 'time' to draw 1000 gradients, and the new code used approximately half the user time, and finished 10 seconds quicker.  so yeah, it's magical and works well.
---
 render/gradient.c | 78 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 19 deletions(-)

(limited to 'render/gradient.c')

diff --git a/render/gradient.c b/render/gradient.c
index 6439b301..bbd2a5c9 100644
--- a/render/gradient.c
+++ b/render/gradient.c
@@ -425,8 +425,7 @@ static void gradient_splitvertical(RrAppearance *a, gint w, gint h)
 {
     gint x, y1, y2, y3;
     RrSurface *sf = &a->surface;
-    RrPixel32 *data = sf->pixel_data;
-    RrPixel32 current;
+    RrPixel32 *data, *start;
     gint y1sz, y2sz, y3sz;
 
     VARS(y1);
@@ -455,28 +454,69 @@ static void gradient_splitvertical(RrAppearance *a, gint w, gint h)
     }
     SETUP(y3, sf->secondary, sf->split_secondary,  y3sz);
 
-    for (y1 = y1sz; y1 > 0; --y1) {
-        current = COLOR(y1);
-        for (x = w - 1; x >= 0; --x)
-            *(data++) = current;
+    /* find the color for the first pixel of each row first */
+    data = sf->pixel_data;
 
+    for (y1 = y1sz-1; y1 > 0; --y1) {
+        *data = COLOR(y1);
+        data += w;
         NEXT(y1);
     }
-
-    for (y2 = y2sz; y2 > 0; --y2) {
-        current = COLOR(y2);
-        for (x = w - 1; x >= 0; --x)
-            *(data++) = current;
-
+    *data = COLOR(y1);
+    data += w;
+    for (y2 = y2sz-1; y2 > 0; --y2) {
+        *data = COLOR(y2);
+        data += w;
         NEXT(y2);
     }
+    *data = COLOR(y2);
+    data += w;
+    for (y3 = y3sz-1; y3 > 0; --y3) {
+        *data = COLOR(y3);
+        data += w;
+        NEXT(y3);
+    }
+    *data = COLOR(y3);
 
-    for (y3 = y3sz; y3 > 0; --y3) {
-        current = COLOR(y3);
-        for (x = w - 1; x >= 0; --x)
-            *(data++) = current;
+    /* copy the first pixels into the whole rows */
 
-        NEXT(y3);
+    start = sf->pixel_data;
+    data = start + 1;
+
+    for (y1 = h; y1 > 0; --y1) {
+        /* for really small things, just copy ourselves */
+        if (w < 8) {
+            for (x = w-1; x > 0; --x)
+                *(data++) = *start;
+        }
+        /* for >= 8, then use O(log n) memcpy's... */
+        else {
+            gint len = 4;
+            gint lenbytes = 4 * sizeof(RrPixel32);
+
+            /* copy the first 3 * 32 bits (3 words) ourselves - then we have
+               3 + the original 1 = 4 words to make copies of at a time
+
+               this is faster than doing memcpy for 1 or 2 words at a time
+            */
+            for (x = 3; x > 0; --x)
+                *(data++) = *start;
+
+            for (x = w - 4; x > 0;) {
+                memcpy(data, start, lenbytes);
+                x -= len;
+                data += len;
+                len <<= 1;
+                lenbytes <<= 1;
+                if (len > x) {
+                    len = x;
+                    lenbytes = x * sizeof(RrPixel32);
+                }
+            }
+        }
+
+        start += w;
+        ++data;
     }
 }
 
@@ -551,13 +591,13 @@ static void gradient_vertical(RrSurface *sf, gint w, gint h)
 
     for (y = h - 1; y > 0; --y) {  /* 0 -> h-1 */
         current = COLOR(y);
-        for (x = w - 1; x >= 0; --x)  /* 0 -> w */
+        for (x = w; x > 0; --x)  /* 0 -> w */
             *(data++) = current;
 
         NEXT(y);
     }
     current = COLOR(y);
-    for (x = w - 1; x >= 0; --x)  /* 0 -> w */
+    for (x = w; x > 0; --x)  /* 0 -> w */
         *(data++) = current;
 }
 
-- 
cgit v1.2.3


From 3611c8210cc632c2a21c67ccbf40857342c10371 Mon Sep 17 00:00:00 2001
From: Dana Jansens <danakj@orodu.net>
Date: Sun, 10 Feb 2008 17:06:18 -0500
Subject: use memcpy's to speed up vertical gradients too.  split the fancy
 memcpy() code out into the repeat_pixel function.

---
 render/gradient.c | 115 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 49 deletions(-)

(limited to 'render/gradient.c')

diff --git a/render/gradient.c b/render/gradient.c
index bbd2a5c9..d1d90e41 100644
--- a/render/gradient.c
+++ b/render/gradient.c
@@ -197,6 +197,52 @@ static void create_bevel_colors(RrAppearance *l)
     l->surface.bevel_dark = RrColorNew(l->inst, r, g, b);
 }
 
+/*! Repeat the first pixel over the entire block of memory
+  @param start The block of memory. start[0] will be copied
+         to the rest of the block.
+  @param w The width of the block of memory (including the already-set first
+           element
+*/
+static inline void repeat_pixel(RrPixel32 *start, gint w)
+{
+    gint x;
+    RrPixel32 *dest;
+
+    dest = start + 1;
+
+    /* for really small things, just copy ourselves */
+    if (w < 8) {
+        for (x = w-1; x > 0; --x)
+            *(dest++) = *start;
+    }
+
+    /* for >= 8, then use O(log n) memcpy's... */
+    else {
+        gint len = 4;
+        gint lenbytes = 4 * sizeof(RrPixel32);
+
+        /* copy the first 3 * 32 bits (3 words) ourselves - then we have
+           3 + the original 1 = 4 words to make copies of at a time
+
+           this is faster than doing memcpy for 1 or 2 words at a time
+        */
+        for (x = 3; x > 0; --x)
+            *(dest++) = *start;
+
+        for (x = w - 4; x > 0;) {
+            memcpy(dest, start, lenbytes);
+            x -= len;
+            dest += len;
+            len <<= 1;
+            lenbytes <<= 1;
+            if (len > x) {
+                len = x;
+                lenbytes = x * sizeof(RrPixel32);
+            }
+        }
+    }
+}
+
 static void gradient_parentrelative(RrAppearance *a, gint w, gint h)
 {
     RrPixel32 *source, *dest;
@@ -423,9 +469,9 @@ static void gradient_solid(RrAppearance *l, gint w, gint h)
 
 static void gradient_splitvertical(RrAppearance *a, gint w, gint h)
 {
-    gint x, y1, y2, y3;
+    gint y1, y2, y3;
     RrSurface *sf = &a->surface;
-    RrPixel32 *data, *start;
+    RrPixel32 *data;
     gint y1sz, y2sz, y3sz;
 
     VARS(y1);
@@ -479,44 +525,10 @@ static void gradient_splitvertical(RrAppearance *a, gint w, gint h)
     *data = COLOR(y3);
 
     /* copy the first pixels into the whole rows */
-
-    start = sf->pixel_data;
-    data = start + 1;
-
+    data = sf->pixel_data;
     for (y1 = h; y1 > 0; --y1) {
-        /* for really small things, just copy ourselves */
-        if (w < 8) {
-            for (x = w-1; x > 0; --x)
-                *(data++) = *start;
-        }
-        /* for >= 8, then use O(log n) memcpy's... */
-        else {
-            gint len = 4;
-            gint lenbytes = 4 * sizeof(RrPixel32);
-
-            /* copy the first 3 * 32 bits (3 words) ourselves - then we have
-               3 + the original 1 = 4 words to make copies of at a time
-
-               this is faster than doing memcpy for 1 or 2 words at a time
-            */
-            for (x = 3; x > 0; --x)
-                *(data++) = *start;
-
-            for (x = w - 4; x > 0;) {
-                memcpy(data, start, lenbytes);
-                x -= len;
-                data += len;
-                len <<= 1;
-                lenbytes <<= 1;
-                if (len > x) {
-                    len = x;
-                    lenbytes = x * sizeof(RrPixel32);
-                }
-            }
-        }
-
-        start += w;
-        ++data;
+        repeat_pixel(data, w);
+        data += w;
     }
 }
 
@@ -582,23 +594,28 @@ static void gradient_mirrorhorizontal(RrSurface *sf, gint w, gint h)
 
 static void gradient_vertical(RrSurface *sf, gint w, gint h)
 {
-    gint x, y;
-    RrPixel32 *data = sf->pixel_data;
-    RrPixel32 current;
+    gint y;
+    RrPixel32 *data;
 
     VARS(y);
     SETUP(y, sf->primary, sf->secondary, h);
 
-    for (y = h - 1; y > 0; --y) {  /* 0 -> h-1 */
-        current = COLOR(y);
-        for (x = w; x > 0; --x)  /* 0 -> w */
-            *(data++) = current;
+    /* find the color for the first pixel of each row first */
+    data = sf->pixel_data;
 
+    for (y = h - 1; y > 0; --y) {  /* 0 -> h-1 */
+        *data = COLOR(y);
+        data += w;
         NEXT(y);
     }
-    current = COLOR(y);
-    for (x = w; x > 0; --x)  /* 0 -> w */
-        *(data++) = current;
+    *data = COLOR(y);
+
+    /* copy the first pixels into the whole rows */
+    data = sf->pixel_data;
+    for (y = h; y > 0; --y) {
+        repeat_pixel(data, w);
+        data += w;
+    }
 }
 
 
-- 
cgit v1.2.3


From 9c729986844a65545f7736d053359ad24d2df120 Mon Sep 17 00:00:00 2001
From: Dana Jansens <danakj@orodu.net>
Date: Sun, 10 Feb 2008 18:08:05 -0500
Subject: a small optimization for the vertical gradients, and use the same
 log(n) strategy to use less memcpy's for filling out the horizontal gradients

---
 render/gradient.c | 79 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 19 deletions(-)

(limited to 'render/gradient.c')

diff --git a/render/gradient.c b/render/gradient.c
index d1d90e41..d7401a98 100644
--- a/render/gradient.c
+++ b/render/gradient.c
@@ -218,8 +218,8 @@ static inline void repeat_pixel(RrPixel32 *start, gint w)
 
     /* for >= 8, then use O(log n) memcpy's... */
     else {
-        gint len = 4;
-        gint lenbytes = 4 * sizeof(RrPixel32);
+        gchar *cdest;
+        gint lenbytes;
 
         /* copy the first 3 * 32 bits (3 words) ourselves - then we have
            3 + the original 1 = 4 words to make copies of at a time
@@ -229,16 +229,38 @@ static inline void repeat_pixel(RrPixel32 *start, gint w)
         for (x = 3; x > 0; --x)
             *(dest++) = *start;
 
-        for (x = w - 4; x > 0;) {
-            memcpy(dest, start, lenbytes);
-            x -= len;
-            dest += len;
-            len <<= 1;
+        /* cdest is a pointer to the pixel data that is typed char* so that
+           adding 1 to its position moves it only one byte
+
+           lenbytes is the amount of bytes that we will be copying each
+           iteration.  this doubles each time through the loop.
+
+           x is the number of bytes left to copy into.  lenbytes will alwaysa
+           be bounded by x
+
+           this loop will run O(log n) times (n is the number of bytes we
+           need to copy into), since the size of the copy is doubled each
+           iteration.  it seems that gcc does some nice optimizations to make
+           this memcpy very fast on hardware with support for vector operations
+           such as mmx or see.  here is an idea of the kind of speed up we are
+           getting by doing this (splitvertical3 switches from doing
+           "*(data++) = color" n times to doing this memcpy thing log n times:
+
+           %   cumulative   self              self     total           
+           time   seconds   seconds    calls  ms/call  ms/call  name    
+           49.44      0.88     0.88     1063     0.83     0.83  splitvertical1
+           47.19      1.72     0.84     1063     0.79     0.79  splitvertical2
+            2.81      1.77     0.05     1063     0.05     0.05  splitvertical3
+        */
+        cdest = (gchar*)dest;
+        lenbytes = 4 * sizeof(RrPixel32);
+        for (x = (w - 4) * sizeof(RrPixel32); x > 0;) {
+            memcpy(cdest, start, lenbytes);
+            x -= lenbytes;
+            cdest += lenbytes;
             lenbytes <<= 1;
-            if (len > x) {
-                len = x;
-                lenbytes = x * sizeof(RrPixel32);
-            }
+            if (lenbytes > x)
+                lenbytes = x;
         }
     }
 }
@@ -534,12 +556,14 @@ static void gradient_splitvertical(RrAppearance *a, gint w, gint h)
 
 static void gradient_horizontal(RrSurface *sf, gint w, gint h)
 {
-    gint x, y;
+    gint x, y, cpbytes;
     RrPixel32 *data = sf->pixel_data, *datav;
+    gchar *datac;
 
     VARS(x);
     SETUP(x, sf->primary, sf->secondary, w);
 
+    /* set the color values for the first row */
     datav = data;
     for (x = w - 1; x > 0; --x) {  /* 0 -> w - 1 */
         *datav = COLOR(x);
@@ -549,22 +573,32 @@ static void gradient_horizontal(RrSurface *sf, gint w, gint h)
     *datav = COLOR(x);
     ++datav;
 
-    for (y = h - 1; y > 0; --y) { /* 1 -> h */
-        memcpy(datav, data, w * sizeof(RrPixel32));
-        datav += w;
+    /* copy the first row to the rest in O(logn) copies */
+    datac = (gchar*)datav;
+    cpbytes = 1 * w * sizeof(RrPixel32);
+    for (y = (h - 1) * w * sizeof(RrPixel32); y > 0;) {
+        memcpy(datac, data, cpbytes);
+        y -= cpbytes;
+        datac += cpbytes;
+        cpbytes <<= 1;
+        if (cpbytes > y)
+            cpbytes = y;
     }
 }
 
 static void gradient_mirrorhorizontal(RrSurface *sf, gint w, gint h)
 {
-    gint x, y, half1, half2;
+    gint x, y, half1, half2, cpbytes;
     RrPixel32 *data = sf->pixel_data, *datav;
+    gchar *datac;
 
     VARS(x);
 
     half1 = (w + 1) / 2;
     half2 = w / 2;
 
+    /* set the color values for the first row */
+
     SETUP(x, sf->primary, sf->secondary, half1);
     datav = data;
     for (x = half1 - 1; x > 0; --x) {  /* 0 -> half1 - 1 */
@@ -586,9 +620,16 @@ static void gradient_mirrorhorizontal(RrSurface *sf, gint w, gint h)
         ++datav;
     }
 
-    for (y = h - 1; y > 0; --y) {  /* 1 -> h */
-        memcpy(datav, data, w * sizeof(RrPixel32));
-        datav += w;
+    /* copy the first row to the rest in O(logn) copies */
+    datac = (gchar*)datav;
+    cpbytes = 1 * w * sizeof(RrPixel32);
+    for (y = (h - 1) * w * sizeof(RrPixel32); y > 0;) {
+        memcpy(datac, data, cpbytes);
+        y -= cpbytes;
+        datac += cpbytes;
+        cpbytes <<= 1;
+        if (cpbytes > y)
+            cpbytes = y;
     }
 }
 
-- 
cgit v1.2.3


From 6bda8c29038649f4bd4c54ce011473b1344bb291 Mon Sep 17 00:00:00 2001
From: Dana Jansens <danakj@orodu.net>
Date: Mon, 11 Feb 2008 01:15:06 -0500
Subject: speed up the pyramid gradient using memcpy's.  also make it not crash
 for 1px high textures.

here are some sample profiling results.  pyramid2 is the new code

  %   cumulative   self              self     total
 time   seconds   seconds    calls  ms/call  ms/call  name

 58.78      1.54     1.54      255     6.04     6.04  gradient_pyramid1
 40.46      2.60     1.06      255     4.16     4.16  gradient_pyramid2

 54.88      2.25     2.25      504     4.46     4.46  gradient_pyramid1
 44.88      4.09     1.84      504     3.65     3.65  gradient_pyramid2
---
 render/gradient.c | 87 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 39 deletions(-)

(limited to 'render/gradient.c')

diff --git a/render/gradient.c b/render/gradient.c
index d7401a98..fc75047f 100644
--- a/render/gradient.c
+++ b/render/gradient.c
@@ -754,14 +754,13 @@ static void gradient_crossdiagonal(RrSurface *sf, gint w, gint h)
     *data = COLOR(x);
 }
 
-static void gradient_pyramid(RrSurface *sf, gint inw, gint inh)
+static void gradient_pyramid(RrSurface *sf, gint w, gint h)
 {
-    gint x, y, w = (inw >> 1) + 1, h = (inh >> 1) + 1;
-    RrPixel32 *data = sf->pixel_data;
-    RrPixel32 *end = data + inw*inh - 1;
-    RrPixel32 current;
+    RrPixel32 *ldata, *rdata;
+    RrPixel32 *cp;
     RrColor left, right;
     RrColor extracorner;
+    gint x, y, halfw, halfh, midx, midy;
 
     VARS(lefty);
     VARS(righty);
@@ -771,54 +770,64 @@ static void gradient_pyramid(RrSurface *sf, gint inw, gint inh)
     extracorner.g = (sf->primary->g + sf->secondary->g) / 2;
     extracorner.b = (sf->primary->b + sf->secondary->b) / 2;
 
-    SETUP(lefty, (&extracorner), sf->secondary, h);
-    SETUP(righty, sf->primary, (&extracorner), h);
+    halfw = w >> 1;
+    halfh = h >> 1;
+    midx = w - halfw - halfw; /* 0 or 1, depending if w is even or odd */
+    midy = h - halfh - halfh;   /* 0 or 1, depending if h is even or odd */
+
+    SETUP(lefty, sf->primary, (&extracorner), halfh + midy);
+    SETUP(righty, (&extracorner), sf->secondary, halfh + midy);
+
+    /* draw the top half
+
+       it is faster to draw both top quarters together than to draw one and
+       then copy it over to the other side.
+    */
+
+    ldata = sf->pixel_data;
+    rdata = ldata + w - 1;
+    for (y = halfh + midy; y > 0; --y) {  /* 0 -> (h+1)/2 */
+        RrPixel32 c;
 
-    for (y = h - 1; y > 0; --y) {  /* 0 -> h-1 */
         COLOR_RR(lefty, (&left));
         COLOR_RR(righty, (&right));
 
-        SETUP(x, (&left), (&right), w);
+        SETUP(x, (&left), (&right), halfw + midx);
 
-        for (x = w - 1; x > 0; --x) {  /* 0 -> w-1 */
-            current = COLOR(x);
-            *(data+x) = current;
-            *(data+inw-x) = current;
-            *(end-x) = current;
-            *(end-(inw-x)) = current;
+        for (x = halfw + midx - 1; x > 0; --x) {  /* 0 -> (w+1)/2 */
+            c = COLOR(x);
+            *(ldata++) = *(rdata--) = c;
 
             NEXT(x);
         }
-        current = COLOR(x);
-        *(data+x) = current;
-        *(data+inw-x) = current;
-        *(end-x) = current;
-        *(end-(inw-x)) = current;
-
-        data+=inw;
-        end-=inw;
+        c = COLOR(x);
+        *ldata = *rdata = c;
+        ldata += halfw + 1;
+        rdata += halfw - 1 + midx + w;
 
         NEXT(lefty);
         NEXT(righty);
     }
-    COLOR_RR(lefty, (&left));
-    COLOR_RR(righty, (&right));
 
-    SETUP(x, (&left), (&right), w);
+    /* copy the top half into the bottom half, mirroring it, so we can only
+       copy one row at a time
 
-    for (x = w - 1; x > 0; --x) {  /* 0 -> w-1 */
-        current = COLOR(x);
-        *(data+x) = current;
-        *(data+inw-x) = current;
-        *(end-x) = current;
-        *(end-(inw-x)) = current;
+       it is faster, to move the writing pointer forward, and the reading
+       pointer backward
 
-        NEXT(x);
+       this is the current code, moving the write pointer forward and read
+       pointer backward
+       41.78      4.26     1.78      504     3.53     3.53  gradient_pyramid2
+       this is the opposite, moving the read pointer forward and the write
+       pointer backward
+       42.27      4.40     1.86      504     3.69     3.69  gradient_pyramid2
+       
+    */
+    ldata = sf->pixel_data + (halfh - 1) * w;
+    cp = ldata + (midy + 1) * w;
+    for (y = halfh; y > 0; --y) {
+        memcpy(cp, ldata, w * sizeof(RrPixel32));
+        ldata -= w;
+        cp += w;
     }
-    current = COLOR(x);
-    *(data+x) = current;
-    *(data+inw-x) = current;
-    *(end-x) = current;
-    *(end-(inw-x)) = current;
 }
-
-- 
cgit v1.2.3