From 147cebe99eceb120569a527cf9cac9a70b497645 Mon Sep 17 00:00:00 2001 From: Ryan VanderMeulen Date: Fri, 2 May 2025 13:58:19 +0000 Subject: [PATCH] Bug 1963705 - Update libpixman to release 0.46.0. r=jfkthame Differential Revision: https://phabricator.services.mozilla.com/D247432 --- gfx/cairo/README | 2 - gfx/cairo/cairo/src/pixman-rename.h | 29 + gfx/cairo/libpixman/src/meson.build | 1 + gfx/cairo/libpixman/src/moz.build | 1 + gfx/cairo/libpixman/src/pixman-access.c | 81 + .../libpixman/src/pixman-arma64-neon-asm.S | 95 +- .../libpixman/src/pixman-arma64-neon-asm.h | 21 +- gfx/cairo/libpixman/src/pixman-glyph.c | 15 +- gfx/cairo/libpixman/src/pixman-image.c | 24 + gfx/cairo/libpixman/src/pixman-private.h | 4 + gfx/cairo/libpixman/src/pixman-region.c | 135 +- gfx/cairo/libpixman/src/pixman-region16.c | 3 + gfx/cairo/libpixman/src/pixman-region32.c | 3 + gfx/cairo/libpixman/src/pixman-region64f.c | 50 + gfx/cairo/libpixman/src/pixman-rvv.c | 2468 ++++++++++++++++- gfx/cairo/libpixman/src/pixman-utils.c | 37 + gfx/cairo/libpixman/src/pixman-version.h | 6 +- gfx/cairo/libpixman/src/pixman-vmx.c | 1583 +++++------ gfx/cairo/libpixman/src/pixman.c | 20 +- gfx/cairo/libpixman/src/pixman.h | 189 ++ gfx/cairo/pixman-neon.patch | 30 - 21 files changed, 3688 insertions(+), 1109 deletions(-) create mode 100644 gfx/cairo/libpixman/src/pixman-region64f.c delete mode 100644 gfx/cairo/pixman-neon.patch diff --git a/gfx/cairo/README b/gfx/cairo/README index 11b7fb59a764..873d9476fb05 100644 --- a/gfx/cairo/README +++ b/gfx/cairo/README @@ -58,6 +58,4 @@ pixman-export.patch: make sure pixman symbols are not exported in libxul pixman-interp.patch: use lower quality interpolation by default on mobile -pixman-neon.patch: fix for a build failure with clang on armhf linux - pixman-rename.patch: include pixman-rename.h for renaming of external symbols diff --git a/gfx/cairo/cairo/src/pixman-rename.h b/gfx/cairo/cairo/src/pixman-rename.h index 431cd92e968a..c6fb1118ae8d 100644 --- a/gfx/cairo/cairo/src/pixman-rename.h +++ b/gfx/cairo/cairo/src/pixman-rename.h @@ -59,6 +59,35 @@ #define pixman_region32_reset _moz_pixman_region32_reset #define pixman_region32_clear _moz_pixman_region32_clear #define pixman_region32_print _moz_pixman_region32_print +#define pixman_region64f_init _moz_pixman_region64f_init +#define pixman_region64f_init_rect _moz_pixman_region64f_init_rect +#define pixman_region64f_init_rectf _moz_pixman_region64f_init_rectf +#define pixman_region64f_init_rects _moz_pixman_region64f_init_rects +#define pixman_region64f_init_with_extents _moz_pixman_region64f_init_with_extents +#define pixman_region64f_init_from_image _moz_pixman_region64f_init_from_image +#define pixman_region64f_fini _moz_pixman_region64f_fini +#define pixman_region64f_translate _moz_pixman_region64f_translate +#define pixman_region64f_copy _moz_pixman_region64f_copy +#define pixman_region64f_intersect _moz_pixman_region64f_intersect +#define pixman_region64f_union _moz_pixman_region64f_union +#define pixman_region64f_intersect_rect _moz_pixman_region64f_intersect_rect +#define pixman_region64f_intersect_rectf _moz_pixman_region64f_intersect_rectf +#define pixman_region64f_union_rect _moz_pixman_region64f_union_rect +#define pixman_region64f_union_rectf _moz_pixman_region64f_union_rectf +#define pixman_region64f_subtract _moz_pixman_region64f_subtract +#define pixman_region64f_inverse _moz_pixman_region64f_inverse +#define pixman_region64f_contains_point _moz_pixman_region64f_contains_point +#define 
pixman_region64f_contains_rectangle _moz_pixman_region64f_contains_rectangle +#define pixman_region64f_empty _moz_pixman_region64f_empty +#define pixman_region64f_not_empty _moz_pixman_region64f_not_empty +#define pixman_region64f_extents _moz_pixman_region64f_extents +#define pixman_region64f_n_rects _moz_pixman_region64f_n_rects +#define pixman_region64f_rectangles _moz_pixman_region64f_rectangles +#define pixman_region64f_equal _moz_pixman_region64f_equal +#define pixman_region64f_selfcheck _moz_pixman_region64f_selfcheck +#define pixman_region64f_reset _moz_pixman_region64f_reset +#define pixman_region64f_clear _moz_pixman_region64f_clear +#define pixman_region64f_print _moz_pixman_region64f_print #define pixman_blt _moz_pixman_blt #define pixman_fill _moz_pixman_fill #define pixman_transform_point_3d _moz_pixman_transform_point_3d diff --git a/gfx/cairo/libpixman/src/meson.build b/gfx/cairo/libpixman/src/meson.build index fd41288c8cca..a91c73e2926f 100644 --- a/gfx/cairo/libpixman/src/meson.build +++ b/gfx/cairo/libpixman/src/meson.build @@ -97,6 +97,7 @@ pixman_files = files( 'pixman-radial-gradient.c', 'pixman-region16.c', 'pixman-region32.c', + 'pixman-region64f.c', 'pixman-riscv.c', 'pixman-solid-fill.c', 'pixman-timer.c', diff --git a/gfx/cairo/libpixman/src/moz.build b/gfx/cairo/libpixman/src/moz.build index 434f605cd5ed..662f7d44ae88 100644 --- a/gfx/cairo/libpixman/src/moz.build +++ b/gfx/cairo/libpixman/src/moz.build @@ -34,6 +34,7 @@ SOURCES += [ 'pixman-radial-gradient.c', 'pixman-region16.c', 'pixman-region32.c', + 'pixman-region64f.c', 'pixman-riscv.c', 'pixman-solid-fill.c', 'pixman-trap.c', diff --git a/gfx/cairo/libpixman/src/pixman-access.c b/gfx/cairo/libpixman/src/pixman-access.c index 7bd7a5a258c7..822bef6a3a57 100644 --- a/gfx/cairo/libpixman/src/pixman-access.c +++ b/gfx/cairo/libpixman/src/pixman-access.c @@ -710,6 +710,36 @@ fetch_scanline_rgbaf_float (bits_image_t *image, } #endif +static void +fetch_scanline_a16b16g16r16_float (bits_image_t * image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask) +{ + const uint64_t *bits = (uint64_t *)(image->bits + y * image->rowstride); + const uint64_t *pixel = bits + x; + const uint64_t *end = pixel + width; + argb_t *buffer = (argb_t *)b; + + while (pixel < end) + { + uint64_t p = READ (image, pixel++); + uint64_t a = (p >> 48) & 0xffff; + uint64_t b = (p >> 32) & 0xffff; + uint64_t g = (p >> 16) & 0xffff; + uint64_t r = (p >> 0) & 0xffff; + + buffer->a = pixman_unorm_to_float (a, 16); + buffer->r = pixman_unorm_to_float (r, 16); + buffer->g = pixman_unorm_to_float (g, 16); + buffer->b = pixman_unorm_to_float (b, 16); + + buffer++; + } +} + static void fetch_scanline_x2r10g10b10_float (bits_image_t *image, int x, @@ -907,6 +937,27 @@ fetch_pixel_rgbaf_float (bits_image_t *image, } #endif +static argb_t +fetch_pixel_a16b16g16r16_float (bits_image_t *image, + int offset, + int line) +{ + uint64_t *bits = (uint64_t *)(image->bits + line * image->rowstride); + uint64_t p = READ (image, bits + offset); + uint64_t a = (p >> 48) & 0xffff; + uint64_t b = (p >> 32) & 0xffff; + uint64_t g = (p >> 16) & 0xffff; + uint64_t r = (p >> 0) & 0xffff; + argb_t argb; + + argb.a = pixman_unorm_to_float (a, 16); + argb.r = pixman_unorm_to_float (r, 16); + argb.g = pixman_unorm_to_float (g, 16); + argb.b = pixman_unorm_to_float (b, 16); + + return argb; +} + static argb_t fetch_pixel_x2r10g10b10_float (bits_image_t *image, int offset, @@ -1121,6 +1172,32 @@ store_scanline_rgbf_float (bits_image_t * image, } #endif 
+static void +store_scanline_a16b16g16r16_float (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) +{ + uint64_t *bits = (uint64_t *)(image->bits + image->rowstride * y); + uint64_t *pixel = bits + x; + argb_t *values = (argb_t *)v; + int i; + + for (i = 0; i < width; ++i) + { + uint64_t a, r, g, b; + + a = pixman_float_to_unorm (values[i].a, 16); + r = pixman_float_to_unorm (values[i].r, 16); + g = pixman_float_to_unorm (values[i].g, 16); + b = pixman_float_to_unorm (values[i].b, 16); + + WRITE (image, pixel++, + (a << 48) | (b << 32) | (g << 16) | (r << 0)); + } +} + static void store_scanline_a2r10g10b10_float (bits_image_t * image, int x, @@ -1633,6 +1710,10 @@ static const format_info_t accessors[] = fetch_pixel_generic_lossy_32, fetch_pixel_rgbf_float, NULL, store_scanline_rgbf_float }, #endif + { PIXMAN_a16b16g16r16, + NULL, fetch_scanline_a16b16g16r16_float, + fetch_pixel_generic_lossy_32, fetch_pixel_a16b16g16r16_float, + NULL, store_scanline_a16b16g16r16_float }, { PIXMAN_a2r10g10b10, NULL, fetch_scanline_a2r10g10b10_float, diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S index 7329d4b2b29d..519ff5b16f72 100644 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S @@ -305,16 +305,14 @@ mov v28.d[0], v14.d[0] mov v29.d[0], v14.d[1] PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: raddhn v20.8b, v10.8h, v17.8h raddhn v23.8b, v11.8h, v19.8h PF ble, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_SRC, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: raddhn v22.8b, v12.8h, v18.8h st1 {v14.8h}, [DST_W], #16 @@ -497,9 +495,8 @@ generate_composite_function \ ushll v14.8h, v2.8b, #7 sli v14.8h, v14.8h, #1 PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: ushll v9.8h, v0.8b, #7 sli v9.8h, v9.8h, #1 @@ -585,12 +582,10 @@ generate_composite_function \ 10: uqadd v28.8b, v0.8b, v4.8b PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: uqadd v29.8b, v1.8b, v5.8b uqadd v30.8b, v2.8b, v6.8b @@ -631,12 +626,10 @@ generate_composite_function \ 10: uqadd v28.8b, v0.8b, v4.8b PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: uqadd v29.8b, v1.8b, v5.8b uqadd v30.8b, v2.8b, v6.8b @@ -719,15 +712,13 @@ generate_composite_function_single_scanline \ 10: umull v9.8h, v22.8b, v5.8b PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, 
DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: umull v10.8h, v22.8b, v6.8b PF ble, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: umull v11.8h, v22.8b, v7.8b .endm @@ -793,15 +784,13 @@ generate_composite_function_single_scanline \ 10: umull v9.8h, v22.8b, v5.8b PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: umull v10.8h, v22.8b, v6.8b PF ble, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: umull v11.8h, v22.8b, v7.8b .endm @@ -886,9 +875,8 @@ generate_composite_function_single_scanline \ PF subs, PF_CTL, PF_CTL, #0x10 umull v11.8h, v24.8b, v7.8b PF ble, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm @@ -950,9 +938,8 @@ generate_composite_function \ umull v9.8h, v22.8b, v5.8b umull v10.8h, v22.8b, v6.8b PF blt, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: umull v11.8h, v22.8b, v7.8b .endm @@ -1436,9 +1423,8 @@ generate_composite_function \ 10: umull v11.8h, v24.8b, v3.8b PF ble, 10f - PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb, DUMMY, [PF_MASK, DUMMY] - PF add, PF_MASK, PF_MASK, #1 + PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK] 10: st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ursra v8.8h, v8.8h, #8 @@ -1517,9 +1503,8 @@ generate_composite_function \ 10: umull v3.8h, v27.8b, v16.8b PF ble, 10f - PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb, DUMMY, [PF_MASK, DUMMY] - PF add, PF_MASK, PF_MASK, #1 + PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK] 10: st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ursra v0.8h, v0.8h, #8 @@ -1628,15 +1613,13 @@ generate_composite_function \ 10: umull v19.8h, v24.8b, v11.8b PF ble, 10f - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] 10: uqadd v28.8b, v0.8b, v28.8b PF ble, 10f - PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb, DUMMY, [PF_MASK, DUMMY] - PF add, PF_MASK, PF_MASK, #1 + PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK] 10: uqadd v29.8b, v1.8b, v29.8b uqadd v30.8b, v2.8b, v30.8b @@ -2699,9 +2682,8 @@ generate_composite_function \ PF sub, PF_X, PF_X, ORIG_W PF subs, PF_CTL, PF_CTL, #0x10 PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: .endm @@ -2768,9 +2750,8 @@ generate_composite_function \ PF sub, PF_X, PF_X, ORIG_W PF subs, PF_CTL, PF_CTL, #0x10 PF ble, 10f - PF lsl, DUMMY, SRC_STRIDE, 
#src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] 10: .endm diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h index ec3d76fea81d..516ebb8f3485 100644 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h @@ -474,25 +474,21 @@ PF lsl, DUMMY, PF_X, #mask_bpp_shift PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] .endif - PF ble, 71f + PF ble, 72f PF sub, PF_X, PF_X, ORIG_W PF subs, PF_CTL, PF_CTL, #0x10 -71: PF ble, 72f .if src_bpp_shift >= 0 - PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb, DUMMY, [PF_SRC, DUMMY] - PF add, PF_SRC, PF_SRC, #1 + PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC] .endif .if dst_r_bpp != 0 - PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb, DUMMY, [PF_DST, DUMMY] - PF add, PF_DST, PF_DST, #1 + PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST] .endif .if mask_bpp_shift >= 0 - PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb, DUMMY, [PF_MASK, DUMMY] - PF add, PF_MASK, PF_MASK, #1 + PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK] .endif 72: .endif @@ -858,8 +854,7 @@ PF mov, PF_DST, DST_R PF mov, PF_MASK, MASK /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ - PF lsl, DUMMY, H, #4 - PF mov, PF_CTL, DUMMY + PF lsl, PF_CTL, H, #4 PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) \init diff --git a/gfx/cairo/libpixman/src/pixman-glyph.c b/gfx/cairo/libpixman/src/pixman-glyph.c index dc9041180e51..346d78947d95 100644 --- a/gfx/cairo/libpixman/src/pixman-glyph.c +++ b/gfx/cairo/libpixman/src/pixman-glyph.c @@ -391,7 +391,8 @@ box32_intersect (pixman_box32_t *dest, return dest->x2 > dest->x1 && dest->y2 > dest->y1; } -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +#if defined(__GNUC__) && defined(__i386__) && !defined(__x86_64__) && \ + !defined(__amd64__) __attribute__((__force_align_arg_pointer__)) #endif PIXMAN_EXPORT void @@ -418,10 +419,10 @@ pixman_composite_glyphs_no_mask (pixman_op_t op, _pixman_image_validate (src); _pixman_image_validate (dest); - + dest_format = dest->common.extended_format_code; dest_flags = dest->common.flags; - + pixman_region32_init (®ion); if (!_pixman_compute_composite_region32 ( ®ion, @@ -452,9 +453,9 @@ pixman_composite_glyphs_no_mask (pixman_op_t op, glyph_box.y1 = dest_y + glyphs[i].y - glyph->origin_y; glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; - + pbox = pixman_region32_rectangles (®ion, &n); - + info.mask_image = glyph_img; while (n--) @@ -588,7 +589,7 @@ add_glyphs (pixman_glyph_cache_t *cache, glyph_box.y1 = glyphs[i].y - glyph->origin_y + off_y; glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; - + if (box32_intersect (&composite_box, &glyph_box, &dest_box)) { int src_x = composite_box.x1 - glyph_box.x1; @@ -623,7 +624,7 @@ out: * * Then (mask_x, mask_y) in the infinite mask and (src_x, src_y) in the source * image are both aligned with (dest_x, dest_y) in the destination image. 
Then - * these three images are composited within the + * these three images are composited within the * * (dest_x, dest_y, dst_x + width, dst_y + height) * diff --git a/gfx/cairo/libpixman/src/pixman-image.c b/gfx/cairo/libpixman/src/pixman-image.c index 72796fc9c9f7..d089cb174dd4 100644 --- a/gfx/cairo/libpixman/src/pixman-image.c +++ b/gfx/cairo/libpixman/src/pixman-image.c @@ -613,6 +613,30 @@ pixman_image_set_clip_region (pixman_image_t * image, return result; } +PIXMAN_EXPORT pixman_bool_t +pixman_image_set_clip_region64f (pixman_image_t * image, + const pixman_region64f_t *region) +{ + image_common_t *common = (image_common_t *)image; + pixman_bool_t result; + + if (region) + { + if ((result = pixman_region32_copy_from_region64f (&common->clip_region, region))) + image->common.have_clip_region = TRUE; + } + else + { + _pixman_image_reset_clip_region (image); + + result = TRUE; + } + + image_property_changed (image); + + return result; +} + PIXMAN_EXPORT void pixman_image_set_has_client_clip (pixman_image_t *image, pixman_bool_t client_clip) diff --git a/gfx/cairo/libpixman/src/pixman-private.h b/gfx/cairo/libpixman/src/pixman-private.h index eab3adff90d6..aa8b290dc6f4 100644 --- a/gfx/cairo/libpixman/src/pixman-private.h +++ b/gfx/cairo/libpixman/src/pixman-private.h @@ -893,6 +893,10 @@ pixman_bool_t pixman_region32_copy_from_region16 (pixman_region32_t *dst, const pixman_region16_t *src); +pixman_bool_t +pixman_region32_copy_from_region64f (pixman_region32_t *dst, + const pixman_region64f_t *src); + pixman_bool_t pixman_region16_copy_from_region32 (pixman_region16_t *dst, const pixman_region32_t *src); diff --git a/gfx/cairo/libpixman/src/pixman-region.c b/gfx/cairo/libpixman/src/pixman-region.c index d75e51991b23..849f615cf361 100644 --- a/gfx/cairo/libpixman/src/pixman-region.c +++ b/gfx/cairo/libpixman/src/pixman-region.c @@ -346,7 +346,11 @@ PREFIX (_print) (region_type_t *rgn) rects = PIXREGION_RECTS (rgn); fprintf (stderr, "num: %d size: %d\n", num, size); - fprintf (stderr, "extents: %d %d %d %d\n", + fprintf (stderr, "extents: " + PRINT_SPECIFIER " " + PRINT_SPECIFIER " " + PRINT_SPECIFIER " " + PRINT_SPECIFIER "\n", rgn->extents.x1, rgn->extents.y1, rgn->extents.x2, @@ -354,7 +358,10 @@ PREFIX (_print) (region_type_t *rgn) for (i = 0; i < num; i++) { - fprintf (stderr, "%d %d %d %d \n", + fprintf (stderr, PRINT_SPECIFIER " " + PRINT_SPECIFIER " " + PRINT_SPECIFIER " " + PRINT_SPECIFIER " \n", rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2); } @@ -394,6 +401,29 @@ PREFIX (_init_rect) (region_type_t * region, region->data = NULL; } +PIXMAN_EXPORT void +PREFIX (_init_rectf) (region_type_t * region, + double x, + double y, + double width, + double height) +{ + region->extents.x1 = x; + region->extents.y1 = y; + region->extents.x2 = x + width; + region->extents.y2 = y + height; + + if (!GOOD_RECT (®ion->extents)) + { + if (BAD_RECT (®ion->extents)) + _pixman_log_error (FUNC, "Invalid rectangle passed"); + PREFIX (_init) (region); + return; + } + + region->data = NULL; +} + PIXMAN_EXPORT void PREFIX (_init_with_extents) (region_type_t *region, const box_type_t *extents) { @@ -572,7 +602,7 @@ pixman_coalesce (region_type_t * region, /* Region to coalesce */ box_type_t *prev_box; /* Current box in previous band */ box_type_t *cur_box; /* Current box in current band */ int numRects; /* Number rectangles in both bands */ - int y2; /* Bottom of current band */ + primitive_t y2; /* Bottom of current band */ /* * Figure out how many rectangles are in the band. 
@@ -658,8 +688,8 @@ static inline pixman_bool_t pixman_region_append_non_o (region_type_t * region, box_type_t * r, box_type_t * r_end, - int y1, - int y2) + primitive_t y1, + primitive_t y2) { box_type_t *next_rect; int new_rects; @@ -741,8 +771,8 @@ typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region, box_type_t * r1_end, box_type_t * r2, box_type_t * r2_end, - int y1, - int y2); + primitive_t y1, + primitive_t y2); static pixman_bool_t pixman_op (region_type_t * new_reg, /* Place to store result */ @@ -762,8 +792,8 @@ pixman_op (region_type_t * new_reg, /* Place to store result box_type_t *r2; /* Pointer into 2d region */ box_type_t *r1_end; /* End of 1st region */ box_type_t *r2_end; /* End of 2d region */ - int ybot; /* Bottom of intersection */ - int ytop; /* Top of intersection */ + primitive_t ybot; /* Bottom of intersection */ + primitive_t ytop; /* Top of intersection */ region_data_type_t *old_data; /* Old data for new_reg */ int prev_band; /* Index of start of * previous band in new_reg */ @@ -771,10 +801,10 @@ pixman_op (region_type_t * new_reg, /* Place to store result * band in new_reg */ box_type_t * r1_band_end; /* End of current band in r1 */ box_type_t * r2_band_end; /* End of current band in r2 */ - int top; /* Top of non-overlapping band */ - int bot; /* Bottom of non-overlapping band*/ - int r1y1; /* Temps for r1->y1 and r2->y1 */ - int r2y1; + primitive_t top; /* Top of non-overlapping band */ + primitive_t bot; /* Bottom of non-overlapping band*/ + primitive_t r1y1; /* Temps for r1->y1 and r2->y1 */ + primitive_t r2y1; int new_size; int numRects; @@ -1110,11 +1140,11 @@ pixman_region_intersect_o (region_type_t *region, box_type_t * r1_end, box_type_t * r2, box_type_t * r2_end, - int y1, - int y2) + primitive_t y1, + primitive_t y2) { - int x1; - int x2; + primitive_t x1; + primitive_t x2; box_type_t * next_rect; next_rect = PIXREGION_TOP (region); @@ -1262,12 +1292,12 @@ pixman_region_union_o (region_type_t *region, box_type_t * r1_end, box_type_t * r2, box_type_t * r2_end, - int y1, - int y2) + primitive_t y1, + primitive_t y2) { box_type_t *next_rect; - int x1; /* left and right side of current union */ - int x2; + primitive_t x1; /* left and right side of current union */ + primitive_t x2; critical_if_fail (y1 < y2); critical_if_fail (r1 != r1_end && r2 != r2_end); @@ -1337,6 +1367,24 @@ PREFIX(_intersect_rect) (region_type_t *dest, return PREFIX(_intersect) (dest, source, ®ion); } +PIXMAN_EXPORT pixman_bool_t +PREFIX(_intersect_rectf) (region_type_t *dest, + const region_type_t *source, + double x, double y, + double width, + double height) +{ + region_type_t region; + + region.data = NULL; + region.extents.x1 = x; + region.extents.y1 = y; + region.extents.x2 = x + width; + region.extents.y2 = y + height; + + return PREFIX(_intersect) (dest, source, ®ion); +} + /* Convenience function for performing union of region with a * single rectangle */ @@ -1367,6 +1415,33 @@ PREFIX (_union_rect) (region_type_t *dest, return PREFIX (_union) (dest, source, ®ion); } +PIXMAN_EXPORT pixman_bool_t +PREFIX (_union_rectf) (region_type_t *dest, + const region_type_t *source, + double x, + double y, + double width, + double height) +{ + region_type_t region; + + region.extents.x1 = x; + region.extents.y1 = y; + region.extents.x2 = x + width; + region.extents.y2 = y + height; + + if (!GOOD_RECT (®ion.extents)) + { + if (BAD_RECT (®ion.extents)) + _pixman_log_error (FUNC, "Invalid rectangle passed"); + return PREFIX (_copy) (dest, source); + } + + region.data = NULL; + 
+ return PREFIX (_union) (dest, source, ®ion); +} + PIXMAN_EXPORT pixman_bool_t PREFIX (_union) (region_type_t * new_reg, const region_type_t *reg1, @@ -1467,8 +1542,8 @@ quick_sort_rects ( box_type_t rects[], int numRects) { - int y1; - int x1; + primitive_t y1; + primitive_t x1; int i, j; box_type_t *r; @@ -1833,11 +1908,11 @@ pixman_region_subtract_o (region_type_t * region, box_type_t * r1_end, box_type_t * r2, box_type_t * r2_end, - int y1, - int y2) + primitive_t y1, + primitive_t y2) { box_type_t * next_rect; - int x1; + primitive_t x1; x1 = r1->x1; @@ -2066,7 +2141,7 @@ PREFIX (_inverse) (region_type_t * new_reg, /* Destination region */ * Return @end if no such box exists. */ static box_type_t * -find_box_for_y (box_type_t *begin, box_type_t *end, int y) +find_box_for_y (box_type_t *begin, box_type_t *end, primitive_t y) { box_type_t *mid; @@ -2120,7 +2195,7 @@ PREFIX (_contains_rectangle) (const region_type_t * region, box_type_t * pbox_end; int part_in, part_out; int numRects; - int x, y; + primitive_t x, y; GOOD (region); @@ -2571,8 +2646,8 @@ static inline box_type_t * bitmap_addrect (region_type_t *reg, box_type_t *r, box_type_t **first_rect, - int rx1, int ry1, - int rx2, int ry2) + primitive_t rx1, primitive_t ry1, + primitive_t rx2, primitive_t ry2) { if ((rx1 < rx2) && (ry1 < ry2) && (!(reg->data->numRects && diff --git a/gfx/cairo/libpixman/src/pixman-region16.c b/gfx/cairo/libpixman/src/pixman-region16.c index da4719e7a84d..ff7b038b2473 100644 --- a/gfx/cairo/libpixman/src/pixman-region16.c +++ b/gfx/cairo/libpixman/src/pixman-region16.c @@ -35,6 +35,7 @@ typedef pixman_box16_t box_type_t; typedef pixman_region16_data_t region_data_type_t; typedef pixman_region16_t region_type_t; +typedef int primitive_t; typedef int32_t overflow_int_t; typedef struct { @@ -46,6 +47,8 @@ typedef struct { #define PIXMAN_REGION_MAX INT16_MAX #define PIXMAN_REGION_MIN INT16_MIN +#define PRINT_SPECIFIER "%d" + #include "pixman-region.c" /* This function exists only to make it possible to preserve the X ABI - diff --git a/gfx/cairo/libpixman/src/pixman-region32.c b/gfx/cairo/libpixman/src/pixman-region32.c index 68b456bf3c90..fbaa21663006 100644 --- a/gfx/cairo/libpixman/src/pixman-region32.c +++ b/gfx/cairo/libpixman/src/pixman-region32.c @@ -33,6 +33,7 @@ typedef pixman_box32_t box_type_t; typedef pixman_region32_data_t region_data_type_t; typedef pixman_region32_t region_type_t; +typedef int primitive_t; typedef int64_t overflow_int_t; typedef struct { @@ -44,4 +45,6 @@ typedef struct { #define PIXMAN_REGION_MAX INT32_MAX #define PIXMAN_REGION_MIN INT32_MIN +#define PRINT_SPECIFIER "%d" + #include "pixman-region.c" diff --git a/gfx/cairo/libpixman/src/pixman-region64f.c b/gfx/cairo/libpixman/src/pixman-region64f.c new file mode 100644 index 000000000000..fb8cc5ae1306 --- /dev/null +++ b/gfx/cairo/libpixman/src/pixman-region64f.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2008 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without + * fee, provided that the above copyright notice appear in all copies + * and that both that copyright notice and this permission notice + * appear in supporting documentation, and that the name of + * Red Hat, Inc. not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. Red Hat, Inc. makes no representations about the + * suitability of this software for any purpose. 
It is provided "as + * is" without express or implied warranty. + * + * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL, + * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER + * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Soren Sandmann + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "pixman-private.h" + +#include + +typedef pixman_box64f_t box_type_t; +typedef pixman_region64f_data_t region_data_type_t; +typedef pixman_region64f_t region_type_t; +typedef double primitive_t; +typedef int64_t overflow_int_t; + +typedef struct { + double x, y; +} point_type_t; + +#define PREFIX(x) pixman_region64f##x + +#define PIXMAN_REGION_MAX INT32_MAX +#define PIXMAN_REGION_MIN INT32_MIN + +#define PRINT_SPECIFIER "%f" + +#include "pixman-region.c" diff --git a/gfx/cairo/libpixman/src/pixman-rvv.c b/gfx/cairo/libpixman/src/pixman-rvv.c index 6808f50e5615..570799ca06d6 100644 --- a/gfx/cairo/libpixman/src/pixman-rvv.c +++ b/gfx/cairo/libpixman/src/pixman-rvv.c @@ -2,6 +2,8 @@ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. * 2005 Lars Knoll & Zack Rusin, Trolltech * 2024 Filip Wasil, Samsung Electronics + * 2024 Bernard Gingold, Samsung Electronics + * 2025 Marek Pikuła, Samsung Electronics * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that @@ -27,6 +29,8 @@ #endif #include "pixman-combine-float.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" #include "pixman-private.h" #include @@ -40,6 +44,50 @@ #include #include +// Convenience macros { + +#define __FE_PTR(p, vl) ((p) += (vl)) + +#define _RVV_FE_PRE(total_len, vn, vl, vspec) \ + size_t vn = total_len, vl = __riscv_vsetvl_##vspec (vn); \ + vn > 0 + +#define _RVV_FE_POST(vn, vl, vspec) vn -= (vl), vl = __riscv_vsetvl_##vspec (vn) + +#define RVV_FOREACH_1(total_len, vl, vspec, p1) \ + for (_RVV_FE_PRE (total_len, vn, vl, vspec); \ + __FE_PTR (p1, vl), _RVV_FE_POST (vn, vl, vspec)) + +#define RVV_FOREACH_2(total_len, vl, vspec, p1, p2) \ + for (_RVV_FE_PRE (total_len, vn, vl, vspec); \ + __FE_PTR (p1, vl), __FE_PTR (p2, vl), _RVV_FE_POST (vn, vl, vspec)) + +#define RVV_FOREACH_3(total_len, vl, vspec, p1, p2, p3) \ + for (_RVV_FE_PRE (total_len, vn, vl, vspec); \ + __FE_PTR (p1, vl), __FE_PTR (p2, vl), __FE_PTR (p3, vl), \ + _RVV_FE_POST (vn, vl, vspec)) + +// vuintXXmYY_t for use in macros (less token concatenation). +#define VUINT(ELEN, LMUL) vuint##ELEN##LMUL##_t +#define VUINT32(LMUL) VUINT (32, LMUL) +#define VUINT16(LMUL) VUINT (16, LMUL) +#define VUINT8(LMUL) VUINT (8, LMUL) + +// Short for vreinterpret commonly used for ARGB batch operations. 
+#define RVV_U8x4_U32(LMUL, value) \ + __riscv_vreinterpret_v_u8##LMUL##_u32##LMUL (value) +#define RVV_U8x4_U32_m2(value) RVV_U8x4_U32 (m2, value) +#define RVV_U8x4_U32_m4(value) RVV_U8x4_U32 (m4, value) + +#define RVV_U32_U8x4(LMUL, value) \ + __riscv_vreinterpret_v_u32##LMUL##_u8##LMUL (value) +#define RVV_U32_U8x4_m2(value) RVV_U32_U8x4 (m2, value) +#define RVV_U32_U8x4_m4(value) RVV_U32_U8x4 (m4, value) + +// } + +// Float implementation + /* * Screen * @@ -49,11 +97,11 @@ */ static force_inline vfloat32m1_t -rvv_blend_screen (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_screen_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1, t2; t0 = __riscv_vfmul_vv_f32m1 (s, da, vl); @@ -72,11 +120,11 @@ rvv_blend_screen (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_multiply (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_multiply_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { return __riscv_vfmul_vv_f32m1 (s, d, vl); } @@ -105,14 +153,14 @@ rvv_blend_multiply (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_overlay (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_overlay_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1, t2, t3, t4, f0, f1, f2; - vbool32_t vb; + vbool32_t vb; t0 = __riscv_vfadd_vv_f32m1 (d, d, vl); t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl); vb = __riscv_vmflt_vv_f32m1_b32 (t0, da, vl); @@ -134,14 +182,14 @@ rvv_blend_overlay (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_darken (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_darken_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t ss, dd; - vbool32_t vb; + vbool32_t vb; ss = __riscv_vfmul_vv_f32m1 (da, s, vl); dd = __riscv_vfmul_vv_f32m1 (sa, d, vl); vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl); @@ -157,14 +205,14 @@ rvv_blend_darken (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_lighten (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_lighten_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t ss, dd; - vbool32_t vb; + vbool32_t vb; ss = __riscv_vfmul_vv_f32m1 (s, da, vl); dd = __riscv_vfmul_vv_f32m1 (d, sa, vl); vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl); @@ -191,14 +239,14 @@ rvv_blend_lighten (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_color_dodge (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_color_dodge_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1, t2, t3, t4; - vbool32_t is_d_zero, vb, is_t0_non_zero; + vbool32_t is_d_zero, vb, is_t0_non_zero; is_d_zero = __riscv_vmfeq_vf_f32m1_b32 (d, 0.0f, vl); @@ -241,11 +289,11 @@ rvv_blend_color_dodge (const 
vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_color_burn (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_color_burn_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7; vbool32_t is_d_ge_da, is_s_zero, vb; @@ -289,14 +337,14 @@ rvv_blend_color_burn (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_hard_light (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_hard_light_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1, t2, t3, t4; - vbool32_t vb; + vbool32_t vb; t0 = __riscv_vfadd_vv_f32m1 (s, s, vl); t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl); vb = __riscv_vmfgt_vv_f32m1_b32 (t0, sa, vl); @@ -328,15 +376,14 @@ rvv_blend_hard_light (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_soft_light (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_soft_light_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { - vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, - t13; - vbool32_t is_sa_lt_2s, is_da_ls_4d, is_da_non_zero; + vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13; + vbool32_t is_sa_lt_2s, is_da_ls_4d, is_da_non_zero; is_da_non_zero = __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl); t0 = __riscv_vfadd_vv_f32m1 (s, s, vl); // 2 * s is_sa_lt_2s = __riscv_vmflt_vv_f32m1_b32 (sa, t0, vl); @@ -358,8 +405,8 @@ rvv_blend_soft_light (const vfloat32m1_t sa, __riscv_vfadd_vv_f32m1 (d, d, vl), vl); is_da_ls_4d = __riscv_vmflt_vv_f32m1_b32 (da, t6, vl); t10 = __riscv_vfsub_vv_f32m1 ( - __riscv_vfsqrt_v_f32m1 (__riscv_vfmul_vv_f32m1 (d, da, vl), vl), d, - vl); // sqrtf (d * da) - d + __riscv_vfsqrt_v_f32m1 (__riscv_vfmul_vv_f32m1 (d, da, vl), vl), d, + vl); // sqrtf (d * da) - d t11 = __riscv_vfmul_vv_f32m1 (t2, t10, vl); // (sqrtf (d * da) - d) * (sa - 2 * s) t12 = __riscv_vfsub_vv_f32m1 ( @@ -397,14 +444,14 @@ rvv_blend_soft_light (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_difference (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_difference_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t dsa, sda; - vbool32_t vb; + vbool32_t vb; dsa = __riscv_vfmul_vv_f32m1 (d, sa, vl); sda = __riscv_vfmul_vv_f32m1 (s, da, vl); vb = __riscv_vmflt_vv_f32m1_b32 (sda, dsa, vl); @@ -422,11 +469,11 @@ rvv_blend_difference (const vfloat32m1_t sa, */ static force_inline vfloat32m1_t -rvv_blend_exclusion (const vfloat32m1_t sa, - const vfloat32m1_t s, - const vfloat32m1_t da, - const vfloat32m1_t d, - size_t vl) +rvv_blend_exclusion_float (const vfloat32m1_t sa, + const vfloat32m1_t s, + const vfloat32m1_t da, + const vfloat32m1_t d, + size_t vl) { vfloat32m1_t t0, t1; t0 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl), s, vl); @@ -442,13 +489,13 @@ typedef vfloat32m1_t (*rvv_combine_channel_float_t) (const vfloat32m1_t sa, size_t vl); static force_inline void -rvv_combine_inner (pixman_bool_t component, - float *dest, - const float *src, - 
const float *mask, - int n_pixels, - rvv_combine_channel_float_t combine_a, - rvv_combine_channel_float_t combine_c) +rvv_combine_inner_float (pixman_bool_t component, + float *dest, + const float *src, + const float *mask, + int n_pixels, + rvv_combine_channel_float_t combine_a, + rvv_combine_channel_float_t combine_c) { float *__restrict__ pd = dest; const float *__restrict__ ps = src; @@ -618,8 +665,8 @@ rvv_combine_inner (pixman_bool_t component, pixman_implementation_t *imp, pixman_op_t op, float *dest, \ const float *src, const float *mask, int n_pixels) \ { \ - rvv_combine_inner (component, dest, src, mask, n_pixels, combine_a, \ - combine_c); \ + rvv_combine_inner_float (component, dest, src, mask, n_pixels, \ + combine_a, combine_c); \ } #define RVV_MAKE_COMBINERS(name, combine_a, combine_c) \ @@ -627,10 +674,10 @@ rvv_combine_inner (pixman_bool_t component, RVV_MAKE_COMBINER (name##_u, FALSE, combine_a, combine_c) static force_inline vfloat32m1_t -rvv_get_factor (combine_factor_t factor, - vfloat32m1_t sa, - vfloat32m1_t da, - size_t vl) +rvv_get_factor_float (combine_factor_t factor, + vfloat32m1_t sa, + vfloat32m1_t da, + size_t vl) { vfloat32m1_t vone = __riscv_vfmv_v_f_f32m1 (1.0f, vl); vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1 (0.0f, vl); @@ -752,20 +799,21 @@ rvv_get_factor (combine_factor_t factor, } #define RVV_MAKE_PD_COMBINERS(name, a, b) \ - static vfloat32m1_t force_inline rvv_pd_combine_##name ( \ + static vfloat32m1_t force_inline rvv_pd_combine_##name##_float ( \ vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d, \ size_t vl) \ { \ - const vfloat32m1_t fa = rvv_get_factor (a, sa, da, vl); \ - const vfloat32m1_t fb = rvv_get_factor (b, sa, da, vl); \ + const vfloat32m1_t fa = rvv_get_factor_float (a, sa, da, vl); \ + const vfloat32m1_t fb = rvv_get_factor_float (b, sa, da, vl); \ vfloat32m1_t t0 = __riscv_vfadd_vv_f32m1 ( \ - __riscv_vfmul_vv_f32m1 (s, fa, vl), \ - __riscv_vfmul_vv_f32m1 (d, fb, vl), vl); \ + __riscv_vfmul_vv_f32m1 (s, fa, vl), \ + __riscv_vfmul_vv_f32m1 (d, fb, vl), vl); \ return __riscv_vfmin_vv_f32m1 (__riscv_vfmv_v_f_f32m1 (1.0f, vl), t0, \ vl); \ } \ \ - RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name, rvv_pd_combine_##name) + RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name##_float, \ + rvv_pd_combine_##name##_float) RVV_MAKE_PD_COMBINERS (clear, ZERO, ZERO) RVV_MAKE_PD_COMBINERS (src, ONE, ZERO) @@ -834,8 +882,8 @@ RVV_MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA) vl), \ -1.0f, vl); \ \ - return __riscv_vfadd_vv_f32m1 (f, rvv_blend_##name (sa, s, da, d, vl), \ - vl); \ + return __riscv_vfadd_vv_f32m1 ( \ + f, rvv_blend_##name##_float (sa, s, da, d, vl), vl); \ } \ \ RVV_MAKE_COMBINERS (name, rvv_combine_##name##_a, rvv_combine_##name##_c) @@ -852,16 +900,2205 @@ RVV_MAKE_SEPARABLE_PDF_COMBINERS (soft_light) RVV_MAKE_SEPARABLE_PDF_COMBINERS (difference) RVV_MAKE_SEPARABLE_PDF_COMBINERS (exclusion) +// int implementation + +// pixman-combine32.h RVV implementation plus some convenience functions { + +/* + * x_c = min(x_c + y_c, 255) + */ + +#define rvv_UN8_ADD_UN8_vv(x, y, vl) __riscv_vsaddu (x, y, vl) + +#define rvv_UN8x4_ADD_UN8x4_vv_m4(x, y, vl) \ + RVV_U8x4_U32_m4 (rvv_UN8_ADD_UN8_vv (RVV_U32_U8x4_m4 (x), \ + RVV_U32_U8x4_m4 (y), (vl) * 4)) + +/* +* x_c = (x_c * a_c) / 255 +*/ + +#define __rvv_UN8_MUL_UN8_vv(LMUL, LMUL16) \ + static force_inline VUINT8 (LMUL) rvv_UN8_MUL_UN8_vv_##LMUL ( \ + const VUINT8 (LMUL) x, const VUINT8 (LMUL) a, size_t vl) \ + { \ + VUINT16 (LMUL16) \ + mul_higher = 
__riscv_vwmaccu ( \ + __riscv_vmv_v_x_u16##LMUL16 (ONE_HALF, vl), x, a, vl); \ + \ + VUINT16 (LMUL16) \ + mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl); \ + \ + return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), \ + G_SHIFT, vl); \ + } +__rvv_UN8_MUL_UN8_vv (m1, m2); +__rvv_UN8_MUL_UN8_vv (m2, m4); +__rvv_UN8_MUL_UN8_vv (m4, m8); + +static force_inline vuint8m4_t +rvv_UN8_MUL_UN8_vx_m4 (const vuint8m4_t x, const uint8_t a, size_t vl) +{ + vuint16m8_t mul_higher = __riscv_vwmaccu ( + __riscv_vmv_v_x_u16m8 (ONE_HALF, vl), a, x, vl); + vuint16m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl); + + return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, + vl); +} + +#define __rvv_UN8x4_MUL_UN8x4_vv(LMUL, x, a, vl) \ + RVV_U8x4_U32 (LMUL, rvv_UN8_MUL_UN8_vv_##LMUL (RVV_U32_U8x4 (LMUL, x), \ + RVV_U32_U8x4 (LMUL, a), \ + (vl) * 4)) +#define rvv_UN8x4_MUL_UN8x4_vv_m2(x, a, vl) \ + __rvv_UN8x4_MUL_UN8x4_vv (m2, x, a, vl) +#define rvv_UN8x4_MUL_UN8x4_vv_m4(x, a, vl) \ + __rvv_UN8x4_MUL_UN8x4_vv (m4, x, a, vl) + +/* +* a_c = a (broadcast to all components) +*/ + +#define __rvv_UN16_bcast_UN8x4_v(LMUL, LMUL16) \ + static force_inline VUINT32 (LMUL) \ + rvv_UN16_bcast_UN8x4_v_##LMUL (const VUINT16 (LMUL16) a, size_t vl) \ + { \ + VUINT32 (LMUL) \ + a32 = __riscv_vwcvtu_x (__riscv_vmadd (a, 1 << 8, a, vl), vl); \ + \ + return __riscv_vmadd (a32, 1 << 16, a32, vl); \ + } +__rvv_UN16_bcast_UN8x4_v (m2, m1); +__rvv_UN16_bcast_UN8x4_v (m4, m2); + +#define rvv_UN8_bcast_UN8x4_v_m4(a, vl) \ + rvv_UN16_bcast_UN8x4_v_m4 (__riscv_vwcvtu_x (a, vl), vl) + +/* +* x_c = (x_c * a) / 255 +*/ + +#define rvv_UN8x4_MUL_UN8_vv_m4(x, a, vl) \ + rvv_UN8x4_MUL_UN8x4_vv_m4 (x, rvv_UN8_bcast_UN8x4_v_m4 (a, vl), vl) + +#define __rvv_UN8x4_MUL_UN16_vv(LMUL, x, a, vl) \ + rvv_UN8x4_MUL_UN8x4_vv_##LMUL (x, rvv_UN16_bcast_UN8x4_v_##LMUL (a, vl), vl) +#define rvv_UN8x4_MUL_UN16_vv_m2(x, a, vl) \ + __rvv_UN8x4_MUL_UN16_vv (m2, x, a, vl) +#define rvv_UN8x4_MUL_UN16_vv_m4(x, a, vl) \ + __rvv_UN8x4_MUL_UN16_vv (m4, x, a, vl) + +#define rvv_UN8x4_MUL_UN8_vx_m4(x, a, vl) \ + RVV_U8x4_U32_m4 (rvv_UN8_MUL_UN8_vx_m4 (RVV_U32_U8x4_m4 (x), a, (vl) * 4)) + +static force_inline vuint32m2_t +rvv_DIV_ONE_UN32m2_UN32m2_v (const vuint32m2_t x, size_t vl) +{ + vuint32m2_t mul_higher = __riscv_vadd (x, ONE_HALF, vl); + vuint32m2_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl); + + return __riscv_vsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl); +} + +static force_inline vuint8m2_t +rvv_DIV_ONE_UN32m8_UN8m2_v (const vuint32m8_t x, size_t vl) +{ + vuint32m8_t mul_higher = __riscv_vadd (x, ONE_HALF, vl); + vuint32m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl); + + return __riscv_vncvt_x ( + __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl), + vl); +} + +/* +* x_c = (x_c * a) / 255 + y_c +*/ + +#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4(x, a, y, vl) \ + rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl), y, vl) + +/* +* x_c = (x_c * a + y_c * b) / 255 +*/ + +#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl) \ + rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl), \ + rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl) + +/* +* x_c = (x_c * a_c) / 255 + y_c +*/ + +#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4(x, a, y, vl) \ + rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl), y, vl) + +/* +* x_c = (x_c * a_c + y_c * b) / 255 +*/ + +#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl) \ + 
rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl), \ + rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl) + +// } pixman-combine32.h + +// Additional functions. + +#define rvv_shift_alpha_u16(x, vl) __riscv_vnsrl (x, 24, vl) + +#define rvv_shift_not_alpha_u16(x, vl) \ + rvv_shift_alpha_u16 (__riscv_vnot (x, vl), vl) + +#define rvv_load_alpha_u8m1(src, vl) \ + __riscv_vlse8_v_u8m1 ((uint8_t *)src + 3, 4, vl) + +#define rvv_load_not_alpha_u8m1(src, vl) \ + __riscv_vnot (rvv_load_alpha_u8m1 (src, vl), vl) + +#define rvv_u8m2_to_i16m4(in, vl) \ + __riscv_vreinterpret_i16m4 (__riscv_vwcvtu_x (in, vl)) + +#define rvv_over_m4(src, dest, vl) \ + rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 ( \ + dest, rvv_shift_not_alpha_u16 (src, vl), src, vl) + +#define rvv_in_m4(x, y, vl) rvv_UN8x4_MUL_UN8_vv_m4 (x, y, vl) + +#define rvv_in_load_s_m_m4(src, mask, vl) \ + rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl), \ + rvv_load_alpha_u8m1 (mask, vl), vl) + +#define rvv_in_load_s_nm_m4(src, mask, vl) \ + rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl), \ + rvv_load_not_alpha_u8m1 (mask, vl), vl) + +static force_inline vuint16m2_t +rvv_convert_8888_to_0565_m2 (const vuint32m4_t s, size_t vl) +{ + vuint32m4_t rb = __riscv_vand (s, 0xF800F8, vl); + + return __riscv_vor ( + __riscv_vor (__riscv_vnsrl (rb, 3, vl), __riscv_vnsrl (rb, 8, vl), vl), + __riscv_vand (__riscv_vnsrl (s, 5, vl), 0x7E0, vl), vl); +} + +static force_inline vuint32m4_t +rvv_convert_0565_to_0888_m4 (const vuint16m2_t s, size_t vl) +{ + vuint8m1_t g1, g2; + vuint16m2_t r, g_w, b; + vuint32m4_t r_w, rb_w; + + r = __riscv_vand (s, 0xF800, vl); + b = __riscv_vand (s, 0x001F, vl); + r_w = __riscv_vwmulu (r, 1 << 8, vl); + rb_w = __riscv_vwmaccu (r_w, 1 << 3, b, vl); + rb_w = __riscv_vand (__riscv_vor (rb_w, __riscv_vsrl (rb_w, 5, vl), vl), + 0xFF00FF, vl); + + g1 = __riscv_vsll (__riscv_vnsrl (s, 5, vl), 2, vl); + g2 = __riscv_vsrl (g1, 6, vl); + g_w = __riscv_vwaddu_vv (g1, g2, vl); + + return __riscv_vwmaccu (rb_w, 1 << 8, g_w, vl); +} + +#define rvv_convert_0565_to_8888_m4(s, vl) \ + __riscv_vor (rvv_convert_0565_to_0888_m4 (s, vl), 0xff000000, vl) + +#define __rvv_combine_mask_value_ca(LMUL, src, mask, vl) \ + rvv_UN8x4_MUL_UN8x4_vv_##LMUL (src, mask, vl) +#define rvv_combine_mask_value_ca_m2(src, mask, vl) \ + __rvv_combine_mask_value_ca (m2, src, mask, vl) +#define rvv_combine_mask_value_ca_m4(src, mask, vl) \ + __rvv_combine_mask_value_ca (m4, src, mask, vl) + +#define __rvv_combine_mask_alpha_ca(LMUL, src, mask, vl) \ + rvv_UN8x4_MUL_UN16_vv_##LMUL (mask, rvv_shift_alpha_u16 (src, vl), vl) +#define rvv_combine_mask_alpha_ca_m2(src, mask, vl) \ + __rvv_combine_mask_alpha_ca (m2, src, mask, vl) +#define rvv_combine_mask_alpha_ca_m4(src, mask, vl) \ + __rvv_combine_mask_alpha_ca (m4, src, mask, vl) + +#define __rvv_combine_mask(LMUL, src, mask, vl) \ + rvv_UN8x4_MUL_UN16_vv_##LMUL (src, rvv_shift_alpha_u16 (mask, vl), vl) +#define rvv_combine_mask_m2(src, mask, vl) \ + __rvv_combine_mask (m2, src, mask, vl) +#define rvv_combine_mask_m4(src, mask, vl) \ + __rvv_combine_mask (m4, src, mask, vl) + +#define __rvv_combine_mask_ca(LMUL) \ + static force_inline void rvv_combine_mask_ca_##LMUL ( \ + VUINT32 (LMUL) *__restrict__ src, VUINT32 (LMUL) *__restrict__ mask, \ + size_t vl) \ + { \ + VUINT32 (LMUL) src_cpy = *src; \ + *(src) = rvv_combine_mask_value_ca_##LMUL (*(src), *(mask), vl); \ + *(mask) = rvv_combine_mask_alpha_ca_##LMUL (src_cpy, *(mask), vl); \ + } +__rvv_combine_mask_ca (m2); +__rvv_combine_mask_ca (m4); + +static void +rvv_combine_clear 
(pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *pd = dest; + + vuint32m8_t v = __riscv_vmv_v_x_u32m8 (0, __riscv_vsetvlmax_e32m8 ()); + RVV_FOREACH_1 (width, vl, e32m8, pd) { __riscv_vse32 (pd, v, vl); } +} + +static void +rvv_combine_src_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pm, vl), vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m8, ps, pd) + { + __riscv_vse32 (pd, __riscv_vle32_v_u32m8 (ps, vl), vl); + } + } +} + +static void +rvv_combine_over_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_over_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), + __riscv_vle32_v_u32m4 (pd, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, + rvv_over_m4 (__riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pd, vl), vl), + vl); + } + } +} + +static void +rvv_combine_over_reverse_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl), + rvv_in_load_s_m_m4 (ps, pm, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, + rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl), + __riscv_vle32_v_u32m4 (ps, vl), vl), + vl); + } + } +} + +static void +rvv_combine_in_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), + rvv_load_alpha_u8m1 (pd, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pd, vl), vl); + } + } +} + +static void +rvv_combine_in_reverse_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl), + rvv_UN8_MUL_UN8_vv_m1 ( + 
rvv_load_alpha_u8m1 (ps, vl), + rvv_load_alpha_u8m1 (pm, vl), vl), + vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, rvv_in_load_s_m_m4 (pd, ps, vl), vl); + } + } +} + +static void +rvv_combine_out_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), + rvv_load_not_alpha_u8m1 (pd, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (ps, pd, vl), vl); + } + } +} + +static void +rvv_combine_out_reverse_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl), + __riscv_vnot (rvv_UN8_MUL_UN8_vv_m1 ( + rvv_load_alpha_u8m1 (ps, vl), + rvv_load_alpha_u8m1 (pm, vl), vl), + vl), + vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (pd, ps, vl), vl); + } + } +} + +static void +rvv_combine_atop_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + vuint32m4_t s, d; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = rvv_in_load_s_m_m4 (ps, pm, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl); + } + } +} + +static void +rvv_combine_atop_reverse_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + vuint32m4_t s, d; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = rvv_in_load_s_m_m4 (ps, pm, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_alpha_u16 (s, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_alpha_u16 (s, vl), vl), + vl); + } + 
} +} + +static void +rvv_combine_xor_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + vuint32m4_t s, d; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = rvv_in_load_s_m_m4 (ps, pm, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl); + } + } +} + +static void +rvv_combine_add_u (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), + rvv_in_load_s_m_m4 (ps, pm, vl), vl), + vl); + } + } + else + { + RVV_FOREACH_2 (width, vl, e32m4, ps, pd) + { + __riscv_vse32 ( + pd, + rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), + __riscv_vle32_v_u32m4 (ps, vl), vl), + vl); + } + } +} + +/* + * Multiply + * + * ad * as * B(d / ad, s / as) + * = ad * as * d/ad * s/as + * = d * s + * + */ +static void +rvv_combine_multiply_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t s, d; + if (mask) + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = rvv_in_load_s_m_m4 (ps, pm, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + + __riscv_vse32 (pd, + rvv_UN8x4_ADD_UN8x4_vv_m4 ( + rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl), + vl); + } + } + else + { + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + d = __riscv_vle32_v_u32m4 (pd, vl); + + __riscv_vse32 (pd, + rvv_UN8x4_ADD_UN8x4_vv_m4 ( + rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), + rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + s, rvv_shift_not_alpha_u16 (d, vl), d, + rvv_shift_not_alpha_u16 (s, vl), vl), + vl), + vl); + } + } +} + +static void +rvv_combine_multiply_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t s, m, d; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + m = __riscv_vle32_v_u32m4 (pm, vl); + rvv_combine_mask_ca_m4 (&s, &m, vl); + + d = __riscv_vle32_v_u32m4 (pd, vl); + + __riscv_vse32 (pd, + 
rvv_UN8x4_ADD_UN8x4_vv_m4 ( + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + d, __riscv_vnot (m, vl), s, + rvv_shift_not_alpha_u16 (d, vl), vl), + rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), vl), + vl); + } +} + +#define PDF_SEPARABLE_BLEND_MODE(name) \ + static void rvv_combine_##name##_u ( \ + pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest, \ + const uint32_t *src, const uint32_t *mask, int width) \ + { \ + uint32_t *__restrict__ pd = dest; \ + const uint32_t *__restrict__ ps = src; \ + const uint32_t *__restrict__ pm = mask; \ + \ + vuint32m2_t s, d, ra, rx; \ + vuint16m1_t da, sa; \ + size_t vl4; \ + vuint8m2_t s4, d4, sa4, isa4, da4, ida4; \ + vuint32m8_t rx4; \ + \ + RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd) \ + { \ + vl4 = vl * 4; \ + \ + s = __riscv_vle32_v_u32m2 (ps, vl); \ + if (mask) \ + s = rvv_combine_mask_m2 (s, __riscv_vle32_v_u32m2 (pm, vl), \ + vl); \ + sa = rvv_shift_alpha_u16 (s, vl); \ + \ + d = __riscv_vle32_v_u32m2 (pd, vl); \ + da = rvv_shift_alpha_u16 (d, vl); \ + \ + ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \ + __riscv_vmul (sa, 0xFF, vl), \ + vl), \ + __riscv_vwmulu (sa, da, vl), vl); \ + \ + s4 = RVV_U32_U8x4_m2 (s); \ + sa4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (sa, vl)); \ + isa4 = __riscv_vnot (sa4, vl4); \ + d4 = RVV_U32_U8x4_m2 (d); \ + da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl)); \ + ida4 = __riscv_vnot (da4, vl4); \ + \ + rx4 = __riscv_vadd ( \ + __riscv_vwaddu_vv (__riscv_vwmulu (isa4, d4, vl4), \ + __riscv_vwmulu (ida4, s4, vl4), vl4), \ + rvv_blend_##name##_int (d4, da4, s4, sa4, vl4), vl4); \ + \ + ra = __riscv_vminu (ra, 255 * 255, vl); \ + rx4 = __riscv_vminu (rx4, 255 * 255, vl4); \ + \ + ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl); \ + rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4)); \ + \ + __riscv_vse32 (pd, \ + __riscv_vor (__riscv_vsll (ra, 24, vl), \ + __riscv_vand (rx, 0x00FFFFFF, vl), \ + vl), \ + vl); \ + } \ + } \ + \ + static void rvv_combine_##name##_ca ( \ + pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest, \ + const uint32_t *src, const uint32_t *mask, int width) \ + { \ + uint32_t *__restrict__ pd = dest; \ + const uint32_t *__restrict__ ps = src; \ + const uint32_t *__restrict__ pm = mask; \ + \ + vuint32m2_t s, m, d, ra, rx; \ + vuint16m1_t da, sa; \ + size_t vl4; \ + vuint8m2_t s4, m4, d4, ixa4, da4, ida4; \ + vuint32m8_t rx4; \ + \ + RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd) \ + { \ + m = __riscv_vle32_v_u32m2 (pm, vl); \ + s = __riscv_vle32_v_u32m2 (ps, vl); \ + rvv_combine_mask_ca_m2 (&s, &m, vl); \ + sa = rvv_shift_alpha_u16 (s, vl); \ + \ + d = __riscv_vle32_v_u32m2 (pd, vl); \ + da = rvv_shift_alpha_u16 (d, vl); \ + \ + ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \ + __riscv_vmul (sa, 0xFF, vl), \ + vl), \ + __riscv_vwmulu (sa, da, vl), vl); \ + \ + ixa4 = RVV_U32_U8x4_m2 (__riscv_vnot (m, vl)); \ + d4 = RVV_U32_U8x4_m2 (d); \ + ida4 = RVV_U32_U8x4_m2 ( \ + __riscv_vnot (rvv_UN16_bcast_UN8x4_v_m2 (da, vl), vl)); \ + s4 = RVV_U32_U8x4_m2 (s); \ + da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl)); \ + m4 = RVV_U32_U8x4_m2 (m); \ + \ + vl4 = vl * 4; \ + rx4 = __riscv_vadd ( \ + __riscv_vwaddu_vv (__riscv_vwmulu (ixa4, d4, vl4), \ + __riscv_vwmulu (ida4, s4, vl4), vl4), \ + rvv_blend_##name##_int (d4, da4, s4, m4, vl4), vl4); \ + \ + ra = __riscv_vminu (ra, 255 * 255, vl); \ + rx4 = __riscv_vminu (rx4, 255 * 255, vl4); \ + \ + ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl); \ + rx = RVV_U8x4_U32_m2 
(rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4)); \ + \ + __riscv_vse32 (pd, \ + __riscv_vor (__riscv_vsll (ra, 24, vl), \ + __riscv_vand (rx, 0x00FFFFFF, vl), \ + vl), \ + vl); \ + } \ + } + +static force_inline vuint32m8_t +rvv_blend_screen_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl), + __riscv_vwmulu (d, as, vl), vl), + __riscv_vwcvtu_x (__riscv_vwmulu (s, d, vl), vl), vl); +} + +PDF_SEPARABLE_BLEND_MODE (screen) + +static force_inline vuint32m8_t +_rvv_blend_overlay_hard_light (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + const vbool4_t selector, + size_t vl) +{ + vuint32m8_t out_true = __riscv_vwmulu (__riscv_vwmulu (s, d, vl), 2, vl); + + vint16m4_t d_i = rvv_u8m2_to_i16m4 (d, vl); + vint16m4_t ad_i = rvv_u8m2_to_i16m4 (ad, vl); + vint16m4_t s_i = rvv_u8m2_to_i16m4 (s, vl); + vint16m4_t as_i = rvv_u8m2_to_i16m4 (as, vl); + + vuint32m8_t out_false = __riscv_vreinterpret_v_i32m8_u32m8 (__riscv_vsub ( + __riscv_vwmul (as_i, ad_i, vl), + __riscv_vsll (__riscv_vwmul (__riscv_vsub (ad_i, d_i, vl), + __riscv_vsub (as_i, s_i, vl), vl), + 1, vl), + vl)); + + return __riscv_vmerge (out_false, out_true, selector, vl); +} + +static force_inline vuint32m8_t +rvv_blend_overlay_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return _rvv_blend_overlay_hard_light ( + d, ad, s, as, + __riscv_vmsltu (__riscv_vwmulu (d, 2, vl), __riscv_vwcvtu_x (ad, vl), + vl), + vl); +} + +PDF_SEPARABLE_BLEND_MODE (overlay) + +static force_inline vuint32m8_t +rvv_blend_darken_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return __riscv_vwcvtu_x (__riscv_vminu (__riscv_vwmulu (ad, s, vl), + __riscv_vwmulu (as, d, vl), vl), + vl); +} + +PDF_SEPARABLE_BLEND_MODE (darken) + +static force_inline vuint32m8_t +rvv_blend_lighten_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return __riscv_vwcvtu_x (__riscv_vmaxu (__riscv_vwmulu (as, d, vl), + __riscv_vwmulu (ad, s, vl), vl), + vl); +} + +PDF_SEPARABLE_BLEND_MODE (lighten) + +static force_inline vuint32m8_t +rvv_blend_hard_light_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return _rvv_blend_overlay_hard_light ( + d, ad, s, as, + __riscv_vmsltu (__riscv_vwmulu (s, 2, vl), __riscv_vwcvtu_x (as, vl), + vl), + vl); +} + +PDF_SEPARABLE_BLEND_MODE (hard_light) + +static force_inline vuint32m8_t +rvv_blend_difference_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + vuint16m4_t das = __riscv_vwmulu (d, as, vl); + vuint16m4_t sad = __riscv_vwmulu (s, ad, vl); + + return __riscv_vmerge (__riscv_vwsubu_vv (sad, das, vl), + __riscv_vwsubu_vv (das, sad, vl), + __riscv_vmsltu (sad, das, vl), vl); +} + +PDF_SEPARABLE_BLEND_MODE (difference) + +static force_inline vuint32m8_t +rvv_blend_exclusion_int (const vuint8m2_t d, + const vuint8m2_t ad, + const vuint8m2_t s, + const vuint8m2_t as, + size_t vl) +{ + return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl), + __riscv_vwmulu (d, as, vl), vl), + __riscv_vwmulu (__riscv_vwmulu (d, s, vl), 2, vl), vl); +} + +PDF_SEPARABLE_BLEND_MODE (exclusion) + +#undef PDF_SEPARABLE_BLEND_MODE + +static void +rvv_combine_over_ca (pixman_implementation_t 
*__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t s, m; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + m = __riscv_vle32_v_u32m4 (pm, vl); + rvv_combine_mask_ca_m4 (&s, &m, vl); + + __riscv_vse32 ( + pd, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( + __riscv_vle32_v_u32m4 (pd, vl), __riscv_vnot (m, vl), s, vl), + vl); + } +} + +static void +rvv_combine_over_reverse_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t d; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 ( + pd, + rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 ( + rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + rvv_shift_not_alpha_u16 (d, vl), d, vl), + vl); + } +} + +static void +rvv_combine_atop_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t d, s, m; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + m = __riscv_vle32_v_u32m4 (pm, vl); + rvv_combine_mask_ca_m4 (&s, &m, vl); + + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 ( + pd, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + d, __riscv_vnot (m, vl), s, rvv_shift_alpha_u16 (d, vl), vl), + vl); + } +} + +static void +rvv_combine_xor_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t d, s, m; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + m = __riscv_vle32_v_u32m4 (pm, vl); + rvv_combine_mask_ca_m4 (&s, &m, vl); + + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + d, __riscv_vnot (m, vl), s, + rvv_shift_not_alpha_u16 (d, vl), vl), + vl); + } +} + +static void +rvv_combine_atop_reverse_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + vuint32m4_t d, s, m; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + s = __riscv_vle32_v_u32m4 (ps, vl); + m = __riscv_vle32_v_u32m4 (pm, vl); + rvv_combine_mask_ca_m4 (&s, &m, vl); + + d = __riscv_vle32_v_u32m4 (pd, vl); + __riscv_vse32 (pd, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 ( + d, m, s, rvv_shift_not_alpha_u16 (d, vl), vl), + vl); + } +} + +static void +rvv_combine_src_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + 
uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_combine_mask_value_ca_m4 (__riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + vl); + } +} + +static void +rvv_combine_in_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_in_m4 (rvv_combine_mask_value_ca_m4 ( + __riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + rvv_load_alpha_u8m1 (pd, vl), vl), + vl); + } +} + +static void +rvv_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), + rvv_combine_mask_alpha_ca_m4 ( + __riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + vl), + vl); + } +} + +static void +rvv_combine_out_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 (pd, + rvv_in_m4 (rvv_combine_mask_value_ca_m4 ( + __riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + rvv_load_not_alpha_u8m1 (pd, vl), vl), + vl); + } +} + +static void +rvv_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_UN8x4_MUL_UN8x4_vv_m4 ( + __riscv_vle32_v_u32m4 (pd, vl), + __riscv_vnot_v_u32m4 (rvv_combine_mask_alpha_ca_m4 ( + __riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + vl), + vl), + vl); + } +} + +static void +rvv_combine_add_ca (pixman_implementation_t *__restrict__ imp, + pixman_op_t op, + uint32_t *__restrict__ dest, + const uint32_t *__restrict__ src, + const uint32_t *__restrict__ mask, + int width) +{ + uint32_t *__restrict__ pd = dest; + const uint32_t *__restrict__ ps = src; + const uint32_t *__restrict__ pm = mask; + + RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) + { + __riscv_vse32 ( + pd, + rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), + rvv_combine_mask_value_ca_m4 ( + __riscv_vle32_v_u32m4 (ps, vl), + __riscv_vle32_v_u32m4 (pm, vl), vl), + vl), + vl); + } +} + +static void +rvv_composite_src_x888_8888 (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *__restrict__ dst_line, *__restrict__ 
dst; + uint32_t *__restrict__ src_line, *__restrict__ src; + int32_t dst_stride, src_stride; + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e32m8, src, dst) + { + __riscv_vse32 ( + dst, + __riscv_vor (__riscv_vle32_v_u32m8 (src, vl), 0xff000000, vl), + vl); + } + } +} + +static void +rvv_composite_src_8888_8888 (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *__restrict__ dst_line, *__restrict__ dst; + uint32_t *__restrict__ src_line, *__restrict__ src; + int32_t dst_stride, src_stride; + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e32m8, src, dst) + { + __riscv_vse32 (dst, __riscv_vle32_v_u32m8 (src, vl), vl); + } + } +} + +static void +rvv_composite_over_x888_8_8888 (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *__restrict__ src, *__restrict__ src_line; + uint32_t *__restrict__ dst, *__restrict__ dst_line; + uint8_t *__restrict__ mask, *__restrict__ mask_line; + int32_t src_stride, mask_stride, dst_stride; + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_3 (width, vl, e32m4, src, mask, dst) + { + __riscv_vse32 ( + dst, + rvv_over_m4 ( + rvv_in_m4 (__riscv_vor (__riscv_vle32_v_u32m4 (src, vl), + 0xff000000, vl), + __riscv_vle8_v_u8m1 (mask, vl), vl), + __riscv_vle32_v_u32m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e32m4, src, dst) + { + __riscv_vse32 (dst, + rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl), + __riscv_vle32_v_u32m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *__restrict__ dst_line, *__restrict__ dst; + uint8_t *__restrict__ mask_line, *__restrict__ mask; + int dst_stride, mask_stride; + uint32_t src; + vuint32m4_t vsrc; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); + + 
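/*
 * Standalone reference sketch (not part of the upstream patch): the scalar
 * per-pixel work of the n_8_0565 loop below.  The a8 mask byte gates the
 * solid source, the result is OVER-composited onto the r5g6b5 destination
 * after widening it to 8888, and the composite is packed back to r5g6b5.
 * The two helpers show the widening/packing step only; their names are
 * hypothetical, and the bit replication is what the
 * rvv_convert_0565_to_0888 / rvv_convert_8888_to_0565 helpers are assumed
 * to perform.
 */
#include <stdint.h>

static inline uint32_t
scalar_0565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    /* widen 5/6-bit channels to 8 bits by replicating the top bits */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000u | (r << 16) | (g << 8) | b;
}

static inline uint16_t
scalar_8888_to_0565 (uint32_t p)
{
    /* keep the top 5/6/5 bits of r/g/b, drop alpha */
    return (uint16_t)(((p >> 8) & 0xf800) |
                      ((p >> 5) & 0x07e0) |
                      ((p >> 3) & 0x001f));
}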
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e16m2, mask, dst) + { + __riscv_vse16 ( + dst, + rvv_convert_8888_to_0565_m2 ( + rvv_over_m4 ( + rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl), + rvv_convert_0565_to_0888_m4 ( + __riscv_vle16_v_u16m2 (dst, vl), vl), + vl), + vl), + vl); + } + } +} + +static void +rvv_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t src; + vuint32m4_t vsrc; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e32m4, mask, dst) + { + __riscv_vse32 ( + dst, + rvv_over_m4 ( + rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl), + __riscv_vle32_v_u32m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t src; + vuint32m4_t vsrc; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, + mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e32m4, mask, dst) + { + __riscv_vse32 (dst, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( + __riscv_vle32_v_u32m4 (mask, vl), vsrc, + __riscv_vle32_v_u32m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_over_n_8888_8888_ca (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *__restrict__ dst_line, *__restrict__ dst; + uint32_t *__restrict__ mask_line, *__restrict__ mask; + int dst_stride, mask_stride; + uint32_t src, srca; + vuint32m4_t vsrc; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + srca = src >> 24; + vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, + mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e32m4, mask, dst) + { + vuint32m4_t m = __riscv_vle32_v_u32m4 (mask, vl); + __riscv_vse32 ( + dst, + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( + __riscv_vle32_v_u32m4 (dst, vl), + __riscv_vnot 
(rvv_UN8x4_MUL_UN8_vx_m4 (m, srca, vl), vl), + rvv_UN8x4_MUL_UN8x4_vv_m4 (m, vsrc, vl), vl), + vl); + } + } +} + +static void +rvv_composite_over_n_8888_0565_ca (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *__restrict__ dst_line, *__restrict__ dst; + uint32_t *__restrict__ mask_line, *__restrict__ mask; + int dst_stride, mask_stride; + uint32_t src, srca; + vuint32m4_t vsrc; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + srca = src >> 24; + if (src == 0) + return; + vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, + mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e32m4, mask, dst) + { + vuint32m4_t ma = __riscv_vle32_v_u32m4 (mask, vl); + + __riscv_vse16 ( + dst, + rvv_convert_8888_to_0565_m2 ( + rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( + rvv_convert_0565_to_0888_m4 ( + __riscv_vle16_v_u16m2 (dst, vl), vl), + __riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (ma, srca, vl), + vl), + rvv_UN8x4_MUL_UN8x4_vv_m4 (ma, vsrc, vl), vl), + vl), + vl); + } + } +} + +static void +rvv_composite_over_8888_0565 (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *__restrict__ dst_line, *__restrict__ dst; + uint32_t *__restrict__ src_line, *__restrict__ src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, + dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e16m2, src, dst) + { + __riscv_vse16 ( + dst, + rvv_convert_8888_to_0565_m2 ( + rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl), + rvv_convert_0565_to_0888_m4 ( + __riscv_vle16_v_u16m2 (dst, vl), vl), + vl), + vl), + vl); + } + } +} + +static void +rvv_composite_add_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, + src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, + dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e8m8, src, dst) + { + __riscv_vse8 (dst, + rvv_UN8_ADD_UN8_vv (__riscv_vle8_v_u8m8 (src, vl), + __riscv_vle8_v_u8m8 (dst, vl), + vl), + vl); + } + } +} + +static void +rvv_composite_add_0565_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint16_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, + src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, + dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e16m2, src, dst) + { + __riscv_vse16 (dst, + 
rvv_convert_8888_to_0565_m2 ( + rvv_UN8x4_ADD_UN8x4_vv_m4 ( + rvv_convert_0565_to_8888_m4 ( + __riscv_vle16_v_u16m2 (src, vl), vl), + rvv_convert_0565_to_8888_m4 ( + __riscv_vle16_v_u16m2 (dst, vl), vl), + vl), + vl), + vl); + } + } +} + +static void +rvv_composite_add_8888_8888 (pixman_implementation_t *__restrict__ imp, + pixman_composite_info_t *__restrict__ info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *__restrict__ dst_line, *__restrict__ dst; + uint32_t *__restrict__ src_line, *__restrict__ src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e32m4, src, dst) + { + __riscv_vse32 ( + dst, + rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (src, vl), + __riscv_vle32_v_u32m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + sa = (src >> 24); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e8m4, mask, dst) + { + __riscv_vse8 ( + dst, + rvv_UN8_ADD_UN8_vv (rvv_UN8_MUL_UN8_vx_m4 ( + __riscv_vle8_v_u8m4 (mask, vl), sa, vl), + __riscv_vle8_v_u8m4 (dst, vl), vl), + vl); + } + } +} + +static void +rvv_composite_src_memcpy (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; + uint32_t n_bytes = width * bpp; + int dst_stride, src_stride; + uint8_t *dst; + uint8_t *src; + + src_stride = src_image->bits.rowstride * 4; + dst_stride = dest_image->bits.rowstride * 4; + + src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; + dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + + while (height--) + { + memcpy (dst, src, n_bytes); + + dst += dst_stride; + src += src_stride; + } +} + +static void +rvv_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + srca = src >> 24; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); + + if (srca == 0xff) + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + RVV_FOREACH_2 (width, vl, e8m4, mask, dst) + { + __riscv_vse8 ( + dst, + rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (mask, vl), + __riscv_vle8_v_u8m4 (dst, vl), vl), + vl); + } + } + } + else + { + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + 
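/*
 * Standalone reference sketch (not part of the upstream patch): the
 * strip-mining shape that the RVV_FOREACH_* macros used throughout this
 * file (defined earlier, outside this hunk) are assumed to expand to.
 * Each iteration asks the hardware for a vector length, processes that
 * many elements, and advances every stream pointer by the same amount.
 * The function name and the saturating-add body are illustrative only.
 */
#include <stdint.h>
#include <riscv_vector.h>

static void
assumed_foreach2_shape_e8m4 (int width, const uint8_t *mask, uint8_t *dst)
{
    while (width > 0)
    {
        size_t vl = __riscv_vsetvl_e8m4 ((size_t)width);

        /* body: load up to vl bytes from each stream, combine, store */
        vuint8m4_t m = __riscv_vle8_v_u8m4 (mask, vl);
        vuint8m4_t d = __riscv_vle8_v_u8m4 (dst, vl);
        __riscv_vse8 (dst, __riscv_vsaddu (m, d, vl), vl);

        mask  += vl;
        dst   += vl;
        width -= (int)vl;
    }
}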
RVV_FOREACH_2 (width, vl, e8m4, mask, dst) + { + __riscv_vse8 (dst, + rvv_UN8_MUL_UN8_vv_m4 ( + rvv_UN8_MUL_UN8_vx_m4 ( + __riscv_vle8_v_u8m4 (mask, vl), srca, vl), + __riscv_vle8_v_u8m4 (dst, vl), vl), + vl); + } + } + } +} + +static void +rvv_composite_in_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, + src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, + dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + RVV_FOREACH_2 (width, vl, e8m4, src, dst) + { + __riscv_vse8 (dst, + rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (src, vl), + __riscv_vle8_v_u8m4 (dst, vl), + vl), + vl); + } + } +} + +#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs)) + +/* + * There is some potential for hand vectorization, but for now let's leave it + * autovectorized. + */ +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ + if (offs) + { + int leading_pixels = 32 - offs; + if (leading_pixels >= width) + { + if (v) + *dst |= A1_FILL_MASK (width, offs); + else + *dst &= ~A1_FILL_MASK (width, offs); + return; + } + else + { + if (v) + *dst++ |= A1_FILL_MASK (leading_pixels, offs); + else + *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); + width -= leading_pixels; + } + } + while (width >= 32) + { + if (v) + *dst++ = 0xFFFFFFFF; + else + *dst++ = 0; + width -= 32; + } + if (width > 0) + { + if (v) + *dst |= A1_FILL_MASK (width, 0); + else + *dst &= ~A1_FILL_MASK (width, 0); + } +} + +static void +rvv_fill1 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t filler) +{ + uint32_t *dst = bits + y * stride + (x >> 5); + int offs = x & 31; + + while (height--) + { + pixman_fill1_line (dst, offs, width, (filler & 1)); + dst += stride; + } +} + +#define RVV_FILL(dtypew) \ + static void rvv_fill_u##dtypew (uint32_t *__restrict__ bits, int stride, \ + int x, int y, int width, int height, \ + uint32_t filler) \ + { \ + uint##dtypew##_t *__restrict__ bitsw = (uint##dtypew##_t *)bits; \ + int32_t vstride = stride * (32 / dtypew); \ + vuint##dtypew##m8_t vfiller = __riscv_vmv_v_x_u##dtypew##m8 ( \ + (uint##dtypew##_t)filler, __riscv_vsetvlmax_e##dtypew##m8 ()); \ + \ + bitsw += y * vstride + x; \ + while (height--) \ + { \ + uint##dtypew##_t *__restrict__ d = bitsw; \ + \ + RVV_FOREACH_1 (width, vl, e##dtypew##m8, d) \ + { \ + __riscv_vse##dtypew (d, vfiller, vl); \ + } \ + \ + bitsw += vstride; \ + } \ + } + +RVV_FILL (8); +RVV_FILL (16); +RVV_FILL (32); + +static pixman_bool_t +rvv_fill (pixman_implementation_t *__restrict__ imp, + uint32_t *__restrict__ bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t filler) +{ + switch (bpp) + { + case 1: + rvv_fill1 (bits, stride, x, y, width, height, filler); + break; + case 8: + rvv_fill_u8 (bits, stride, x, y, width, height, filler); + break; + case 16: + rvv_fill_u16 (bits, stride, x, y, width, height, filler); + break; + case 32: + rvv_fill_u32 (bits, stride, x, y, width, height, filler); + break; + default: + return FALSE; + } + + return TRUE; +} + +static void +rvv_composite_solid_fill (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + + src = _pixman_image_get_solid (imp, 
src_image, dest_image->bits.format); + + if (dest_image->bits.format == PIXMAN_a1) + { + src = src >> 31; + } + else if (dest_image->bits.format == PIXMAN_a8) + { + src = src >> 24; + } + else if (dest_image->bits.format == PIXMAN_r5g6b5 || + dest_image->bits.format == PIXMAN_b5g6r5) + { + src = convert_8888_to_0565 (src); + } + + rvv_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y, + width, height, src); +} + +#define RVV_BLT(dtypew) \ + static void rvv_blt_u##dtypew ( \ + uint32_t *__restrict__ src_bits, uint32_t *__restrict__ dst_bits, \ + int src_stride, int dst_stride, int src_x, int src_y, int dest_x, \ + int dest_y, int width, int height) \ + { \ + uint##dtypew##_t *src_w = (uint##dtypew##_t *)src_bits; \ + uint##dtypew##_t *dst_w = (uint##dtypew##_t *)dst_bits; \ + \ + src_stride = src_stride * (32 / dtypew); \ + dst_stride = dst_stride * (32 / dtypew); \ + \ + src_w += src_stride * src_y + src_x; \ + dst_w += dst_stride * dest_y + dest_x; \ + \ + while (height--) \ + { \ + uint##dtypew##_t *__restrict__ pd = dst_w; \ + uint##dtypew##_t *__restrict__ ps = src_w; \ + \ + RVV_FOREACH_2 (width, vl, e##dtypew##m8, ps, pd) \ + { \ + __riscv_vse##dtypew ( \ + pd, __riscv_vle##dtypew##_v_u##dtypew##m8 (ps, vl), vl); \ + } \ + \ + dst_w += dst_stride; \ + src_w += src_stride; \ + } \ + } +RVV_BLT (8); +RVV_BLT (16); +RVV_BLT (32); + +static pixman_bool_t +rvv_blt (pixman_implementation_t *__restrict__ imp, + uint32_t *__restrict__ src_bits, + uint32_t *__restrict__ dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + if (src_bpp != dst_bpp) + return FALSE; + + switch (src_bpp) + { + case 8: + rvv_blt_u8 (src_bits, dst_bits, src_stride, dst_stride, src_x, + src_y, dest_x, dest_y, width, height); + break; + case 16: + rvv_blt_u16 (src_bits, dst_bits, src_stride, dst_stride, src_x, + src_y, dest_x, dest_y, width, height); + break; + case 32: + rvv_blt_u32 (src_bits, dst_bits, src_stride, dst_stride, src_x, + src_y, dest_x, dest_y, width, height); + break; + default: + return FALSE; + } + + return TRUE; +} + +// clang-format off static const pixman_fast_path_t rvv_fast_paths[] = { + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, rvv_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, rvv_composite_over_n_8_0565), + // PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, rvv_composite_over_n_8_0888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, rvv_composite_over_n_8_0888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, rvv_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, rvv_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, rvv_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, rvv_composite_over_n_8_8888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, rvv_composite_over_n_1_8888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, rvv_composite_over_n_1_8888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, rvv_composite_over_n_1_8888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, rvv_composite_over_n_1_8888), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, rvv_composite_over_n_1_0565), + // PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, rvv_composite_over_n_1_0565), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, rvv_composite_over_n_8888_8888_ca), + 
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, rvv_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, rvv_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, rvv_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, rvv_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, rvv_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, rvv_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, rvv_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, rvv_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, rvv_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, rvv_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, rvv_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, rvv_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, rvv_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, rvv_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, rvv_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, rvv_composite_add_0565_0565), + PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, rvv_composite_add_0565_0565), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, rvv_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, rvv_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, rvv_composite_add_8_8), + // PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1), + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, rvv_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, rvv_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, rvv_composite_solid_fill), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, rvv_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, rvv_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, 
b5g6r5, null, b5g6r5, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, rvv_composite_src_memcpy), + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, rvv_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, rvv_composite_in_n_8_8), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888), + {PIXMAN_OP_NONE}, }; -// clang-format off pixman_implementation_t * _pixman_implementation_create_rvv (pixman_implementation_t *fallback) { - pixman_implementation_t *imp = _pixman_implementation_create (fallback, rvv_fast_paths); + pixman_implementation_t *imp = _pixman_implementation_create ( + fallback, rvv_fast_paths); + // clang-format off imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float; imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float; imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float; @@ -955,7 +3192,7 @@ _pixman_implementation_create_rvv (pixman_implementation_t *fallback) imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float; imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float; imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float; - imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] =rvv_combine_conjoint_in_reverse_ca_float; + imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_ca_float; imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float; imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float; imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float; @@ -981,7 +3218,54 @@ _pixman_implementation_create_rvv (pixman_implementation_t *fallback) imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float; imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float; + /* Set up function pointers */ + imp->combine_32[PIXMAN_OP_CLEAR] = rvv_combine_clear; + imp->combine_32[PIXMAN_OP_SRC] = rvv_combine_src_u; + imp->combine_32[PIXMAN_OP_OVER] = rvv_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = rvv_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = rvv_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = rvv_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = rvv_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = rvv_combine_add_u; + + imp->combine_32[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u; + imp->combine_32[PIXMAN_OP_SCREEN] = rvv_combine_screen_u; + imp->combine_32[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u; + imp->combine_32[PIXMAN_OP_DARKEN] = rvv_combine_darken_u; + imp->combine_32[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u; + imp->combine_32[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u; + imp->combine_32[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u; + 
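/*
 * Standalone reference sketch (not part of the upstream patch): the scalar
 * form of the PDF separable blend modes whose RVV combiners are registered
 * here, shown for "screen".  The PDF_SEPARABLE_BLEND_MODE expansion earlier
 * works in 255-scaled fixed point: rx = (1 - sa)*d + (1 - da)*s +
 * blend(d, da, s, sa), clamped to 255*255 and divided by 255 with rounding;
 * the alpha channel gets ra = sa + da - sa*da.  Function names below are
 * hypothetical.
 */
#include <stdint.h>

static inline uint8_t
scalar_div_255 (uint32_t x)
{
    /* rounded x / 255 for x <= 255 * 255 */
    x += 0x80;
    return (uint8_t)((x + (x >> 8)) >> 8);
}

static inline uint32_t
scalar_blend_screen_u (uint32_t s, uint32_t d)
{
    uint32_t sa = s >> 24;
    uint32_t da = d >> 24;

    /* ra = sa + da - sa*da, kept scaled by 255 until the final division */
    uint32_t r = (uint32_t)scalar_div_255 (sa * 255 + da * 255 - sa * da) << 24;

    for (int shift = 0; shift < 24; shift += 8)
    {
        uint32_t sc = (s >> shift) & 0xff;
        uint32_t dc = (d >> shift) & 0xff;

        /* screen blend term, scaled by 255: s*da + d*sa - s*d */
        uint32_t blend = sc * da + dc * sa - sc * dc;
        uint32_t rx    = (255 - sa) * dc + (255 - da) * sc + blend;

        if (rx > 255 * 255) /* mirror the vector code's vminu clamp */
            rx = 255 * 255;

        r |= (uint32_t)scalar_div_255 (rx) << shift;
    }

    return r;
}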
imp->combine_32[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u; + + imp->combine_32_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear; + imp->combine_32_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = rvv_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca; + + imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca; + imp->combine_32_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca; + imp->combine_32_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca; + imp->combine_32_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca; + imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca; + imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca; + imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca; + imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca; + + imp->fill = rvv_fill; + imp->blt = rvv_blt; + return imp; } - -// clang-format on \ No newline at end of file +// clang-format on diff --git a/gfx/cairo/libpixman/src/pixman-utils.c b/gfx/cairo/libpixman/src/pixman-utils.c index 302cd0c290c9..b171af982eb7 100644 --- a/gfx/cairo/libpixman/src/pixman-utils.c +++ b/gfx/cairo/libpixman/src/pixman-utils.c @@ -303,6 +303,43 @@ pixman_region32_copy_from_region16 (pixman_region32_t *dst, return retval; } +pixman_bool_t +pixman_region32_copy_from_region64f (pixman_region32_t *dst, + const pixman_region64f_t *src) +{ + int n_boxes, i; + pixman_box64f_t *boxes64f; + pixman_box32_t *boxes32; + pixman_box32_t tmp_boxes[N_TMP_BOXES]; + pixman_bool_t retval; + + boxes64f = pixman_region64f_rectangles (src, &n_boxes); + + if (n_boxes > N_TMP_BOXES) + boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t)); + else + boxes32 = tmp_boxes; + + if (!boxes32) + return FALSE; + + for (i = 0; i < n_boxes; ++i) + { + boxes32[i].x1 = boxes64f[i].x1; + boxes32[i].y1 = boxes64f[i].y1; + boxes32[i].x2 = boxes64f[i].x2; + boxes32[i].y2 = boxes64f[i].y2; + } + + pixman_region32_fini (dst); + retval = pixman_region32_init_rects (dst, boxes32, n_boxes); + + if (boxes32 != tmp_boxes) + free (boxes32); + + return retval; +} + /* This function is exported for the sake of the test suite and not part * of the ABI. 
*/ diff --git a/gfx/cairo/libpixman/src/pixman-version.h b/gfx/cairo/libpixman/src/pixman-version.h index 5767975fcda7..3505c5f6fa60 100644 --- a/gfx/cairo/libpixman/src/pixman-version.h +++ b/gfx/cairo/libpixman/src/pixman-version.h @@ -32,10 +32,10 @@ #endif #define PIXMAN_VERSION_MAJOR 0 -#define PIXMAN_VERSION_MINOR 44 -#define PIXMAN_VERSION_MICRO 2 +#define PIXMAN_VERSION_MINOR 46 +#define PIXMAN_VERSION_MICRO 0 -#define PIXMAN_VERSION_STRING "0.44.2" +#define PIXMAN_VERSION_STRING "0.46.0" #define PIXMAN_VERSION_ENCODE(major, minor, micro) ( \ ((major) * 10000) \ diff --git a/gfx/cairo/libpixman/src/pixman-vmx.c b/gfx/cairo/libpixman/src/pixman-vmx.c index ceac1f3189db..399cfcc64579 100644 --- a/gfx/cairo/libpixman/src/pixman-vmx.c +++ b/gfx/cairo/libpixman/src/pixman-vmx.c @@ -28,121 +28,121 @@ #ifdef HAVE_CONFIG_H #include #endif -#include "pixman-private.h" #include "pixman-combine32.h" #include "pixman-inlines.h" +#include "pixman-private.h" #include -#define AVV(x...) {x} +static const vector unsigned char vzero = (const vector unsigned char){0}; +static vector unsigned char mask_ff000000; -static vector unsigned int mask_ff000000; -static vector unsigned int mask_red; -static vector unsigned int mask_green; -static vector unsigned int mask_blue; -static vector unsigned int mask_565_fix_rb; -static vector unsigned int mask_565_fix_g; - -static force_inline vector unsigned int -splat_alpha (vector unsigned int pix) +static force_inline vector unsigned char +splat_alpha (vector unsigned char pix) { + const vector unsigned char sel = (vector unsigned char){ #ifdef WORDS_BIGENDIAN - return vec_perm (pix, pix, - (vector unsigned char)AVV ( - 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, - 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C)); + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C, #else - return vec_perm (pix, pix, - (vector unsigned char)AVV ( - 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07, - 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F)); + 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07, + 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F, #endif + }; + + return vec_perm (pix, pix, sel); +} + +static force_inline vector unsigned char +splat_pixel (vector unsigned char pix) +{ + const vector unsigned char sel = (vector unsigned char){ + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + }; + + return vec_perm (pix, pix, sel); +} + +static force_inline vector unsigned short +create_mask_16_128 (uint32_t mask) +{ + return (vector unsigned short){mask, mask, mask, mask, + mask, mask, mask, mask}; } static force_inline vector unsigned int -splat_pixel (vector unsigned int pix) +create_mask_32_128 (uint32_t mask) { - return vec_perm (pix, pix, - (vector unsigned char)AVV ( - 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03)); + return (vector unsigned int){mask, mask, mask, mask}; } -static force_inline vector unsigned int -pix_multiply (vector unsigned int p, vector unsigned int a) +static force_inline vector unsigned char +unpacklo_128_16x8 (vector unsigned char data1, vector unsigned char data2) { - vector unsigned short hi, lo, mod; - - /* unpack to short */ - hi = (vector unsigned short) #ifdef WORDS_BIGENDIAN - vec_mergeh ((vector unsigned char)AVV (0), - (vector unsigned char)p); + return vec_mergel (data2, data1); #else - vec_mergeh ((vector unsigned char) p, - (vector unsigned char) AVV (0)); + return vec_mergel (data1, 
data2); #endif - - mod = (vector unsigned short) -#ifdef WORDS_BIGENDIAN - vec_mergeh ((vector unsigned char)AVV (0), - (vector unsigned char)a); -#else - vec_mergeh ((vector unsigned char) a, - (vector unsigned char) AVV (0)); -#endif - - hi = vec_mladd (hi, mod, (vector unsigned short) - AVV (0x0080, 0x0080, 0x0080, 0x0080, - 0x0080, 0x0080, 0x0080, 0x0080)); - - hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); - - hi = vec_sr (hi, vec_splat_u16 (8)); - - /* unpack to short */ - lo = (vector unsigned short) -#ifdef WORDS_BIGENDIAN - vec_mergel ((vector unsigned char)AVV (0), - (vector unsigned char)p); -#else - vec_mergel ((vector unsigned char) p, - (vector unsigned char) AVV (0)); -#endif - - mod = (vector unsigned short) -#ifdef WORDS_BIGENDIAN - vec_mergel ((vector unsigned char)AVV (0), - (vector unsigned char)a); -#else - vec_mergel ((vector unsigned char) a, - (vector unsigned char) AVV (0)); -#endif - - lo = vec_mladd (lo, mod, (vector unsigned short) - AVV (0x0080, 0x0080, 0x0080, 0x0080, - 0x0080, 0x0080, 0x0080, 0x0080)); - - lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); - - lo = vec_sr (lo, vec_splat_u16 (8)); - - return (vector unsigned int)vec_packsu (hi, lo); } -static force_inline vector unsigned int -pix_add (vector unsigned int a, vector unsigned int b) +static force_inline vector unsigned char +unpackhi_128_16x8 (vector unsigned char data1, vector unsigned char data2) { - return (vector unsigned int)vec_adds ((vector unsigned char)a, - (vector unsigned char)b); +#ifdef WORDS_BIGENDIAN + return vec_mergeh (data2, data1); +#else + return vec_mergeh (data1, data2); +#endif } -static force_inline vector unsigned int -pix_add_mul (vector unsigned int x, - vector unsigned int a, - vector unsigned int y, - vector unsigned int b) +static force_inline void +unpack_128_2x128 (vector unsigned char data1, + vector unsigned char data2, + vector unsigned char *data_lo, + vector unsigned char *data_hi) { - vector unsigned int t1, t2; + *data_lo = unpacklo_128_16x8 (data1, data2); + *data_hi = unpackhi_128_16x8 (data1, data2); +} + +static force_inline vector unsigned char +pix_multiply (vector unsigned char a, vector unsigned char b) +{ + const vector unsigned char sel = (vector unsigned char){ +#ifdef WORDS_BIGENDIAN + 0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, + 0x08, 0x18, 0x0a, 0x1a, 0x0c, 0x1c, 0x0e, 0x1e, +#else + 0x01, 0x11, 0x03, 0x13, 0x05, 0x15, 0x07, 0x17, + 0x09, 0x19, 0x0b, 0x1b, 0x0d, 0x1d, 0x0f, 0x1f, +#endif + }; + vector unsigned short e = vec_mule (a, b); + vector unsigned short o = vec_mulo (a, b); + + e = vec_adds (e, create_mask_16_128 (128)); + o = vec_adds (o, create_mask_16_128 (128)); + + e = vec_adds (e, vec_sr (e, vec_splat_u16 (8))); + o = vec_adds (o, vec_sr (o, vec_splat_u16 (8))); + + return (vector unsigned char)vec_perm (e, o, sel); +} + +static force_inline vector unsigned char +pix_add (vector unsigned char a, vector unsigned char b) +{ + return vec_adds (a, b); +} + +static force_inline vector unsigned char +pix_add_mul (vector unsigned char x, + vector unsigned char a, + vector unsigned char y, + vector unsigned char b) +{ + vector unsigned char t1, t2; t1 = pix_multiply (x, a); t2 = pix_multiply (y, b); @@ -150,67 +150,65 @@ pix_add_mul (vector unsigned int x, return pix_add (t1, t2); } -static force_inline vector unsigned int -negate (vector unsigned int src) +static force_inline vector unsigned char +negate (vector unsigned char src) { return vec_nor (src, src); } /* dest*~srca + src */ -static force_inline vector unsigned int 
-over (vector unsigned int src, - vector unsigned int srca, - vector unsigned int dest) +static force_inline vector unsigned char +over (vector unsigned char src, + vector unsigned char srca, + vector unsigned char dest) { - vector unsigned char tmp = (vector unsigned char) - pix_multiply (dest, negate (srca)); - - tmp = vec_adds ((vector unsigned char)src, tmp); - return (vector unsigned int)tmp; + return vec_adds (src, pix_multiply (dest, negate (srca))); } /* in == pix_multiply */ -#define in_over(src, srca, mask, dest) \ - over (pix_multiply (src, mask), \ - pix_multiply (srca, mask), dest) +static force_inline vector unsigned char +in_over (vector unsigned char src, + vector unsigned char srca, + vector unsigned char mask, + vector unsigned char dest) +{ + return over (pix_multiply (src, mask), pix_multiply (srca, mask), dest); +} #ifdef WORDS_BIGENDIAN -#define COMPUTE_SHIFT_MASK(source) \ - source ## _mask = vec_lvsl (0, source); +#define COMPUTE_SHIFT_MASK(source) source##_mask = vec_lvsl (0, source); -#define COMPUTE_SHIFT_MASKS(dest, source) \ - source ## _mask = vec_lvsl (0, source); +#define COMPUTE_SHIFT_MASKS(dest, source) source##_mask = vec_lvsl (0, source); -#define COMPUTE_SHIFT_MASKC(dest, source, mask) \ - mask ## _mask = vec_lvsl (0, mask); \ - source ## _mask = vec_lvsl (0, source); +#define COMPUTE_SHIFT_MASKC(dest, source, mask) \ + mask##_mask = vec_lvsl (0, mask); \ + source##_mask = vec_lvsl (0, source); -#define LOAD_VECTOR(source) \ -do \ -{ \ - vector unsigned char tmp1, tmp2; \ - tmp1 = (typeof(tmp1))vec_ld (0, source); \ - tmp2 = (typeof(tmp2))vec_ld (15, source); \ - v ## source = (typeof(v ## source)) \ - vec_perm (tmp1, tmp2, source ## _mask); \ -} while (0) +#define LOAD_VECTOR(source) \ + do \ + { \ + vector unsigned char tmp1, tmp2; \ + tmp1 = (typeof (tmp1))vec_ld (0, source); \ + tmp2 = (typeof (tmp2))vec_ld (15, source); \ + v##source = (typeof (v##source))vec_perm (tmp1, tmp2, source##_mask); \ + } while (0) -#define LOAD_VECTORS(dest, source) \ -do \ -{ \ - LOAD_VECTOR(source); \ - v ## dest = (typeof(v ## dest))vec_ld (0, dest); \ -} while (0) +#define LOAD_VECTORS(dest, source) \ + do \ + { \ + LOAD_VECTOR (source); \ + v##dest = (typeof (v##dest))vec_ld (0, dest); \ + } while (0) -#define LOAD_VECTORSC(dest, source, mask) \ -do \ -{ \ - LOAD_VECTORS(dest, source); \ - LOAD_VECTOR(mask); \ -} while (0) +#define LOAD_VECTORSC(dest, source, mask) \ + do \ + { \ + LOAD_VECTORS (dest, source); \ + LOAD_VECTOR (mask); \ + } while (0) -#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask +#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask #else @@ -226,42 +224,39 @@ do \ #define COMPUTE_SHIFT_MASKC(dest, source, mask) -# define LOAD_VECTOR(source) \ - v ## source = (typeof(v ## source))vec_xl(0, source); +#define LOAD_VECTOR(source) v##source = (typeof (v##source))vec_xl (0, source); -# define LOAD_VECTORS(dest, source) \ - LOAD_VECTOR(source); \ - LOAD_VECTOR(dest); \ +#define LOAD_VECTORS(dest, source) \ + LOAD_VECTOR (source); \ + LOAD_VECTOR (dest); -# define LOAD_VECTORSC(dest, source, mask) \ - LOAD_VECTORS(dest, source); \ - LOAD_VECTOR(mask); \ +#define LOAD_VECTORSC(dest, source, mask) \ + LOAD_VECTORS (dest, source); \ + LOAD_VECTOR (mask); #define DECLARE_SRC_MASK_VAR #define DECLARE_MASK_MASK_VAR #endif /* WORDS_BIGENDIAN */ -#define LOAD_VECTORSM(dest, source, mask) \ - LOAD_VECTORSC (dest, source, mask); \ - v ## source = pix_multiply (v ## source, \ - splat_alpha 
(v ## mask)); +#define LOAD_VECTORSM(dest, source, mask) \ + LOAD_VECTORSC (dest, source, mask); \ + v##source = pix_multiply (v##source, splat_alpha (v##mask)); -#define STORE_VECTOR(dest) \ - vec_st ((vector unsigned int) v ## dest, 0, dest); +#define STORE_VECTOR(dest) vec_st ((vector unsigned int)v##dest, 0, dest); /* load 4 pixels from a 16-byte boundary aligned address */ -static force_inline vector unsigned int -load_128_aligned (const uint32_t* src) +static force_inline vector unsigned char +load_128_aligned (const uint32_t *src) { - return *((vector unsigned int *) src); + return *((vector unsigned char *)src); } /* load 4 pixels from a unaligned address */ -static force_inline vector unsigned int -load_128_unaligned (const uint32_t* src) +static force_inline vector unsigned char +load_128_unaligned (const uint32_t *src) { - vector unsigned int vsrc; + vector unsigned char vsrc; DECLARE_SRC_MASK_VAR; COMPUTE_SHIFT_MASK (src); @@ -272,143 +267,27 @@ load_128_unaligned (const uint32_t* src) /* save 4 pixels on a 16-byte boundary aligned address */ static force_inline void -save_128_aligned (uint32_t* data, - vector unsigned int vdata) +save_128_aligned (uint32_t *data, vector unsigned char vdata) { - STORE_VECTOR(data) -} - -static force_inline vector unsigned int -create_mask_32_128 (uint32_t mask) -{ - return (vector unsigned int) {mask, mask, mask, mask}; -} - -static force_inline vector unsigned int -unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2) -{ - vector unsigned char lo; - - /* unpack to short */ - lo = (vector unsigned char) -#ifdef WORDS_BIGENDIAN - vec_mergel ((vector unsigned char) data2, - (vector unsigned char) data1); -#else - vec_mergel ((vector unsigned char) data1, - (vector unsigned char) data2); -#endif - - return (vector unsigned int) lo; -} - -static force_inline vector unsigned int -unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2) -{ - vector unsigned char hi; - - /* unpack to short */ - hi = (vector unsigned char) -#ifdef WORDS_BIGENDIAN - vec_mergeh ((vector unsigned char) data2, - (vector unsigned char) data1); -#else - vec_mergeh ((vector unsigned char) data1, - (vector unsigned char) data2); -#endif - - return (vector unsigned int) hi; -} - -static force_inline vector unsigned int -unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2) -{ - vector unsigned short lo; - - /* unpack to char */ - lo = (vector unsigned short) -#ifdef WORDS_BIGENDIAN - vec_mergel ((vector unsigned short) data2, - (vector unsigned short) data1); -#else - vec_mergel ((vector unsigned short) data1, - (vector unsigned short) data2); -#endif - - return (vector unsigned int) lo; -} - -static force_inline vector unsigned int -unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2) -{ - vector unsigned short hi; - - /* unpack to char */ - hi = (vector unsigned short) -#ifdef WORDS_BIGENDIAN - vec_mergeh ((vector unsigned short) data2, - (vector unsigned short) data1); -#else - vec_mergeh ((vector unsigned short) data1, - (vector unsigned short) data2); -#endif - - return (vector unsigned int) hi; -} - -static force_inline void -unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2, - vector unsigned int* data_lo, vector unsigned int* data_hi) -{ - *data_lo = unpacklo_128_16x8(data1, data2); - *data_hi = unpackhi_128_16x8(data1, data2); -} - -static force_inline void -unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2, - vector unsigned int* data_lo, vector 
unsigned int* data_hi) -{ - *data_lo = unpacklo_128_8x16(data1, data2); - *data_hi = unpackhi_128_8x16(data1, data2); -} - -static force_inline vector unsigned int -unpack_565_to_8888 (vector unsigned int lo) -{ - vector unsigned int r, g, b, rb, t; - - r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red); - g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green); - b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue); - - rb = vec_or (r, b); - t = vec_and (rb, mask_565_fix_rb); - t = vec_sr (t, create_mask_32_128(5)); - rb = vec_or (rb, t); - - t = vec_and (g, mask_565_fix_g); - t = vec_sr (t, create_mask_32_128(6)); - g = vec_or (g, t); - - return vec_or (rb, g); + STORE_VECTOR (data) } static force_inline int -is_opaque (vector unsigned int x) +is_opaque (vector unsigned char x) { return vec_all_eq (vec_and (x, mask_ff000000), mask_ff000000); } static force_inline int -is_zero (vector unsigned int x) +is_zero (vector unsigned char x) { - return vec_all_eq (x, (vector unsigned int) AVV (0)); + return vec_all_eq (x, vzero); } static force_inline int -is_transparent (vector unsigned int x) +is_transparent (vector unsigned char x) { - return vec_all_eq (vec_and (x, mask_ff000000), (vector unsigned int) AVV (0)); + return vec_all_eq (vec_and (x, mask_ff000000), vzero); } static force_inline uint32_t @@ -416,7 +295,7 @@ core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) { uint32_t a; - a = ALPHA_8(src); + a = ALPHA_8 (src); if (a == 0xff) { @@ -424,7 +303,7 @@ core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) } else if (src) { - UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src); + UN8x4_MUL_UN8_ADD_UN8x4 (dst, (~a & MASK), src); } return dst; @@ -436,45 +315,42 @@ combine1 (const uint32_t *ps, const uint32_t *pm) uint32_t s = *ps; if (pm) - UN8x4_MUL_UN8(s, ALPHA_8(*pm)); + UN8x4_MUL_UN8 (s, ALPHA_8 (*pm)); return s; } -static force_inline vector unsigned int -combine4 (const uint32_t* ps, const uint32_t* pm) +static force_inline vector unsigned char +combine4 (const uint32_t *ps, const uint32_t *pm) { - vector unsigned int src, msk; + vector unsigned char src, msk; if (pm) { - msk = load_128_unaligned(pm); + msk = load_128_unaligned (pm); - if (is_transparent(msk)) - return (vector unsigned int) AVV(0); + if (is_transparent (msk)) + return vzero; } - src = load_128_unaligned(ps); + src = load_128_unaligned (ps); if (pm) - src = pix_multiply(src, msk); + src = pix_multiply (src, msk); return src; } static void -vmx_combine_over_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_over_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t ia = ALPHA_8 (~s); UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); @@ -486,7 +362,7 @@ vmx_combine_over_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -499,10 +375,10 @@ vmx_combine_over_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t ia = ALPHA_8 (~s); UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); @@ -512,13 +388,12 @@ vmx_combine_over_u_no_mask (uint32_t * 
dest, } static void -vmx_combine_over_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_over_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -541,7 +416,7 @@ vmx_combine_over_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -554,7 +429,7 @@ vmx_combine_over_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -572,11 +447,11 @@ vmx_combine_over_u_mask (uint32_t * dest, static void vmx_combine_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_over_u_mask (dest, src, mask, width); @@ -585,18 +460,17 @@ vmx_combine_over_u (pixman_implementation_t *imp, } static void -vmx_combine_over_reverse_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_over_reverse_u_no_mask (uint32_t *dest, + const uint32_t *src, + int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t ia = ALPHA_8 (~d); UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); @@ -607,7 +481,7 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -620,10 +494,10 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t ia = ALPHA_8 (~dest[i]); UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); @@ -632,21 +506,20 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest, } static void -vmx_combine_over_reverse_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_over_reverse_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t m = ALPHA_8 (*mask++); - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t m = ALPHA_8 (*mask++); + uint32_t s = *src++; + uint32_t d = *dest; uint32_t ia = ALPHA_8 (~d); UN8x4_MUL_UN8 (s, m); @@ -659,7 +532,7 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -673,11 +546,11 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t m = ALPHA_8 (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t m = 
ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t ia = ALPHA_8 (~dest[i]); UN8x4_MUL_UN8 (s, m); @@ -689,11 +562,11 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest, static void vmx_combine_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_over_reverse_u_mask (dest, src, mask, width); @@ -702,12 +575,9 @@ vmx_combine_over_reverse_u (pixman_implementation_t *imp, } static void -vmx_combine_in_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_in_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) @@ -723,7 +593,7 @@ vmx_combine_in_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -735,7 +605,7 @@ vmx_combine_in_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t s = src[i]; uint32_t a = ALPHA_8 (dest[i]); @@ -746,13 +616,12 @@ vmx_combine_in_u_no_mask (uint32_t * dest, } static void -vmx_combine_in_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_in_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -772,7 +641,7 @@ vmx_combine_in_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -785,7 +654,7 @@ vmx_combine_in_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -800,11 +669,11 @@ vmx_combine_in_u_mask (uint32_t * dest, static void vmx_combine_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_in_u_mask (dest, src, mask, width); @@ -813,12 +682,11 @@ vmx_combine_in_u (pixman_implementation_t *imp, } static void -vmx_combine_in_reverse_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_in_reverse_u_no_mask (uint32_t *dest, + const uint32_t *src, + int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) @@ -835,7 +703,7 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -847,7 +715,7 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t d = dest[i]; uint32_t a = ALPHA_8 (src[i]); @@ -859,13 +727,12 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest, } static void 
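/*
 * Illustrative scalar sketch of the masked "unified" (_u) combiner pattern
 * used throughout this file: the premultiplied source pixel is first scaled
 * by the mask's alpha (what LOAD_VECTORSM does four pixels at a time), and
 * only then is the Porter-Duff operator applied.  This is a sketch for
 * exposition only -- mul_un8_sketch and in_reverse_u_sketch are hypothetical
 * helpers, not pixman API.
 */
#include <stdint.h>

static inline uint8_t
mul_un8_sketch (uint8_t a, uint8_t b)
{
    /* 8-bit multiply with rounding, approximating a * b / 255 */
    uint16_t t = (uint16_t)a * b + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

static inline uint32_t
in_reverse_u_sketch (uint32_t dest, uint32_t src, uint32_t mask)
{
    /* effective source alpha after the mask: alpha(src) * alpha(mask) */
    uint8_t  sa = mul_un8_sketch (src >> 24, mask >> 24);
    uint32_t r  = 0;

    /* IN_REVERSE: every channel of dest is scaled by that alpha */
    for (int shift = 0; shift < 32; shift += 8)
        r |= (uint32_t)mul_un8_sketch ((dest >> shift) & 0xff, sa) << shift;

    return r;
}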
-vmx_combine_in_reverse_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_in_reverse_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -886,7 +753,7 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -899,7 +766,7 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t d = dest[i]; @@ -915,11 +782,11 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest, static void vmx_combine_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_in_reverse_u_mask (dest, src, mask, width); @@ -928,12 +795,9 @@ vmx_combine_in_reverse_u (pixman_implementation_t *imp, } static void -vmx_combine_out_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_out_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) @@ -950,7 +814,7 @@ vmx_combine_out_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -962,7 +826,7 @@ vmx_combine_out_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t s = src[i]; uint32_t a = ALPHA_8 (~dest[i]); @@ -974,13 +838,12 @@ vmx_combine_out_u_no_mask (uint32_t * dest, } static void -vmx_combine_out_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_out_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -1000,7 +863,7 @@ vmx_combine_out_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -1013,7 +876,7 @@ vmx_combine_out_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -1028,11 +891,11 @@ vmx_combine_out_u_mask (uint32_t * dest, static void vmx_combine_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_out_u_mask (dest, src, mask, width); @@ -1041,12 +904,11 @@ vmx_combine_out_u (pixman_implementation_t *imp, } static void -vmx_combine_out_reverse_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_out_reverse_u_no_mask (uint32_t *dest, + const 
uint32_t *src, + int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) @@ -1063,7 +925,7 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -1076,7 +938,7 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t d = dest[i]; uint32_t a = ALPHA_8 (~src[i]); @@ -1088,13 +950,12 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest, } static void -vmx_combine_out_reverse_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_out_reverse_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -1115,7 +976,7 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -1128,7 +989,7 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t d = dest[i]; @@ -1144,11 +1005,11 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest, static void vmx_combine_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_out_reverse_u_mask (dest, src, mask, width); @@ -1157,18 +1018,15 @@ vmx_combine_out_reverse_u (pixman_implementation_t *imp, } static void -vmx_combine_atop_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_atop_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t dest_a = ALPHA_8 (d); uint32_t src_ia = ALPHA_8 (~s); @@ -1181,12 +1039,12 @@ vmx_combine_atop_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vsrc, splat_alpha (vdest), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest, + splat_alpha (negate (vsrc))); STORE_VECTOR (dest); @@ -1194,10 +1052,10 @@ vmx_combine_atop_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t dest_a = ALPHA_8 (d); uint32_t src_ia = ALPHA_8 (~s); @@ -1208,21 +1066,20 @@ vmx_combine_atop_u_no_mask (uint32_t * dest, } static void -vmx_combine_atop_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_atop_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { 
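/*
 * What this combiner computes per channel of a premultiplied pixel (a sketch;
 * the exact rounding is whatever the UN8x4_* macros use).  With s' = s scaled
 * by the mask's alpha and channels normalized to [0,1], Porter-Duff ATOP is
 *
 *     dest = s' * alpha(dest) + dest * (1 - alpha(s'))
 *
 * which is what the pix_add_mul (vsrc, splat_alpha (vdest), vdest,
 * splat_alpha (negate (vsrc))) call in the vector loop below evaluates,
 * and what the scalar head and tail loops compute one pixel at a time.
 */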
- int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t m = ALPHA_8 (*mask++); - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t m = ALPHA_8 (*mask++); + uint32_t s = *src++; + uint32_t d = *dest; uint32_t dest_a = ALPHA_8 (d); uint32_t src_ia; @@ -1239,12 +1096,12 @@ vmx_combine_atop_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vsrc, splat_alpha (vdest), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest, + splat_alpha (negate (vsrc))); STORE_VECTOR (dest); @@ -1253,11 +1110,11 @@ vmx_combine_atop_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t m = ALPHA_8 (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t dest_a = ALPHA_8 (d); uint32_t src_ia; @@ -1273,11 +1130,11 @@ vmx_combine_atop_u_mask (uint32_t * dest, static void vmx_combine_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_atop_u_mask (dest, src, mask, width); @@ -1286,19 +1143,18 @@ vmx_combine_atop_u (pixman_implementation_t *imp, } static void -vmx_combine_atop_reverse_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_atop_reverse_u_no_mask (uint32_t *dest, + const uint32_t *src, + int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t s = *src++; - uint32_t d = *dest; - uint32_t src_a = ALPHA_8 (s); + uint32_t s = *src++; + uint32_t d = *dest; + uint32_t src_a = ALPHA_8 (s); uint32_t dest_ia = ALPHA_8 (~d); UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); @@ -1310,12 +1166,12 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vdest, splat_alpha (vsrc), - vsrc, splat_alpha (negate (vdest))); + vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc, + splat_alpha (negate (vdest))); STORE_VECTOR (dest); @@ -1323,11 +1179,11 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_a = ALPHA_8 (s); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_a = ALPHA_8 (s); uint32_t dest_ia = ALPHA_8 (~d); UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); @@ -1337,13 +1193,12 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest, } static void -vmx_combine_atop_reverse_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_atop_reverse_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; 
DECLARE_MASK_MASK_VAR; @@ -1368,12 +1223,12 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vdest, splat_alpha (vsrc), - vsrc, splat_alpha (negate (vdest))); + vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc, + splat_alpha (negate (vdest))); STORE_VECTOR (dest); @@ -1382,7 +1237,7 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -1402,11 +1257,11 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest, static void vmx_combine_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_atop_reverse_u_mask (dest, src, mask, width); @@ -1415,19 +1270,16 @@ vmx_combine_atop_reverse_u (pixman_implementation_t *imp, } static void -vmx_combine_xor_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_xor_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t s = *src++; - uint32_t d = *dest; - uint32_t src_ia = ALPHA_8 (~s); + uint32_t s = *src++; + uint32_t d = *dest; + uint32_t src_ia = ALPHA_8 (~s); uint32_t dest_ia = ALPHA_8 (~d); UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); @@ -1439,12 +1291,12 @@ vmx_combine_xor_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest, + splat_alpha (negate (vsrc))); STORE_VECTOR (dest); @@ -1452,11 +1304,11 @@ vmx_combine_xor_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_ia = ALPHA_8 (~s); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_ia = ALPHA_8 (~s); uint32_t dest_ia = ALPHA_8 (~d); UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); @@ -1466,13 +1318,12 @@ vmx_combine_xor_u_no_mask (uint32_t * dest, } static void -vmx_combine_xor_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_xor_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -1497,12 +1348,12 @@ vmx_combine_xor_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest, + splat_alpha (negate (vsrc))); STORE_VECTOR (dest); @@ -1511,7 +1362,7 @@ 
vmx_combine_xor_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -1531,11 +1382,11 @@ vmx_combine_xor_u_mask (uint32_t * dest, static void vmx_combine_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_xor_u_mask (dest, src, mask, width); @@ -1544,12 +1395,9 @@ vmx_combine_xor_u (pixman_implementation_t *imp, } static void -vmx_combine_add_u_no_mask (uint32_t * dest, - const uint32_t *src, - int width) +vmx_combine_add_u_no_mask (uint32_t *dest, const uint32_t *src, int width) { - int i; - vector unsigned int vdest, vsrc; + vector unsigned char vdest, vsrc; DECLARE_SRC_MASK_VAR; while (width && ((uintptr_t)dest & 15)) @@ -1565,7 +1413,7 @@ vmx_combine_add_u_no_mask (uint32_t * dest, COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORS (dest, src); @@ -1577,7 +1425,7 @@ vmx_combine_add_u_no_mask (uint32_t * dest, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t s = src[i]; uint32_t d = dest[i]; @@ -1589,13 +1437,12 @@ vmx_combine_add_u_no_mask (uint32_t * dest, } static void -vmx_combine_add_u_mask (uint32_t * dest, - const uint32_t *src, - const uint32_t *mask, - int width) +vmx_combine_add_u_mask (uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -1615,7 +1462,7 @@ vmx_combine_add_u_mask (uint32_t * dest, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSM (dest, src, mask); @@ -1628,7 +1475,7 @@ vmx_combine_add_u_mask (uint32_t * dest, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t m = ALPHA_8 (mask[i]); uint32_t s = src[i]; @@ -1643,11 +1490,11 @@ vmx_combine_add_u_mask (uint32_t * dest, static void vmx_combine_add_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { if (mask) vmx_combine_add_u_mask (dest, src, mask, width); @@ -1657,14 +1504,13 @@ vmx_combine_add_u (pixman_implementation_t *imp, static void vmx_combine_src_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -1682,7 +1528,7 @@ vmx_combine_src_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -1695,7 +1541,7 @@ vmx_combine_src_ca (pixman_implementation_t *imp, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t a = 
mask[i]; uint32_t s = src[i]; @@ -1708,22 +1554,21 @@ vmx_combine_src_ca (pixman_implementation_t *imp, static void vmx_combine_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (s); UN8x4_MUL_UN8x4 (s, a); @@ -1737,7 +1582,7 @@ vmx_combine_over_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -1750,11 +1595,11 @@ vmx_combine_over_ca (pixman_implementation_t *imp, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (s); UN8x4_MUL_UN8x4 (s, a); @@ -1767,22 +1612,21 @@ vmx_combine_over_ca (pixman_implementation_t *imp, static void vmx_combine_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t ida = ALPHA_8 (~d); UN8x4_MUL_UN8x4 (s, a); @@ -1795,7 +1639,7 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -1808,11 +1652,11 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp, dest += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t ida = ALPHA_8 (~d); UN8x4_MUL_UN8x4 (s, a); @@ -1824,21 +1668,20 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp, static void vmx_combine_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; + uint32_t a = *mask++; + uint32_t s = *src++; uint32_t da = ALPHA_8 (*dest); UN8x4_MUL_UN8x4 (s, a); @@ -1851,7 +1694,7 @@ vmx_combine_in_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) 
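/*
 * Component-alpha (_ca) note, per channel c of a premultiplied pixel, with
 * channels normalized to [0,1] (a sketch; rounding as in the UN8x4_* macros).
 * Unlike the _u combiners, the mask here carries one alpha per channel, so
 * OVER becomes
 *
 *     dest.c = src.c * mask.c + dest.c * (1 - mask.c * alpha(src))
 *
 * the same computation the in_over () helper near the top of this file
 * expresses in vector form, four pixels at a time.
 */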
+ for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -1864,10 +1707,10 @@ vmx_combine_in_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; uint32_t da = ALPHA_8 (dest[i]); UN8x4_MUL_UN8x4 (s, a); @@ -1879,21 +1722,20 @@ vmx_combine_in_ca (pixman_implementation_t *imp, static void vmx_combine_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (*src++); UN8x4_MUL_UN8 (a, sa); @@ -1906,7 +1748,7 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -1920,10 +1762,10 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (src[i]); UN8x4_MUL_UN8 (a, sa); @@ -1935,22 +1777,21 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp, static void vmx_combine_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t da = ALPHA_8 (~d); UN8x4_MUL_UN8x4 (s, a); @@ -1963,12 +1804,12 @@ vmx_combine_out_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply ( - pix_multiply (vsrc, vmask), splat_alpha (negate (vdest))); + vdest = pix_multiply (pix_multiply (vsrc, vmask), + splat_alpha (negate (vdest))); STORE_VECTOR (dest); @@ -1977,11 +1818,11 @@ vmx_combine_out_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t da = ALPHA_8 (~d); UN8x4_MUL_UN8x4 (s, a); @@ -1993,22 +1834,21 @@ vmx_combine_out_ca (pixman_implementation_t *imp, static void vmx_combine_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector 
unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (s); UN8x4_MUL_UN8 (a, sa); @@ -2021,7 +1861,7 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -2035,11 +1875,11 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (s); UN8x4_MUL_UN8 (a, sa); @@ -2051,22 +1891,21 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp, static void vmx_combine_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask, vsrca; + vector unsigned char vdest, vsrc, vmask, vsrca; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (d); @@ -2081,17 +1920,16 @@ vmx_combine_atop_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); vsrca = splat_alpha (vsrc); - vsrc = pix_multiply (vsrc, vmask); + vsrc = pix_multiply (vsrc, vmask); vmask = pix_multiply (vmask, vsrca); - vdest = pix_add_mul (vsrc, splat_alpha (vdest), - negate (vmask), vdest); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), negate (vmask), vdest); STORE_VECTOR (dest); @@ -2100,11 +1938,11 @@ vmx_combine_atop_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (d); @@ -2118,22 +1956,21 @@ vmx_combine_atop_ca (pixman_implementation_t *imp, static void vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (~d); @@ -2148,12 +1985,11 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { 
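/*
 * Sketch of the next few statements, with channels normalized to [0,1]
 * and a per-channel mask m (component alpha):
 *
 *     dest.c = dest.c * (m.c * alpha(src)) + (src.c * m.c) * (1 - alpha(dest))
 *
 * pix_add_mul (x1, a1, x2, a2) evaluates x1*a1 + x2*a2 channel-wise, so the
 * call below passes vdest, vmask*alpha(vsrc), vsrc*vmask and ~alpha(vdest)
 * to get ATOP_REVERSE under a component mask.
 */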
LOAD_VECTORSC (dest, src, mask); - vdest = pix_add_mul (vdest, - pix_multiply (vmask, splat_alpha (vsrc)), + vdest = pix_add_mul (vdest, pix_multiply (vmask, splat_alpha (vsrc)), pix_multiply (vsrc, vmask), negate (splat_alpha (vdest))); @@ -2164,11 +2000,11 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (~d); @@ -2182,22 +2018,21 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, static void vmx_combine_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; while (width && ((uintptr_t)dest & 15)) { - uint32_t a = *mask++; - uint32_t s = *src++; - uint32_t d = *dest; + uint32_t a = *mask++; + uint32_t s = *src++; + uint32_t d = *dest; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (~d); @@ -2212,14 +2047,13 @@ vmx_combine_xor_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); - vdest = pix_add_mul (vdest, - negate (pix_multiply (vmask, splat_alpha (vsrc))), - pix_multiply (vsrc, vmask), - negate (splat_alpha (vdest))); + vdest = pix_add_mul ( + vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))), + pix_multiply (vsrc, vmask), negate (splat_alpha (vdest))); STORE_VECTOR (dest); @@ -2228,11 +2062,11 @@ vmx_combine_xor_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; uint32_t sa = ALPHA_8 (s); uint32_t da = ALPHA_8 (~d); @@ -2246,14 +2080,13 @@ vmx_combine_xor_ca (pixman_implementation_t *imp, static void vmx_combine_add_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) + pixman_op_t op, + uint32_t *dest, + const uint32_t *src, + const uint32_t *mask, + int width) { - int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned char vdest, vsrc, vmask; DECLARE_SRC_MASK_VAR; DECLARE_MASK_MASK_VAR; @@ -2273,7 +2106,7 @@ vmx_combine_add_ca (pixman_implementation_t *imp, COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width / 4; i > 0; i--) + for (int i = width / 4; i > 0; i--) { LOAD_VECTORSC (dest, src, mask); @@ -2286,7 +2119,7 @@ vmx_combine_add_ca (pixman_implementation_t *imp, mask += 4; } - for (i = width % 4; --i >= 0;) + for (int i = width % 4; --i >= 0;) { uint32_t a = mask[i]; uint32_t s = src[i]; @@ -2301,36 +2134,36 @@ vmx_combine_add_ca (pixman_implementation_t *imp, static void vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_composite_info_t *info) + pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); - uint32_t src, srca; + uint32_t src, srca; uint32_t *dst_line, *dst; - uint8_t *mask_line; - int dst_stride, mask_stride; - 
int32_t w; - uint32_t m, d, s, ia; + uint8_t *mask_line; + int dst_stride, mask_stride; + int32_t w; + uint32_t m, d, s, ia; - vector unsigned int vsrc, valpha, vmask, vdst; + vector unsigned char vsrc, valpha, vmask, vdst; src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = ALPHA_8(src); + srca = ALPHA_8 (src); if (src == 0) return; - PIXMAN_IMAGE_GET_LINE ( - dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, + mask_line, 1); - vsrc = (vector unsigned int) {src, src, src, src}; - valpha = splat_alpha(vsrc); + vsrc = (vector unsigned char)create_mask_32_128 (src); + valpha = splat_alpha (vsrc); while (height--) { const uint8_t *pm = mask_line; - dst = dst_line; + dst = dst_line; dst_line += dst_stride; mask_line += mask_stride; w = width; @@ -2355,20 +2188,21 @@ vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, while (w >= 4) { - m = *((uint32_t*)pm); + m = *((uint32_t *)pm); if (srca == 0xff && m == 0xffffffff) { - save_128_aligned(dst, vsrc); + save_128_aligned (dst, vsrc); } else if (m) { - vmask = splat_pixel((vector unsigned int) {m, m, m, m}); + vmask = splat_pixel ( + (vector unsigned char)create_mask_32_128 (m)); /* dst is 16-byte aligned */ vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst)); - save_128_aligned(dst, vdst); + save_128_aligned (dst, vdst); } w -= 4; @@ -2394,19 +2228,18 @@ vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, dst++; } } - } static pixman_bool_t vmx_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t filler) + uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t filler) { uint32_t byte_width; uint8_t *byte_line; @@ -2415,31 +2248,31 @@ vmx_fill (pixman_implementation_t *imp, if (bpp == 8) { - uint8_t b; + uint8_t b; uint16_t w; - stride = stride * (int) sizeof (uint32_t) / 1; - byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + stride = stride * (int)sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); byte_width = width; stride *= 1; - b = filler & 0xff; - w = (b << 8) | b; + b = filler & 0xff; + w = (b << 8) | b; filler = (w << 16) | w; } else if (bpp == 16) { - stride = stride * (int) sizeof (uint32_t) / 2; - byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + stride = stride * (int)sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; - filler = (filler & 0xffff) * 0x00010001; + filler = (filler & 0xffff) * 0x00010001; } else if (bpp == 32) { - stride = stride * (int) sizeof (uint32_t) / 4; - byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + stride = stride * (int)sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); byte_width = 4 * width; stride *= 4; } @@ -2448,11 +2281,11 @@ vmx_fill (pixman_implementation_t *imp, return FALSE; } - vfiller = create_mask_32_128(filler); + vfiller = create_mask_32_128 (filler); while (height--) { - int w; + int w; uint8_t *d = byte_line; byte_line += stride; w = byte_width; @@ -2481,14 +2314,14 @@ vmx_fill (pixman_implementation_t *imp, while (w >= 128) { - vec_st(vfiller, 0, (uint32_t *) d); - 
vec_st(vfiller, 0, (uint32_t *) d + 4); - vec_st(vfiller, 0, (uint32_t *) d + 8); - vec_st(vfiller, 0, (uint32_t *) d + 12); - vec_st(vfiller, 0, (uint32_t *) d + 16); - vec_st(vfiller, 0, (uint32_t *) d + 20); - vec_st(vfiller, 0, (uint32_t *) d + 24); - vec_st(vfiller, 0, (uint32_t *) d + 28); + vec_st (vfiller, 0, (uint32_t *)d); + vec_st (vfiller, 0, (uint32_t *)d + 4); + vec_st (vfiller, 0, (uint32_t *)d + 8); + vec_st (vfiller, 0, (uint32_t *)d + 12); + vec_st (vfiller, 0, (uint32_t *)d + 16); + vec_st (vfiller, 0, (uint32_t *)d + 20); + vec_st (vfiller, 0, (uint32_t *)d + 24); + vec_st (vfiller, 0, (uint32_t *)d + 28); d += 128; w -= 128; @@ -2496,10 +2329,10 @@ vmx_fill (pixman_implementation_t *imp, if (w >= 64) { - vec_st(vfiller, 0, (uint32_t *) d); - vec_st(vfiller, 0, (uint32_t *) d + 4); - vec_st(vfiller, 0, (uint32_t *) d + 8); - vec_st(vfiller, 0, (uint32_t *) d + 12); + vec_st (vfiller, 0, (uint32_t *)d); + vec_st (vfiller, 0, (uint32_t *)d + 4); + vec_st (vfiller, 0, (uint32_t *)d + 8); + vec_st (vfiller, 0, (uint32_t *)d + 12); d += 64; w -= 64; @@ -2507,8 +2340,8 @@ vmx_fill (pixman_implementation_t *imp, if (w >= 32) { - vec_st(vfiller, 0, (uint32_t *) d); - vec_st(vfiller, 0, (uint32_t *) d + 4); + vec_st (vfiller, 0, (uint32_t *)d); + vec_st (vfiller, 0, (uint32_t *)d + 4); d += 32; w -= 32; @@ -2516,7 +2349,7 @@ vmx_fill (pixman_implementation_t *imp, if (w >= 16) { - vec_st(vfiller, 0, (uint32_t *) d); + vec_st (vfiller, 0, (uint32_t *)d); d += 16; w -= 16; @@ -2550,18 +2383,18 @@ vmx_fill (pixman_implementation_t *imp, static void vmx_composite_src_x888_8888 (pixman_implementation_t *imp, - pixman_composite_info_t *info) + pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; - int32_t w; - int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; - PIXMAN_IMAGE_GET_LINE ( - dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); while (height--) { @@ -2579,7 +2412,7 @@ vmx_composite_src_x888_8888 (pixman_implementation_t *imp, while (w >= 16) { - vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4; + vector unsigned char vmx_src1, vmx_src2, vmx_src3, vmx_src4; vmx_src1 = load_128_unaligned (src); vmx_src2 = load_128_unaligned (src + 4); @@ -2606,25 +2439,26 @@ vmx_composite_src_x888_8888 (pixman_implementation_t *imp, static void vmx_composite_over_n_8888 (pixman_implementation_t *imp, - pixman_composite_info_t *info) + pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; - uint32_t src, ia; - int i, w, dst_stride; - vector unsigned int vdst, vsrc, via; + uint32_t src, ia; + int w, dst_stride; + + vector unsigned char vdst, vsrc, via; src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; - PIXMAN_IMAGE_GET_LINE ( - dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); - vsrc = (vector unsigned int){src, src, src, src}; - via = negate (splat_alpha (vsrc)); - ia = ALPHA_8 (~src); + vsrc = (vector unsigned char)create_mask_32_128 (src); + via = negate (splat_alpha (vsrc)); + ia = 
ALPHA_8 (~src); while (height--) { @@ -2640,14 +2474,14 @@ vmx_composite_over_n_8888 (pixman_implementation_t *imp, w--; } - for (i = w / 4; i > 0; i--) + for (int i = w / 4; i > 0; i--) { vdst = pix_multiply (load_128_aligned (dst), via); save_128_aligned (dst, pix_add (vsrc, vdst)); dst += 4; } - for (i = w % 4; --i >= 0;) + for (int i = w % 4; --i >= 0;) { uint32_t d = dst[i]; UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src); @@ -2658,63 +2492,63 @@ vmx_composite_over_n_8888 (pixman_implementation_t *imp, static void vmx_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_composite_info_t *info) + pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); - int dst_stride, src_stride; - uint32_t *dst_line, *dst; - uint32_t *src_line, *src; + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; - PIXMAN_IMAGE_GET_LINE ( - dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, + src_line, 1); dst = dst_line; src = src_line; while (height--) { - vmx_combine_over_u (imp, op, dst, src, NULL, width); + vmx_combine_over_u (imp, op, dst, src, NULL, width); - dst += dst_stride; - src += src_stride; + dst += dst_stride; + src += src_stride; } } static void vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_composite_info_t *info) + pixman_composite_info_t *info) { PIXMAN_COMPOSITE_ARGS (info); - uint32_t src, ia; - uint32_t *dst_line, d; - uint32_t *mask_line, m; - uint32_t pack_cmp; - int dst_stride, mask_stride; + uint32_t src, ia; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; - vector unsigned int vsrc, valpha, vmask, vdest; + vector unsigned char vsrc, valpha, vmask, vdest; src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; - PIXMAN_IMAGE_GET_LINE ( - dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - PIXMAN_IMAGE_GET_LINE ( - mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, + dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, + mask_line, 1); - vsrc = (vector unsigned int) {src, src, src, src}; - valpha = splat_alpha(vsrc); - ia = ALPHA_8 (src); + vsrc = (vector unsigned char)create_mask_32_128 (src); + valpha = splat_alpha (vsrc); + ia = ALPHA_8 (src); while (height--) { - int w = width; + int w = width; const uint32_t *pm = (uint32_t *)mask_line; - uint32_t *pd = (uint32_t *)dst_line; - uint32_t s; + uint32_t *pd = (uint32_t *)dst_line; + uint32_t s; dst_line += dst_stride; mask_line += mask_stride; @@ -2743,7 +2577,7 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, /* pm is NOT necessarily 16-byte aligned */ vmask = load_128_unaligned (pm); - pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0)); + pack_cmp = vec_all_eq (vmask, vzero); /* if all bits in mask are zero, pack_cmp is not 0 */ if (pack_cmp == 0) @@ -2751,7 +2585,7 @@ vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, /* pd is 16-byte aligned */ vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd)); - save_128_aligned(pd, vdest); + save_128_aligned (pd, vdest); } pd += 4; @@ -2782,19 +2616,19 @@ 
vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 static void
 vmx_composite_add_8_8 (pixman_implementation_t *imp,
-                       pixman_composite_info_t *info)
+                       pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    uint8_t *dst_line, *dst;
-    uint8_t *src_line, *src;
-    int dst_stride, src_stride;
-    int32_t w;
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
     uint16_t t;

-    PIXMAN_IMAGE_GET_LINE (
-        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
+                           src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
+                           dst_line, 1);

     while (height--)
     {
@@ -2808,13 +2642,13 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,
         /* Small head */
         while (w && (uintptr_t)dst & 3)
         {
-            t = (*dst) + (*src++);
+            t = (*dst) + (*src++);
             *dst++ = t | (0 - (t >> 8));
             w--;
         }

-        vmx_combine_add_u (imp, op,
-                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+        vmx_combine_add_u (imp, op, (uint32_t *)dst, (uint32_t *)src, NULL,
+                           w >> 2);

         /* Small tail */
         dst += w & 0xfffc;
@@ -2824,7 +2658,7 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,

         while (w)
         {
-            t = (*dst) + (*src++);
+            t = (*dst) + (*src++);
             *dst++ = t | (0 - (t >> 8));
             w--;
         }
@@ -2833,17 +2667,17 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,

 static void
 vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
-                             pixman_composite_info_t *info)
+                             pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src;
-    int dst_stride, src_stride;
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    int dst_stride, src_stride;

-    PIXMAN_IMAGE_GET_LINE (
-        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+                           src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+                           dst_line, 1);

     while (height--)
     {
@@ -2857,18 +2691,18 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 }

 static force_inline void
-scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
-                                            const uint32_t* ps,
-                                            int32_t w,
-                                            pixman_fixed_t vx,
-                                            pixman_fixed_t unit_x,
-                                            pixman_fixed_t src_width_fixed,
-                                            pixman_bool_t fully_transparent_src)
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t *pd,
+                                            const uint32_t *ps,
+                                            int32_t w,
+                                            pixman_fixed_t vx,
+                                            pixman_fixed_t unit_x,
+                                            pixman_fixed_t src_width_fixed,
+                                            pixman_bool_t fully_transparent_src)
 {
-    uint32_t s, d;
-    const uint32_t* pm = NULL;
+    uint32_t s, d;
+    const uint32_t *pm = NULL;

-    vector unsigned int vsrc, vdst;
+    vector unsigned char vsrc, vdst;

     if (fully_transparent_src)
         return;
@@ -2917,7 +2751,7 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
         }
         else if (!is_zero (vsrc))
         {
-            vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
+            vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd));

             save_128_aligned (pd, vdst);
         }
@@ -2944,6 +2778,7 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
     }
 }

+/* clang-format off */
 FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
                        scaled_nearest_scanline_vmx_8888_8888_OVER,
                        uint32_t, uint32_t, COVER)
@@ -2990,12 +2825,12 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     { PIXMAN_OP_NONE },
 };
+/* clang-format on */

 static uint32_t *
 vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
-    int w = iter->width;
-    vector unsigned int ff000000 = mask_ff000000;
+    int w = iter->width;
     uint32_t *dst = iter->buffer;
     uint32_t *src = (uint32_t *)iter->bits;

@@ -3009,7 +2844,8 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)

     while (w >= 4)
     {
-        save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+        save_128_aligned (dst,
+                          vec_or (load_128_unaligned (src), mask_ff000000));

         dst += 4;
         src += 4;
@@ -3028,31 +2864,32 @@
 static uint32_t *
 vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
 {
-    int w = iter->width;
+    int w = iter->width;
     uint32_t *dst = iter->buffer;
-    uint8_t *src = iter->bits;
-    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+    uint8_t *src = iter->bits;
+
+    vector unsigned char vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;

     iter->bits += iter->stride;

     while (w && (((uintptr_t)dst) & 15))
     {
-        *dst++ = *(src++) << 24;
-        w--;
+        *dst++ = *(src++) << 24;
+        w--;
     }

     while (w >= 16)
     {
-        vmx0 = load_128_unaligned((uint32_t *) src);
+        vmx0 = load_128_unaligned ((uint32_t *)src);

-        unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
-        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
-        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+        unpack_128_2x128 (vzero, vmx0, &vmx1, &vmx2);
+        unpack_128_2x128 (vzero, vmx1, &vmx3, &vmx4);
+        unpack_128_2x128 (vzero, vmx2, &vmx5, &vmx6);

-        save_128_aligned(dst, vmx6);
-        save_128_aligned((dst + 4), vmx5);
-        save_128_aligned((dst + 8), vmx4);
-        save_128_aligned((dst + 12), vmx3);
+        save_128_aligned (dst, vmx6);
+        save_128_aligned ((dst + 4), vmx5);
+        save_128_aligned ((dst + 8), vmx4);
+        save_128_aligned ((dst + 12), vmx3);

         dst += 16;
         src += 16;
@@ -3068,10 +2905,11 @@ vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }

-#define IMAGE_FLAGS \
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
+#define IMAGE_FLAGS \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

+/* clang-format off */
 static const pixman_iter_info_t vmx_iters[] =
 {
     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
       _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
     },
     { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
       _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
     },
     { PIXMAN_null },
 };
+/* clang-format on */

 pixman_implementation_t *
 _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 {
-    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+    pixman_implementation_t *imp = _pixman_implementation_create (
+        fallback, vmx_fast_paths);

     /* VMX constants */
-    mask_ff000000 = create_mask_32_128 (0xff000000);
-    mask_red = create_mask_32_128 (0x00f80000);
-    mask_green = create_mask_32_128 (0x0000fc00);
-    mask_blue = create_mask_32_128 (0x000000f8);
-    mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
-    mask_565_fix_g = create_mask_32_128 (0x0000c000);
+    mask_ff000000 = (vector unsigned char)create_mask_32_128 (0xff000000);

     /* Set up function pointers */
-    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
-    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
-    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
-    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
-    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
-    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
-    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
     imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

-    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
-    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
-    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

     imp->fill = vmx_fill;
diff --git a/gfx/cairo/libpixman/src/pixman.c b/gfx/cairo/libpixman/src/pixman.c
index 26e500868ed7..1d77f8ef861f 100644
--- a/gfx/cairo/libpixman/src/pixman.c
+++ b/gfx/cairo/libpixman/src/pixman.c
@@ -739,6 +739,24 @@ pixman_image_composite (pixman_op_t op,
                             mask_x, mask_y, dest_x, dest_y, width, height);
 }

+PIXMAN_EXPORT void
+pixman_image_composite64f (pixman_op_t op,
+                           pixman_image_t * src,
+                           pixman_image_t * mask,
+                           pixman_image_t * dest,
+                           double src_x,
+                           double src_y,
+                           double mask_x,
+                           double mask_y,
+                           double dest_x,
+                           double dest_y,
+                           double width,
+                           double height)
+{
+    pixman_image_composite32 (op, src, mask, dest, src_x, src_y,
+                              mask_x, mask_y, dest_x, dest_y, width, height);
+}
+
 PIXMAN_EXPORT pixman_bool_t
 pixman_blt (uint32_t *src_bits,
             uint32_t *dst_bits,
@@ -854,7 +872,7 @@ pixman_image_fill_rectangles (pixman_op_t op,
                               int n_rects,
                               const pixman_rectangle16_t *rects)
 {
-    pixman_box32_t stack_boxes[6];
+    pixman_box32_t stack_boxes[6] = {0};
     pixman_box32_t *boxes;
     pixman_bool_t result;
     int i;
diff --git a/gfx/cairo/libpixman/src/pixman.h b/gfx/cairo/libpixman/src/pixman.h
index a5c51934ca4e..ad3220ee6f1f 100644
--- a/gfx/cairo/libpixman/src/pixman.h
+++ b/gfx/cairo/libpixman/src/pixman.h
@@ -758,6 +758,173 @@ void pixman_region32_reset (pixman_region32_t
 PIXMAN_API
 void pixman_region32_clear (pixman_region32_t *region);

+/*
+ * 64 bit fractional regions
+ */
+typedef struct pixman_region64f_data pixman_region64f_data_t;
+typedef struct pixman_box64f pixman_box64f_t;
+typedef struct pixman_rectangle64f pixman_rectangle64f_t;
+typedef struct pixman_region64f pixman_region64f_t;
+
+struct pixman_region64f_data {
+    long size;
+    long numRects;
+/*  pixman_box64f_t rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle64f
+{
+    double x, y;
+    double width, height;
+};
+
+struct pixman_box64f
+{
+    double x1, y1, x2, y2;
+};
+
+struct pixman_region64f
+{
+    pixman_box64f_t extents;
+    pixman_region64f_data_t *data;
+};
+
+/* creation/destruction */
+PIXMAN_API
+void pixman_region64f_init (pixman_region64f_t *region);
+
+PIXMAN_API
+void pixman_region64f_init_rect (pixman_region64f_t *region,
+                                 int x,
+                                 int y,
+                                 unsigned int width,
+                                 unsigned int height);
+
+PIXMAN_API
+void pixman_region64f_init_rectf (pixman_region64f_t *region,
+                                  double x,
+                                  double y,
+                                  double width,
+                                  double height);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_init_rects (pixman_region64f_t *region,
+                                           const pixman_box64f_t *boxes,
+                                           int count);
+
+PIXMAN_API
+void pixman_region64f_init_with_extents (pixman_region64f_t *region,
+                                         const pixman_box64f_t *extents);
+
+PIXMAN_API
+void pixman_region64f_init_from_image (pixman_region64f_t *region,
+                                       pixman_image_t *image);
+
+PIXMAN_API
+void pixman_region64f_fini (pixman_region64f_t *region);
+
+
+/* manipulation */
+PIXMAN_API
+void pixman_region64f_translate (pixman_region64f_t *region,
+                                 int x,
+                                 int y);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_copy (pixman_region64f_t *dest,
+                                     const pixman_region64f_t *source);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_intersect (pixman_region64f_t *new_reg,
+                                          const pixman_region64f_t *reg1,
+                                          const pixman_region64f_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_union (pixman_region64f_t *new_reg,
+                                      const pixman_region64f_t *reg1,
+                                      const pixman_region64f_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_intersect_rect (pixman_region64f_t *dest,
+                                               const pixman_region64f_t *source,
+                                               int x,
+                                               int y,
+                                               unsigned int width,
+                                               unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_intersect_rectf (pixman_region64f_t *dest,
+                                                const pixman_region64f_t *source,
+                                                double x,
+                                                double y,
+                                                double width,
+                                                double height);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_union_rect (pixman_region64f_t *dest,
+                                           const pixman_region64f_t *source,
+                                           int x,
+                                           int y,
+                                           unsigned int width,
+                                           unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_union_rectf (pixman_region64f_t *dest,
+                                            const pixman_region64f_t *source,
+                                            double x,
+                                            double y,
+                                            double width,
+                                            double height);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_subtract (pixman_region64f_t *reg_d,
+                                         const pixman_region64f_t *reg_m,
+                                         const pixman_region64f_t *reg_s);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_inverse (pixman_region64f_t *new_reg,
+                                        const pixman_region64f_t *reg1,
+                                        const pixman_box64f_t *inv_rect);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_contains_point (const pixman_region64f_t *region,
+                                               int x,
+                                               int y,
+                                               pixman_box64f_t *box);
+
+PIXMAN_API
+pixman_region_overlap_t pixman_region64f_contains_rectangle(const pixman_region64f_t *region,
+                                                            const pixman_box64f_t *prect);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_empty (const pixman_region64f_t *region);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_not_empty (const pixman_region64f_t *region);
+
+PIXMAN_API
+pixman_box64f_t * pixman_region64f_extents (const pixman_region64f_t *region);
+
+PIXMAN_API
+int pixman_region64f_n_rects (const pixman_region64f_t *region);
+
+PIXMAN_API
+pixman_box64f_t * pixman_region64f_rectangles (const pixman_region64f_t *region,
+                                               int *n_rects);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_equal (const pixman_region64f_t *region1,
+                                      const pixman_region64f_t *region2);
+
+PIXMAN_API
+pixman_bool_t pixman_region64f_selfcheck (pixman_region64f_t *region);
+
+PIXMAN_API
+void pixman_region64f_reset (pixman_region64f_t *region,
+                             const pixman_box64f_t *box);
+
+PIXMAN_API
+void pixman_region64f_clear (pixman_region64f_t *region);

 /* Copy / Fill / Misc */
@@ -884,6 +1051,10 @@ typedef enum {
 /* 96bpp formats */
     PIXMAN_rgb_float = PIXMAN_FORMAT_BYTE(96,PIXMAN_TYPE_RGBA_FLOAT,0,32,32,32),

+/* 64bpp formats */
+    /* [63:0] A:B:G:R 16:16:16:16 native endian */
+    PIXMAN_a16b16g16r16 = PIXMAN_FORMAT_BYTE(64,PIXMAN_TYPE_ABGR,16,16,16,16),
+
 /* 32bpp formats */
     PIXMAN_a8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
     PIXMAN_x8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
@@ -1025,6 +1196,10 @@ PIXMAN_API
 pixman_bool_t pixman_image_set_clip_region32 (pixman_image_t *image,
                                               const pixman_region32_t *region);

+PIXMAN_API
+pixman_bool_t pixman_image_set_clip_region64f (pixman_image_t *image,
+                                               const pixman_region64f_t *region);
+
 PIXMAN_API
 void pixman_image_set_has_client_clip (pixman_image_t *image,
                                        pixman_bool_t clien_clip);
@@ -1181,6 +1356,20 @@ void pixman_image_composite32 (pixman_op_t op,
                                int32_t width,
                                int32_t height);

+PIXMAN_API
+void pixman_image_composite64f (pixman_op_t op,
+                                pixman_image_t *src,
+                                pixman_image_t *mask,
+                                pixman_image_t *dest,
+                                double src_x,
+                                double src_y,
+                                double mask_x,
+                                double mask_y,
+                                double dest_x,
+                                double dest_y,
+                                double width,
+                                double height);
+
 /* Executive Summary: This function is a no-op that only exists
  * for historical reasons.
  *
diff --git a/gfx/cairo/pixman-neon.patch b/gfx/cairo/pixman-neon.patch
deleted file mode 100644
index 049b8b7105f7..000000000000
--- a/gfx/cairo/pixman-neon.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
-index 6bd27360aa027..cd33babca1e0c 100644
---- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
-+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
-@@ -55,9 +55,9 @@
- #endif
- 
- .text
--.fpu neon
- .arch armv7a
- .object_arch armv4
-+.fpu neon
- .eabi_attribute 10, 0
- .eabi_attribute 12, 0
- .arm
-diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
-index 0e092577f1c73..c04b335d1e5bd 100644
---- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
-+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
-@@ -40,9 +40,9 @@
- #endif
- 
- .text
-- .fpu neon
- .arch armv7a
- .object_arch armv4
-+ .fpu neon
- .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
- .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
- .arm