Adding in curl and openssl repos

2025-08-14 12:09:30 -04:00
parent af2117b574
commit 0ace93e303
21174 changed files with 3607720 additions and 2 deletions


@@ -0,0 +1,320 @@
#! /usr/bin/env perl
# Copyright 1995-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Normal generates the single-block version
#	md5_block_x86(MD5_CTX *c, ULONG *X);
# non-normal generates the looping multi-block version
#	md5_block_x86(MD5_CTX *c, ULONG *X, int blocks);
$normal=0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$output and open STDOUT,">$output";
&asm_init($ARGV[0]);
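# Typical invocation (illustrative; the flavour strings are defined in
# perlasm/x86asm.pl): perl <this script> <flavour> [<output>.s],
# e.g. with flavour "elf".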
$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$tmp1="edi";
$tmp2="ebp";
$X="esi";
# What we need to load into $tmp for the next round
%Ltmp1=("R0",&Np($C), "R1",&Np($C), "R2",&Np($C), "R3",&Np($D));
@xo=(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, # R0
1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, # R1
5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, # R2
0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
);
&md5_block("ossl_md5_block_asm_data_order");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
sub Np
{
local($p)=@_;
local(%n)=($A,$D,$B,$A,$C,$B,$D,$C);
return($n{$p});
}
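# For reference, the boolean functions that rounds R0..R3 below compute,
# written as plain Perl (an illustrative sketch added for documentation;
# the md5_ref_* helpers are not used by the generator). F and G use the
# xor/and/xor form, which avoids computing ~x:
sub md5_ref_F { my($x,$y,$z)=@_; ((($y ^ $z) & $x) ^ $z) & 0xffffffff }  # (x&y)|(~x&z)
sub md5_ref_G { my($x,$y,$z)=@_; ((($x ^ $y) & $z) ^ $y) & 0xffffffff }  # (x&z)|(y&~z)
sub md5_ref_H { my($x,$y,$z)=@_; ($x ^ $y ^ $z) & 0xffffffff }
sub md5_ref_I { my($x,$y,$z)=@_; ($y ^ ($x | (~$z & 0xffffffff))) & 0xffffffff }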
sub R0
{
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&mov($tmp1,$C) if $pos < 0;
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
# body proper
&comment("R0 $ki");
&xor($tmp1,$d); # F function - part 2
&and($tmp1,$b); # F function - part 3
&lea($a,&DWP($t,$a,$tmp2,1));
&xor($tmp1,$d); # F function - part 4
&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
&add($a,$tmp1);
&rotl($a,$s);
&mov($tmp1,&Np($c)) if $pos < 1; # next tmp1 for R0
&mov($tmp1,&Np($c)) if $pos == 1; # next tmp1 for R1
&add($a,$b);
}
sub R1
{
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&comment("R1 $ki");
&xor($tmp1,$b); # G function - part 2
&and($tmp1,$d); # G function - part 3
&lea($a,&DWP($t,$a,$tmp2,1));
&xor($tmp1,$c); # G function - part 4
&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
&add($a,$tmp1);
&mov($tmp1,&Np($c)) if $pos < 1; # G function - part 1
&mov($tmp1,&Np($c)) if $pos == 1; # G function - part 1
&rotl($a,$s);
&add($a,$b);
}
sub R2
{
local($n,$pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
# This one is different, only 3 logical operations
if (($n & 1) == 0)
{
&comment("R2 $ki");
# make sure to do 'D' first, not 'B', else we clash with
# the last add from the previous round.
&xor($tmp1,$d); # H function - part 2
&xor($tmp1,$b); # H function - part 3
&lea($a,&DWP($t,$a,$tmp2,1));
&add($a,$tmp1);
&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0));
&rotl($a,$s);
&mov($tmp1,&Np($c));
}
else
{
&comment("R2 $ki");
# make sure to do 'D' first, not 'B', else we clash with
# the last add from the previous round.
&add($b,$c); # MOVED FORWARD
&xor($tmp1,$d); # H function - part 2
&lea($a,&DWP($t,$a,$tmp2,1));
&xor($tmp1,$b); # H function - part 3
&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
&add($a,$tmp1);
&mov($tmp1,&Np($c)) if $pos < 1; # H function - part 1
&mov($tmp1,-1) if $pos == 1; # I function - part 1
&rotl($a,$s);
&add($a,$b);
}
}
sub R3
{
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&comment("R3 $ki");
# &not($tmp1)
&xor($tmp1,$d) if $pos < 0; # I function - part 2
&or($tmp1,$b); # I function - part 3
&lea($a,&DWP($t,$a,$tmp2,1));
&xor($tmp1,$c); # I function - part 4
&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if $pos != 2; # load X/k value
&mov($tmp2,&wparam(0)) if $pos == 2;
&add($a,$tmp1);
&mov($tmp1,-1) if $pos < 1; # I function - part 1 (load ~0 for next step)
&add($K,64) if $pos >=1 && !$normal;
&rotl($a,$s);
&xor($tmp1,&Np($d)) if $pos <= 0; # I function - part 2 (first time)
&mov($tmp1,&DWP( 0,$tmp2,"",0)) if $pos > 0;
&add($a,$b);
}
sub md5_block
{
local($name)=@_;
&function_begin_B($name,"",3);
# parameter 1 is the MD5_CTX structure.
# A 0
# B 4
# C 8
# D 12
&push("esi");
&push("edi");
&mov($tmp1, &wparam(0)); # edi
&mov($X, &wparam(1)); # esi
&mov($C, &wparam(2));
&push("ebp");
&shl($C, 6);
&push("ebx");
&add($C, $X); # offset we end at
&sub($C, 64);
&mov($A, &DWP( 0,$tmp1,"",0));
&push($C); # Put on the TOS
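# $C now holds input_ptr + 64*(blocks-1), i.e. the address of the last
# block; the push above keeps it on the stack so the loop-end compare
# below can test it against the advancing data pointer.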
&mov($B, &DWP( 4,$tmp1,"",0));
&mov($C, &DWP( 8,$tmp1,"",0));
&mov($D, &DWP(12,$tmp1,"",0));
&set_label("start") unless $normal;
&comment("");
&comment("R0 section");
&R0(-2,$A,$B,$C,$D,$X, 0, 7,0xd76aa478);
&R0( 0,$D,$A,$B,$C,$X, 1,12,0xe8c7b756);
&R0( 0,$C,$D,$A,$B,$X, 2,17,0x242070db);
&R0( 0,$B,$C,$D,$A,$X, 3,22,0xc1bdceee);
&R0( 0,$A,$B,$C,$D,$X, 4, 7,0xf57c0faf);
&R0( 0,$D,$A,$B,$C,$X, 5,12,0x4787c62a);
&R0( 0,$C,$D,$A,$B,$X, 6,17,0xa8304613);
&R0( 0,$B,$C,$D,$A,$X, 7,22,0xfd469501);
&R0( 0,$A,$B,$C,$D,$X, 8, 7,0x698098d8);
&R0( 0,$D,$A,$B,$C,$X, 9,12,0x8b44f7af);
&R0( 0,$C,$D,$A,$B,$X,10,17,0xffff5bb1);
&R0( 0,$B,$C,$D,$A,$X,11,22,0x895cd7be);
&R0( 0,$A,$B,$C,$D,$X,12, 7,0x6b901122);
&R0( 0,$D,$A,$B,$C,$X,13,12,0xfd987193);
&R0( 0,$C,$D,$A,$B,$X,14,17,0xa679438e);
&R0( 1,$B,$C,$D,$A,$X,15,22,0x49b40821);
&comment("");
&comment("R1 section");
&R1(-1,$A,$B,$C,$D,$X,16, 5,0xf61e2562);
&R1( 0,$D,$A,$B,$C,$X,17, 9,0xc040b340);
&R1( 0,$C,$D,$A,$B,$X,18,14,0x265e5a51);
&R1( 0,$B,$C,$D,$A,$X,19,20,0xe9b6c7aa);
&R1( 0,$A,$B,$C,$D,$X,20, 5,0xd62f105d);
&R1( 0,$D,$A,$B,$C,$X,21, 9,0x02441453);
&R1( 0,$C,$D,$A,$B,$X,22,14,0xd8a1e681);
&R1( 0,$B,$C,$D,$A,$X,23,20,0xe7d3fbc8);
&R1( 0,$A,$B,$C,$D,$X,24, 5,0x21e1cde6);
&R1( 0,$D,$A,$B,$C,$X,25, 9,0xc33707d6);
&R1( 0,$C,$D,$A,$B,$X,26,14,0xf4d50d87);
&R1( 0,$B,$C,$D,$A,$X,27,20,0x455a14ed);
&R1( 0,$A,$B,$C,$D,$X,28, 5,0xa9e3e905);
&R1( 0,$D,$A,$B,$C,$X,29, 9,0xfcefa3f8);
&R1( 0,$C,$D,$A,$B,$X,30,14,0x676f02d9);
&R1( 1,$B,$C,$D,$A,$X,31,20,0x8d2a4c8a);
&comment("");
&comment("R2 section");
&R2( 0,-1,$A,$B,$C,$D,$X,32, 4,0xfffa3942);
&R2( 1, 0,$D,$A,$B,$C,$X,33,11,0x8771f681);
&R2( 2, 0,$C,$D,$A,$B,$X,34,16,0x6d9d6122);
&R2( 3, 0,$B,$C,$D,$A,$X,35,23,0xfde5380c);
&R2( 4, 0,$A,$B,$C,$D,$X,36, 4,0xa4beea44);
&R2( 5, 0,$D,$A,$B,$C,$X,37,11,0x4bdecfa9);
&R2( 6, 0,$C,$D,$A,$B,$X,38,16,0xf6bb4b60);
&R2( 7, 0,$B,$C,$D,$A,$X,39,23,0xbebfbc70);
&R2( 8, 0,$A,$B,$C,$D,$X,40, 4,0x289b7ec6);
&R2( 9, 0,$D,$A,$B,$C,$X,41,11,0xeaa127fa);
&R2(10, 0,$C,$D,$A,$B,$X,42,16,0xd4ef3085);
&R2(11, 0,$B,$C,$D,$A,$X,43,23,0x04881d05);
&R2(12, 0,$A,$B,$C,$D,$X,44, 4,0xd9d4d039);
&R2(13, 0,$D,$A,$B,$C,$X,45,11,0xe6db99e5);
&R2(14, 0,$C,$D,$A,$B,$X,46,16,0x1fa27cf8);
&R2(15, 1,$B,$C,$D,$A,$X,47,23,0xc4ac5665);
&comment("");
&comment("R3 section");
&R3(-1,$A,$B,$C,$D,$X,48, 6,0xf4292244);
&R3( 0,$D,$A,$B,$C,$X,49,10,0x432aff97);
&R3( 0,$C,$D,$A,$B,$X,50,15,0xab9423a7);
&R3( 0,$B,$C,$D,$A,$X,51,21,0xfc93a039);
&R3( 0,$A,$B,$C,$D,$X,52, 6,0x655b59c3);
&R3( 0,$D,$A,$B,$C,$X,53,10,0x8f0ccc92);
&R3( 0,$C,$D,$A,$B,$X,54,15,0xffeff47d);
&R3( 0,$B,$C,$D,$A,$X,55,21,0x85845dd1);
&R3( 0,$A,$B,$C,$D,$X,56, 6,0x6fa87e4f);
&R3( 0,$D,$A,$B,$C,$X,57,10,0xfe2ce6e0);
&R3( 0,$C,$D,$A,$B,$X,58,15,0xa3014314);
&R3( 0,$B,$C,$D,$A,$X,59,21,0x4e0811a1);
&R3( 0,$A,$B,$C,$D,$X,60, 6,0xf7537e82);
&R3( 0,$D,$A,$B,$C,$X,61,10,0xbd3af235);
&R3( 0,$C,$D,$A,$B,$X,62,15,0x2ad7d2bb);
&R3( 2,$B,$C,$D,$A,$X,63,21,0xeb86d391);
# &mov($tmp2,&wparam(0)); # done in the last R3
# &mov($tmp1, &DWP( 0,$tmp2,"",0)); # done in the last R3
&add($A,$tmp1);
&mov($tmp1, &DWP( 4,$tmp2,"",0));
&add($B,$tmp1);
&mov($tmp1, &DWP( 8,$tmp2,"",0));
&add($C,$tmp1);
&mov($tmp1, &DWP(12,$tmp2,"",0));
&add($D,$tmp1);
&mov(&DWP( 0,$tmp2,"",0),$A);
&mov(&DWP( 4,$tmp2,"",0),$B);
&mov($tmp1,&swtmp(0)) unless $normal;
&mov(&DWP( 8,$tmp2,"",0),$C);
&mov(&DWP(12,$tmp2,"",0),$D);
&cmp($tmp1,$X) unless $normal; # check count
&jae(&label("start")) unless $normal;
&pop("eax"); # pop the temp variable off the stack
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}


@@ -0,0 +1,715 @@
#! /usr/bin/env perl
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# MD5 optimized for aarch64.
use strict;
my $code;
#no warnings qw(uninitialized);
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
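# Note on the generated code below: AArch64 has no rotate-left
# instruction, so every MD5 "<<< s" is emitted as "ror #(32-s)" (e.g.
# s=7 becomes "ror w6, w6, #25"), and each 32-bit round constant is
# materialized with a movz/movk pair.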
$code .= <<EOF;
#include "arm_arch.h"
.text
.globl ossl_md5_block_asm_data_order
.type ossl_md5_block_asm_data_order,\@function
ossl_md5_block_asm_data_order:
AARCH64_VALID_CALL_TARGET
// Save all callee-saved registers
stp x19,x20,[sp,#-80]!
stp x21,x22,[sp,#16]
stp x23,x24,[sp,#32]
stp x25,x26,[sp,#48]
stp x27,x28,[sp,#64]
ldp w10, w11, [x0, #0] // Load MD5 state->A and state->B
ldp w12, w13, [x0, #8] // Load MD5 state->C and state->D
.align 5
ossl_md5_blocks_loop:
eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
ldp w15, w20, [x1] // Load 2 words of input data0 M[0],M[1]
ldp w3, w21, [x1, #8] // Load 2 words of input data0 M[2],M[3]
#ifdef __AARCH64EB__
rev w15, w15
rev w20, w20
rev w3, w3
rev w21, w21
#endif
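// On big-endian targets the rev instructions above byte-swap each
// loaded message word, since MD5 is defined over little-endian input.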
eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xa478 // Load lower half of constant 0xd76aa478
movk x9, #0xd76a, lsl #16 // Load upper half of constant 0xd76aa478
add w8, w10, w15 // Add dest value
add w7, w8, w9 // Add constant 0xd76aa478
add w6, w7, w14 // Add aux function result
ror w6, w6, #25 // Rotate left s=7 bits
eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xb756 // Load lower half of constant 0xe8c7b756
movk x16, #0xe8c7, lsl #16 // Load upper half of constant 0xe8c7b756
add w9, w13, w20 // Add dest value
add w7, w9, w16 // Add constant 0xe8c7b756
add w14, w7, w17 // Add aux function result
ror w14, w14, #20 // Rotate left s=12 bits
eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0x70db // Load lower half of constant 0x242070db
movk x16, #0x2420, lsl #16 // Load upper half of constant 0x242070db
add w7, w12, w3 // Add dest value
add w17, w7, w16 // Add constant 0x242070db
add w14, w17, w9 // Add aux function result
ror w14, w14, #15 // Rotate left s=17 bits
eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xceee // Load lower half of constant 0xc1bdceee
movk x9, #0xc1bd, lsl #16 // Load upper half of constant 0xc1bdceee
add w14, w11, w21 // Add dest value
add w6, w14, w9 // Add constant 0xc1bdceee
add w7, w6, w16 // Add aux function result
ror w7, w7, #10 // Rotate left s=22 bits
eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
ldp w14, w22, [x1, #16] // Load 2 words of input data0 M[4],M[5]
ldp w7, w23, [x1, #24] // Load 2 words of input data0 M[6],M[7]
#ifdef __AARCH64EB__
rev w14, w14
rev w22, w22
rev w7, w7
rev w23, w23
#endif
and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xfaf // Load lower half of constant 0xf57c0faf
movk x16, #0xf57c, lsl #16 // Load upper half of constant 0xf57c0faf
add w17, w4, w14 // Add dest value
add w16, w17, w16 // Add constant 0xf57c0faf
add w4, w16, w6 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4])
and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0xc62a // Load lower half of constant 0x4787c62a
movk x4, #0x4787, lsl #16 // Load upper half of constant 0x4787c62a
add w16, w5, w22 // Add dest value
add w16, w16, w4 // Add constant 0x4787c62a
add w5, w16, w6 // Add aux function result
ror w5, w5, #20 // Rotate left s=12 bits
eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5])
and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0x4613 // Load lower half of constant 0xa8304613
movk x4, #0xa830, lsl #16 // Load upper half of constant 0xa8304613
add w6, w8, w7 // Add dest value
add w8, w6, w4 // Add constant 0xa8304613
add w4, w8, w5 // Add aux function result
ror w4, w4, #15 // Rotate left s=17 bits
eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6])
and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x6, #0x9501 // Load lower half of constant 0xfd469501
movk x6, #0xfd46, lsl #16 // Load upper half of constant 0xfd469501
add w9, w9, w23 // Add dest value
add w5, w9, w6 // Add constant 0xfd469501
add w9, w5, w4 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7])
ldp w5, w24, [x1, #32] // Load 2 words of input data0 M[8],M[9]
ldp w16, w25, [x1, #40] // Load 2 words of input data0 M[10],M[11]
#ifdef __AARCH64EB__
rev w5, w5
rev w24, w24
rev w16, w16
rev w25, w25
#endif
and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0x98d8 // Load lower half of constant 0x698098d8
movk x9, #0x6980, lsl #16 // Load upper half of constant 0x698098d8
add w17, w17, w5 // Add dest value
add w9, w17, w9 // Add constant 0x698098d8
add w17, w9, w6 // Add aux function result
ror w17, w17, #25 // Rotate left s=7 bits
eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8])
and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x17, #0xf7af // Load lower half of constant 0x8b44f7af
movk x17, #0x8b44, lsl #16 // Load upper half of constant 0x8b44f7af
add w19, w19, w24 // Add dest value
add w17, w19, w17 // Add constant 0x8b44f7af
add w19, w17, w9 // Add aux function result
ror w19, w19, #20 // Rotate left s=12 bits
eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9])
and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0x5bb1 // Load lower half of constant 0xffff5bb1
movk x11, #0xffff, lsl #16 // Load upper half of constant 0xffff5bb1
add w8, w8, w16 // Add dest value
add w8, w8, w11 // Add constant 0xffff5bb1
add w8, w8, w9 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10])
and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0xd7be // Load lower half of constant 0x895cd7be
movk x11, #0x895c, lsl #16 // Load upper half of constant 0x895cd7be
add w4, w4, w25 // Add dest value
add w4, w4, w11 // Add constant 0x895cd7be
add w9, w4, w9 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11])
ldp w11, w26, [x1, #48] // Load 2 words of input data0 M[12],M[13]
ldp w12, w27, [x1, #56] // Load 2 words of input data0 M[14],M[15]
#ifdef __AARCH64EB__
rev w11, w11
rev w26, w26
rev w12, w12
rev w27, w27
#endif
and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x1122 // Load lower half of constant 0x6b901122
movk x19, #0x6b90, lsl #16 // Load upper half of constant 0x6b901122
add w6, w6, w11 // Add dest value
add w6, w6, w19 // Add constant 0x6b901122
add w4, w6, w4 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12])
and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x7193 // Load lower half of constant 0xfd987193
movk x19, #0xfd98, lsl #16 // Load upper half of constant 0xfd987193
add w17, w17, w26 // Add dest value
add w17, w17, w19 // Add constant 0xfd987193
add w17, w17, w6 // Add aux function result
ror w17, w17, #20 // Rotate left s=12 bits
eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13])
and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x438e // Load lower half of constant 0xa679438e
movk x13, #0xa679, lsl #16 // Load upper half of constant 0xa679438e
add w8, w8, w12 // Add dest value
add w8, w8, w13 // Add constant 0xa679438e
add w8, w8, w6 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14])
and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x821 // Load lower half of constant 0x49b40821
movk x13, #0x49b4, lsl #16 // Load upper half of constant 0x49b40821
add w9, w9, w27 // Add dest value
add w9, w9, w13 // Add constant 0x49b40821
add w9, w9, w6 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x2562 // Load lower half of constant 0xf61e2562
movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562
add w4, w4, w20 // Add dest value
add w4, w4, w13 // Add constant 0xf61e2562
add w4, w4, w6 // Add aux function result
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xb340 // Load lower half of constant 0xc040b340
movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340
add w17, w17, w7 // Add dest value
add w17, w17, w13 // Add constant 0xc040b340
add w17, w17, w6 // Add aux function result
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x5a51 // Load lower half of constant 0x265e5a51
movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51
add w8, w8, w25 // Add dest value
add w8, w8, w13 // Add constant 0x265e5a51
add w8, w8, w6 // Add aux function result
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa
movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa
add w9, w9, w15 // Add dest value
add w9, w9, w13 // Add constant 0xe9b6c7aa
add w9, w9, w6 // Add aux function result
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x105d // Load lower half of constant 0xd62f105d
movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d
add w4, w4, w22 // Add dest value
add w4, w4, w13 // Add constant 0xd62f105d
add w4, w4, w6 // Add aux function result
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x1453 // Load lower half of constant 0x2441453
movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453
add w17, w17, w16 // Add dest value
add w17, w17, w13 // Add constant 0x2441453
add w17, w17, w6 // Add aux function result
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xe681 // Load lower half of constant 0xd8a1e681
movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681
add w8, w8, w27 // Add dest value
add w8, w8, w13 // Add constant 0xd8a1e681
add w8, w8, w6 // Add aux function result
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8
movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8
add w9, w9, w14 // Add dest value
add w9, w9, w13 // Add constant 0xe7d3fbc8
add w9, w9, w6 // Add aux function result
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6
movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6
add w4, w4, w24 // Add dest value
add w4, w4, w13 // Add constant 0x21e1cde6
add w4, w4, w6 // Add aux function result
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x7d6 // Load lower half of constant 0xc33707d6
movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6
add w17, w17, w12 // Add dest value
add w17, w17, w13 // Add constant 0xc33707d6
add w17, w17, w6 // Add aux function result
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xd87 // Load lower half of constant 0xf4d50d87
movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87
add w8, w8, w21 // Add dest value
add w8, w8, w13 // Add constant 0xf4d50d87
add w8, w8, w6 // Add aux function result
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x14ed // Load lower half of constant 0x455a14ed
movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed
add w9, w9, w5 // Add dest value
add w9, w9, w13 // Add constant 0x455a14ed
add w9, w9, w6 // Add aux function result
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xe905 // Load lower half of constant 0xa9e3e905
movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905
add w4, w4, w26 // Add dest value
add w4, w4, w13 // Add constant 0xa9e3e905
add w4, w4, w6 // Add aux function result
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8
movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8
add w17, w17, w3 // Add dest value
add w17, w17, w13 // Add constant 0xfcefa3f8
add w17, w17, w6 // Add aux function result
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x2d9 // Load lower half of constant 0x676f02d9
movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9
add w8, w8, w23 // Add dest value
add w8, w8, w13 // Add constant 0x676f02d9
add w8, w8, w6 // Add aux function result
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a
movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a
add w9, w9, w11 // Add dest value
add w9, w9, w13 // Add constant 0x8d2a4c8a
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #12 // Rotate left s=20 bits
movz x10, #0x3942 // Load lower half of constant 0xfffa3942
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12])
movk x10, #0xfffa, lsl #16 // Load upper half of constant 0xfffa3942
add w4, w4, w22 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xfffa3942
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xf681 // Load lower half of constant 0x8771f681
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5])
movk x10, #0x8771, lsl #16 // Load upper half of constant 0x8771f681
add w17, w17, w5 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x8771f681
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x6122 // Load lower half of constant 0x6d9d6122
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8])
movk x13, #0x6d9d, lsl #16 // Load upper half of constant 0x6d9d6122
add w8, w8, w25 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x6d9d6122
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x380c // Load lower half of constant 0xfde5380c
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11])
movk x13, #0xfde5, lsl #16 // Load upper half of constant 0xfde5380c
add w9, w9, w12 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xfde5380c
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xea44 // Load lower half of constant 0xa4beea44
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14])
movk x10, #0xa4be, lsl #16 // Load upper half of constant 0xa4beea44
add w4, w4, w20 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xa4beea44
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xcfa9 // Load lower half of constant 0x4bdecfa9
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1])
movk x10, #0x4bde, lsl #16 // Load upper half of constant 0x4bdecfa9
add w17, w17, w14 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x4bdecfa9
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x4b60 // Load lower half of constant 0xf6bb4b60
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4])
movk x13, #0xf6bb, lsl #16 // Load upper half of constant 0xf6bb4b60
add w8, w8, w23 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xf6bb4b60
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0xbc70 // Load lower half of constant 0xbebfbc70
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7])
movk x13, #0xbebf, lsl #16 // Load upper half of constant 0xbebfbc70
add w9, w9, w16 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xbebfbc70
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0x7ec6 // Load lower half of constant 0x289b7ec6
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10])
movk x10, #0x289b, lsl #16 // Load upper half of constant 0x289b7ec6
add w4, w4, w26 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0x289b7ec6
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x27fa // Load lower half of constant 0xeaa127fa
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13])
movk x10, #0xeaa1, lsl #16 // Load upper half of constant 0xeaa127fa
add w17, w17, w15 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xeaa127fa
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x3085 // Load lower half of constant 0xd4ef3085
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0])
movk x13, #0xd4ef, lsl #16 // Load upper half of constant 0xd4ef3085
add w8, w8, w21 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xd4ef3085
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x1d05 // Load lower half of constant 0x4881d05
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3])
movk x13, #0x488, lsl #16 // Load upper half of constant 0x4881d05
add w9, w9, w7 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0x4881d05
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xd039 // Load lower half of constant 0xd9d4d039
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6])
movk x10, #0xd9d4, lsl #16 // Load upper half of constant 0xd9d4d039
add w4, w4, w24 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xd9d4d039
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x99e5 // Load lower half of constant 0xe6db99e5
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9])
movk x10, #0xe6db, lsl #16 // Load upper half of constant 0xe6db99e5
add w17, w17, w11 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xe6db99e5
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x7cf8 // Load lower half of constant 0x1fa27cf8
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12])
movk x13, #0x1fa2, lsl #16 // Load upper half of constant 0x1fa27cf8
add w8, w8, w27 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x1fa27cf8
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x5665 // Load lower half of constant 0xc4ac5665
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15])
movk x13, #0xc4ac, lsl #16 // Load upper half of constant 0xc4ac5665
add w9, w9, w3 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xc4ac5665
add w9, w9, w6 // Add aux function result
ror w9, w9, #9 // Rotate left s=23 bits
movz x6, #0x2244 // Load lower half of constant 0xf4292244
movk x6, #0xf429, lsl #16 // Load upper half of constant 0xf4292244
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2])
add w4, w4, w15 // Add dest value
orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add constant 0xf4292244
eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add aux function result
ror w4, w4, #26 // Rotate left s=6 bits
movz x6, #0xff97 // Load lower half of constant 0x432aff97
movk x6, #0x432a, lsl #16 // Load upper half of constant 0x432aff97
add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0])
orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w23 // Add dest value
eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w6 // Add constant 0x432aff97
add w6, w17, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x17, #0x23a7 // Load lower half of constant 0xab9423a7
movk x17, #0xab94, lsl #16 // Load upper half of constant 0xab9423a7
add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7])
add w8, w8, w12 // Add dest value
orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add constant 0xab9423a7
eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x17, #0xa039 // Load lower half of constant 0xfc93a039
movk x17, #0xfc93, lsl #16 // Load upper half of constant 0xfc93a039
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14])
orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w22 // Add dest value
eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w17 // Add constant 0xfc93a039
add w17, w9, w13 // Add aux function result
ror w17, w17, #11 // Rotate left s=21 bits
movz x9, #0x59c3 // Load lower half of constant 0x655b59c3
movk x9, #0x655b, lsl #16 // Load upper half of constant 0x655b59c3
add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5])
add w4, w4, w11 // Add dest value
orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w4, w9 // Add constant 0x655b59c3
eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x4, #0xcc92 // Load lower half of constant 0x8f0ccc92
movk x4, #0x8f0c, lsl #16 // Load upper half of constant 0x8f0ccc92
add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12])
orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w21 // Add dest value
eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w6, w4 // Add constant 0x8f0ccc92
add w6, w4, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x4, #0xf47d // Load lower half of constant 0xffeff47d
movk x4, #0xffef, lsl #16 // Load upper half of constant 0xffeff47d
add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3])
add w8, w8, w16 // Add dest value
orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add constant 0xffeff47d
eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x4, #0x5dd1 // Load lower half of constant 0x85845dd1
movk x4, #0x8584, lsl #16 // Load upper half of constant 0x85845dd1
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10])
orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w17, w20 // Add dest value
eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w15, w4 // Add constant 0x85845dd1
add w4, w15, w17 // Add aux function result
ror w4, w4, #11 // Rotate left s=21 bits
movz x15, #0x7e4f // Load lower half of constant 0x6fa87e4f
movk x15, #0x6fa8, lsl #16 // Load upper half of constant 0x6fa87e4f
add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1])
add w4, w9, w5 // Add dest value
orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w15 // Add constant 0x6fa87e4f
eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w15, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x15, #0xe6e0 // Load lower half of constant 0xfe2ce6e0
movk x15, #0xfe2c, lsl #16 // Load upper half of constant 0xfe2ce6e0
add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8])
orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w27 // Add dest value
eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w6, w15 // Add constant 0xfe2ce6e0
add w6, w15, w9 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x9, #0x4314 // Load lower half of constant 0xa3014314
movk x9, #0xa301, lsl #16 // Load upper half of constant 0xa3014314
add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15])
add w6, w8, w7 // Add dest value
orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w6, w9 // Add constant 0xa3014314
eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w8, w9 // Add aux function result
ror w6, w6, #17 // Rotate left s=15 bits
movz x7, #0x11a1 // Load lower half of constant 0x4e0811a1
movk x7, #0x4e08, lsl #16 // Load upper half of constant 0x4e0811a1
add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6])
orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w17, w26 // Add dest value
eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w6, w7 // Add constant 0x4e0811a1
add w7, w9, w17 // Add aux function result
ror w7, w7, #11 // Rotate left s=21 bits
movz x6, #0x7e82 // Load lower half of constant 0xf7537e82
movk x6, #0xf753, lsl #16 // Load upper half of constant 0xf7537e82
add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13])
add w17, w4, w14 // Add dest value
orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w14, w17, w6 // Add constant 0xf7537e82
eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w14, w4 // Add aux function result
ror w17, w17, #26 // Rotate left s=6 bits
movz x6, #0xf235 // Load lower half of constant 0xbd3af235
movk x6, #0xbd3a, lsl #16 // Load upper half of constant 0xbd3af235
add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4])
orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w15, w25 // Add dest value
eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w6 // Add constant 0xbd3af235
add w16, w15, w17 // Add aux function result
ror w16, w16, #22 // Rotate left s=10 bits
movz x14, #0xd2bb // Load lower half of constant 0x2ad7d2bb
movk x14, #0x2ad7, lsl #16 // Load upper half of constant 0x2ad7d2bb
add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11])
add w6, w8, w3 // Add dest value
orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w6, w14 // Add constant 0x2ad7d2bb
eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w17, w16 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x3, #0xd391 // Load lower half of constant 0xeb86d391
movk x3, #0xeb86, lsl #16 // Load upper half of constant 0xeb86d391
add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2])
orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w9, w24 // Add dest value
eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w16, w15, w3 // Add constant 0xeb86d391
add w8, w16, w17 // Add aux function result
ror w8, w8, #11 // Rotate left s=21 bits
ldp w6, w15, [x0] // Reload MD5 state->A and state->B
ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D
add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9])
add w13, w4, w9 // Add result of MD5 rounds to state->D
add w12, w14, w5 // Add result of MD5 rounds to state->C
add w10, w7, w6 // Add result of MD5 rounds to state->A
add w11, w3, w15 // Add result of MD5 rounds to state->B
stp w12, w13, [x0, #8] // Store MD5 states C,D
stp w10, w11, [x0] // Store MD5 states A,B
add x1, x1, #64 // Increment data pointer
subs w2, w2, #1 // Decrement block counter
b.ne ossl_md5_blocks_loop
ldp x21,x22,[sp,#16]
ldp x23,x24,[sp,#32]
ldp x25,x26,[sp,#48]
ldp x27,x28,[sp,#64]
ldp x19,x20,[sp],#80
ret
EOF
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Reference to crypto/md5/asm/md5-x86_64.pl
# MD5 optimized for LoongArch.
use strict;
my $code;
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$output and open STDOUT,">$output";
# round1_step() does:
# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# $t1 = y ^ z
# $t2 = dst + X[k_next]
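# For clarity, the same update in plain 32-bit Perl arithmetic (an
# illustrative sketch added for documentation; md5_round1_ref is not
# part of the generator and is never called):
sub md5_round1_ref
{
    my ($dst, $x, $y, $z, $Xk, $Ti, $s) = @_;
    my $f = ((($y ^ $z) & $x) ^ $z) & 0xffffffff;          # F(x,y,z)
    my $v = ($dst + $f + $Xk + $Ti) & 0xffffffff;
    $v = (($v << $s) | ($v >> (32 - $s))) & 0xffffffff;    # <<< s
    return ($v + $x) & 0xffffffff;
}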
sub round1_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
my $T_i_h = ($T_i & 0xfffff000) >> 12;
my $T_i_l = $T_i & 0xfff;
# On LoongArch we have to use two instructions, lu12i.w and ori, to load a
# 32-bit immediate into a general register. The instruction lu12i.w
# treats its 20-bit immediate as a signed number, so if T_i_h is greater
# than or equal to (1<<19) we must instead give lu12i.w the negative value
# whose two's-complement bit pattern equals the sign extension of T_i_h.
# Details of the lu12i.w instruction can be found at:
# https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_lu12i_w_lu32i_d_lu52i_d
$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
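# Worked example: T_i = 0xd76aa478 gives T_i_h = 0xd76aa >= (1<<19), so
# lu12i.w receives -166230 (bit pattern 0xfffd76aa); sign extension then
# leaves 0xd76aa in bits [31:12], and ori fills in the low bits 0x478.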
$code .= " ld.w $t0,$a1,0 /* (NEXT STEP) X[0] */\n" if ($pos == -1);
$code .= " xor $t1,$y,$z /* y ^ z */\n" if ($pos == -1);
$code .= " add.w $t2,$dst,$t0 /* dst + X[k] */\n" if ($pos == -1);
$code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of constant */
and $t1,$x,$t1 /* x & ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of constant */
xor $t1,$z,$t1 /* z ^ ... */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF
$code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
if ($pos != 1) {
$code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
} else {
$code .= " move $t0,$a7 /* (NEXT ROUND) $t0 = z' (copy of z) */\n";
$code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z' (copy of not z) */\n";
}
$code .= " add.w $dst,$dst,$x /* dst += x */\n";
}
# round2_step() does:
# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# $t0 = z' (copy of z for the next step)
# $t1 = not z' (copy of not z for the next step)
# $t2 = dst + X[k_next]
sub round2_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
my $T_i_h = ($T_i & 0xfffff000) >> 12;
my $T_i_l = $T_i & 0xfff;
$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
$code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
and $t0,$x,$t0 /* x & z */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
and $t1,$y,$t1 /* y & (not z) */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
or $t1,$t0,$t1 /* (y & (not z)) | (x & z) */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF
$code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
if ($pos != 1) {
$code .= " move $t0,$y /* (NEXT STEP) z' = $y */\n";
$code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
} else {
$code .= " xor $t1,$a6,$a7 /* (NEXT ROUND) $t1 = y ^ z */\n";
}
$code .= " add.w $dst,$dst,$x /* dst += x */\n";
}
# round3_step() does:
# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# $t1 = y ^ z
# $t2 = dst + X[k_next]
sub round3_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
my $T_i_h = ($T_i & 0xfffff000) >> 12;
my $T_i_l = $T_i & 0xfff;
$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
$code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
xor $t1,$x,$t1 /* x ^ ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF
$code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
if ($pos != 1) {
$code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
} else {
$code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z */\n";
}
$code .= " add.w $dst,$dst,$x /* dst += x */\n";
}
# round4_step() does:
# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# $t1 = not z' (copy of not z for the next step)
# $t2 = dst + X[k_next]
sub round4_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
my $T_i_h = ($T_i & 0xfffff000) >> 12;
my $T_i_l = $T_i & 0xfff;
$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
$code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
or $t1,$x,$t1 /* x | ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
xor $t1,$y,$t1 /* y ^ ... */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
EOF
if ($pos != 1) {
$code .= " ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */\n";
$code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
$code .= " add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */\n";
$code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
$code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
$code .= " add.w $dst,$dst,$x /* dst += x */\n";
} else {
$code .= " add.w $a4,$t3,$a4 /* (NEXT LOOP) add old value of A */\n";
$code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
$code .= " add.w $a7,$t6,$a7 /* (NEXT LOOP) add old value of D */\n";
$code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
$code .= " addi.d $a1,$a1,64 /* (NEXT LOOP) ptr += 64 */\n";
$code .= " add.w $dst,$dst,$x /* dst += x */\n";
}
}
$code .= <<EOF;
.text
.globl ossl_md5_block_asm_data_order
.type ossl_md5_block_asm_data_order function
ossl_md5_block_asm_data_order:
# $a0 = arg #1 (ctx, MD5_CTX pointer)
# $a1 = arg #2 (ptr, data pointer)
# $a2 = arg #3 (nbr, number of 16-word blocks to process)
beqz $a2,.Lend # cmp nbr with 0, jmp if nbr == 0
# ptr is '$a1'
# end is '$a3'
slli.d $t0,$a2,6
add.d $a3,$a1,$t0
# A is '$a4'
# B is '$a5'
# C is '$a6'
# D is '$a7'
ld.w $a4,$a0,0 # a4 = ctx->A
ld.w $a5,$a0,4 # a5 = ctx->B
ld.w $a6,$a0,8 # a6 = ctx->C
ld.w $a7,$a0,12 # a7 = ctx->D
# BEGIN of loop over 16-word blocks
.align 6
.Lloop:
# save old values of A, B, C, D
move $t3,$a4
move $t4,$a5
move $t5,$a6
move $t6,$a7
preld 0,$a1,0
preld 0,$a1,64
EOF
round1_step(-1, $a4, $a5, $a6, $a7, '1', 0xd76aa478, '7');
round1_step(0, $a7, $a4, $a5, $a6, '2', 0xe8c7b756, '12');
round1_step(0, $a6, $a7, $a4, $a5, '3', 0x242070db, '17');
round1_step(0, $a5, $a6, $a7, $a4, '4', 0xc1bdceee, '22');
round1_step(0, $a4, $a5, $a6, $a7, '5', 0xf57c0faf, '7');
round1_step(0, $a7, $a4, $a5, $a6, '6', 0x4787c62a, '12');
round1_step(0, $a6, $a7, $a4, $a5, '7', 0xa8304613, '17');
round1_step(0, $a5, $a6, $a7, $a4, '8', 0xfd469501, '22');
round1_step(0, $a4, $a5, $a6, $a7, '9', 0x698098d8, '7');
round1_step(0, $a7, $a4, $a5, $a6, '10', 0x8b44f7af, '12');
round1_step(0, $a6, $a7, $a4, $a5, '11', 0xffff5bb1, '17');
round1_step(0, $a5, $a6, $a7, $a4, '12', 0x895cd7be, '22');
round1_step(0, $a4, $a5, $a6, $a7, '13', 0x6b901122, '7');
round1_step(0, $a7, $a4, $a5, $a6, '14', 0xfd987193, '12');
round1_step(0, $a6, $a7, $a4, $a5, '15', 0xa679438e, '17');
round1_step(1, $a5, $a6, $a7, $a4, '1', 0x49b40821, '22');
round2_step(-1, $a4, $a5, $a6, $a7, '6', 0xf61e2562, '5');
round2_step(0, $a7, $a4, $a5, $a6, '11', 0xc040b340, '9');
round2_step(0, $a6, $a7, $a4, $a5, '0', 0x265e5a51, '14');
round2_step(0, $a5, $a6, $a7, $a4, '5', 0xe9b6c7aa, '20');
round2_step(0, $a4, $a5, $a6, $a7, '10', 0xd62f105d, '5');
round2_step(0, $a7, $a4, $a5, $a6, '15', 0x2441453, '9');
round2_step(0, $a6, $a7, $a4, $a5, '4', 0xd8a1e681, '14');
round2_step(0, $a5, $a6, $a7, $a4, '9', 0xe7d3fbc8, '20');
round2_step(0, $a4, $a5, $a6, $a7, '14', 0x21e1cde6, '5');
round2_step(0, $a7, $a4, $a5, $a6, '3', 0xc33707d6, '9');
round2_step(0, $a6, $a7, $a4, $a5, '8', 0xf4d50d87, '14');
round2_step(0, $a5, $a6, $a7, $a4, '13', 0x455a14ed, '20');
round2_step(0, $a4, $a5, $a6, $a7, '2', 0xa9e3e905, '5');
round2_step(0, $a7, $a4, $a5, $a6, '7', 0xfcefa3f8, '9');
round2_step(0, $a6, $a7, $a4, $a5, '12', 0x676f02d9, '14');
round2_step(1, $a5, $a6, $a7, $a4, '5', 0x8d2a4c8a, '20');
round3_step(-1, $a4, $a5, $a6, $a7, '8', 0xfffa3942, '4');
round3_step(0, $a7, $a4, $a5, $a6, '11', 0x8771f681, '11');
round3_step(0, $a6, $a7, $a4, $a5, '14', 0x6d9d6122, '16');
round3_step(0, $a5, $a6, $a7, $a4, '1', 0xfde5380c, '23');
round3_step(0, $a4, $a5, $a6, $a7, '4', 0xa4beea44, '4');
round3_step(0, $a7, $a4, $a5, $a6, '7', 0x4bdecfa9, '11');
round3_step(0, $a6, $a7, $a4, $a5, '10', 0xf6bb4b60, '16');
round3_step(0, $a5, $a6, $a7, $a4, '13', 0xbebfbc70, '23');
round3_step(0, $a4, $a5, $a6, $a7, '0', 0x289b7ec6, '4');
round3_step(0, $a7, $a4, $a5, $a6, '3', 0xeaa127fa, '11');
round3_step(0, $a6, $a7, $a4, $a5, '6', 0xd4ef3085, '16');
round3_step(0, $a5, $a6, $a7, $a4, '9', 0x4881d05, '23');
round3_step(0, $a4, $a5, $a6, $a7, '12', 0xd9d4d039, '4');
round3_step(0, $a7, $a4, $a5, $a6, '15', 0xe6db99e5, '11');
round3_step(0, $a6, $a7, $a4, $a5, '2', 0x1fa27cf8, '16');
round3_step(1, $a5, $a6, $a7, $a4, '0', 0xc4ac5665, '23');
round4_step(-1, $a4, $a5, $a6, $a7, '7', 0xf4292244, '6');
round4_step(0, $a7, $a4, $a5, $a6, '14', 0x432aff97, '10');
round4_step(0, $a6, $a7, $a4, $a5, '5', 0xab9423a7, '15');
round4_step(0, $a5, $a6, $a7, $a4, '12', 0xfc93a039, '21');
round4_step(0, $a4, $a5, $a6, $a7, '3', 0x655b59c3, '6');
round4_step(0, $a7, $a4, $a5, $a6, '10', 0x8f0ccc92, '10');
round4_step(0, $a6, $a7, $a4, $a5, '1', 0xffeff47d, '15');
round4_step(0, $a5, $a6, $a7, $a4, '8', 0x85845dd1, '21');
round4_step(0, $a4, $a5, $a6, $a7, '15', 0x6fa87e4f, '6');
round4_step(0, $a7, $a4, $a5, $a6, '6', 0xfe2ce6e0, '10');
round4_step(0, $a6, $a7, $a4, $a5, '13', 0xa3014314, '15');
round4_step(0, $a5, $a6, $a7, $a4, '4', 0x4e0811a1, '21');
round4_step(0, $a4, $a5, $a6, $a7, '11', 0xf7537e82, '6');
round4_step(0, $a7, $a4, $a5, $a6, '2', 0xbd3af235, '10');
round4_step(0, $a6, $a7, $a4, $a5, '9', 0x2ad7d2bb, '15');
round4_step(1, $a5, $a6, $a7, $a4, '0', 0xeb86d391, '21');
$code .= <<EOF;
# add old values of B, C
add.w $a5,$t4,$a5
add.w $a6,$t5,$a6
bltu $a1,$a3,.Lloop # jmp if ptr < end
st.w $a4,$a0,0 # ctx->A = A
st.w $a5,$a0,4 # ctx->B = B
st.w $a6,$a0,8 # ctx->C = C
st.w $a7,$a0,12 # ctx->D = D
.Lend:
jr $ra
.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
EOF
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,442 @@
#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller.
# ====================================================================
# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
# code generated by Sun C 5.2.
# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
# faster than software. Multi-process benchmark saturates at 12x
# single-process result on 8-core processor, or ~11GBps per 2.85GHz
# socket.
# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$output and open STDOUT,">$output";
use integer;
($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
# 64-bit values
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
$tx="%g3";
($AB,$CD)=("%g4","%g5");
# 32-bit values
@V=($A,$B,$C,$D)=map("%l$_",(0..3));
($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
($shr,$shl1,$shl2)=("%i3","%i4","%i5");
my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
0x6b901122,0xfd987193,0xa679438e,0x49b40821,
0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
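# These are the RFC 1321 sine-derived round constants, K[i] =
# floor(2^32 * abs(sin(i+1))) (e.g. sin(1) gives 0xd76aa478); the trailing 0
# is a sentinel so each step can blindly prefetch K[$i+1].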
sub R0 {
my ($i,$a,$b,$c,$d) = @_;
my $rot = (7,12,17,22)[$i%4];
my $j = ($i+1)/2;
if ($i&1) {
$code.=<<___;
srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`]
and $b,$t1,$t1 ! round $i
sllx @X[$j+1],$shl1,$tx
add $t2,$a,$a
sllx $tx,$shl2,$tx
xor $d,$t1,$t1
or $tx,@X[$j],@X[$j]
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
} else {
$code.=<<___;
srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
and $b,$t1,$t1 ! round $i
add $t2,$a,$a
xor $d,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
}
}
sub R0_1 {
my ($i,$a,$b,$c,$d) = @_;
my $rot = (7,12,17,22)[$i%4];
$code.=<<___;
srlx @X[0],32,$tx ! extract X[1]
and $b,$t1,$t1 ! round $i
add $t2,$a,$a
xor $d,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
andn $b,$c,$t1
add $t3,$a,$a
___
}
sub R1 {
my ($i,$a,$b,$c,$d) = @_;
my $rot = (5,9,14,20)[$i%4];
my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
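# $j is the message-word index for the *next* step: round 2 walks
# X[(5*k+1)%16], and at $i==31 it switches to round 3's schedule,
# X[(3*k+5)%16], so sub R2's first step finds X already staged.
# (The `$xi=$tx` inside the condition below redirects odd indices to
# the upper word extracted into $tx.)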
my $xi = @X[$j/2];
$code.=<<___ if ($j&1 && ($xi=$tx));
srlx @X[$j/2],32,$xi ! extract X[$j]
___
$code.=<<___;
and $b,$d,$t3 ! round $i
add $t2,$a,$a
or $t3,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
`$i<31?"andn":"xor"` $b,$c,$t1
add $t3,$a,$a
___
}
sub R2 {
my ($i,$a,$b,$c,$d) = @_;
my $rot = (4,11,16,23)[$i%4];
my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
my $xi = @X[$j/2];
$code.=<<___ if ($j&1 && ($xi=$tx));
srlx @X[$j/2],32,$xi ! extract X[$j]
___
$code.=<<___;
add $t2,$a,$a ! round $i
xor $b,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
}
sub R3 {
my ($i,$a,$b,$c,$d) = @_;
my $rot = (6,10,15,21)[$i%4];
my $j = (0+7*($i+1))%16;
my $xi = @X[$j/2];
$code.=<<___;
add $t2,$a,$a ! round $i
___
$code.=<<___ if ($j&1 && ($xi=$tx));
srlx @X[$j/2],32,$xi ! extract X[$j]
___
$code.=<<___;
orn $b,$d,$t1
sethi %hi(@K[$i+1]),$t2
xor $c,$t1,$t1
or $t2,%lo(@K[$i+1]),$t2
add $t1,$a,$a
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
add $t3,$a,$a
___
}
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl ossl_md5_block_asm_data_order
.align 32
ossl_md5_block_asm_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_MD5, %g0
be .Lsoftware
nop
mov 4, %g1
andcc %o1, 0x7, %g0
lda [%o0 + %g0]0x88, %f0 ! load context
lda [%o0 + %g1]0x88, %f1
add %o0, 8, %o0
lda [%o0 + %g0]0x88, %f2
lda [%o0 + %g1]0x88, %f3
bne,pn %icc, .Lhwunaligned
sub %o0, 8, %o0
.Lhw_loop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02800 ! MD5
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
sta %f0, [%o0 + %g0]0x88 ! store context
sta %f1, [%o0 + %g1]0x88
add %o0, 8, %o0
sta %f2, [%o0 + %g0]0x88
sta %f3, [%o0 + %g1]0x88
retl
nop
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02800 ! MD5
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-STACK_FRAME,%sp
rd %asi,$saved_asi
wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
and $inp,7,$shr
andn $inp,7,$inp
sll $shr,3,$shr ! *=8
mov 56,$shl2
ld [$ctx+0],$A
sub $shl2,$shr,$shl2
ld [$ctx+4],$B
and $shl2,32,$shl1
add $shl2,8,$shl2
ld [$ctx+8],$C
sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
ld [$ctx+12],$D
nop
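! note: the post-load left shift is split into shl1+shl2 so that the
! shr==0 case becomes two 32-bit shifts (a single sllx by 64 is not
! encodable)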
.Loop:
cmp $shr,0 ! was inp aligned?
ldxa [$inp+0]%asi,@X[0] ! load little-endian input
ldxa [$inp+8]%asi,@X[1]
ldxa [$inp+16]%asi,@X[2]
ldxa [$inp+24]%asi,@X[3]
ldxa [$inp+32]%asi,@X[4]
sllx $A,32,$AB ! pack A,B
ldxa [$inp+40]%asi,@X[5]
sllx $C,32,$CD ! pack C,D
ldxa [$inp+48]%asi,@X[6]
or $B,$AB,$AB
ldxa [$inp+56]%asi,@X[7]
or $D,$CD,$CD
bnz,a,pn %icc,.+8
ldxa [$inp+64]%asi,@X[8]
srlx @X[0],$shr,@X[0] ! align X[0]
sllx @X[1],$shl1,$tx
sethi %hi(@K[0]),$t2
sllx $tx,$shl2,$tx
or $t2,%lo(@K[0]),$t2
or $tx,@X[0],@X[0]
xor $C,$D,$t1
add @X[0],$t2,$t2 ! X[0]+K[0]
___
for ($i=0;$i<15;$i++) { &R0($i,@V); unshift(@V,pop(@V)); }
for (;$i<16;$i++) { &R0_1($i,@V); unshift(@V,pop(@V)); }
for (;$i<32;$i++) { &R1($i,@V); unshift(@V,pop(@V)); }
for (;$i<48;$i++) { &R2($i,@V); unshift(@V,pop(@V)); }
for (;$i<64;$i++) { &R3($i,@V); unshift(@V,pop(@V)); }
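# unshift(@V,pop(@V)) rotates (A,B,C,D) -> (D,A,B,C) after every step, so
# each R* body always accumulates into its first argument.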
$code.=<<___;
srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
add $inp,64,$inp ! advance inp
srlx $CD,32,$t2
add $t1,$A,$A
subcc $len,1,$len ! done yet?
add $AB,$B,$B
add $t2,$C,$C
add $CD,$D,$D
srl $B,0,$B ! clruw $B
bne SIZE_T_CC,.Loop
srl $D,0,$D ! clruw $D
st $A,[$ctx+0] ! write out ctx
st $B,[$ctx+4]
st $C,[$ctx+8]
st $D,[$ctx+12]
wr %g0,$saved_asi,%asi
ret
restore
.type ossl_md5_block_asm_data_order,#function
.size ossl_md5_block_asm_data_order,(.-ossl_md5_block_asm_data_order)
.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
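# Illustration (register choice is arbitrary): unvis("faligndata","%f10",
# "%f12","%f8") has opf 0x048, so it returns roughly
#   .word 0x91b2890c !faligndata %f10,%f12,%f8
# i.e. 0x81b00000 | 8<<25 | 10<<14 | 0x048<<5 | 12.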
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
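# Likewise, the "alignaddr %o1,%g0,%o1" in .Lhwunaligned above comes out as
#   .word 0x93b24300 !alignaddr %o1,%g0,%o1
# (0x81b00300 | 9<<25 | 9<<14 | 0, with %o1 -> bias 8 + 1 = 9).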
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge;
s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,395 @@
#! /usr/bin/env perl
# Author: Marc Bevand <bevand_m (at) epita.fr>
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# MD5 optimized for AMD64.
use strict;
my $code;
# round1_step() does:
# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
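# Note: F(x,y,z) = (x & y) | (~x & z) is evaluated as z ^ (x & (y ^ z)),
# which needs only xor/and/xor on the z copy kept in %r11d.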
sub round1_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
$code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
$code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
xor $y, %r11d /* y ^ ... */
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $x, %r11d /* x & ... */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
xor $z, %r11d /* z ^ ... */
add %r11d, $dst /* dst += ... */
rol \$$s, $dst /* dst <<< s */
mov $y, %r11d /* (NEXT STEP) z' = $y */
add $x, $dst /* dst += x */
EOF
}
# round2_step() does:
# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
# %r12d = z' (copy of z for the next step)
# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
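# Note: G(x,y,z) = (x & z) | (y & ~z); the two AND terms are built in
# %r12d and %r11d respectively, then OR'd, which is why two z copies
# are carried between steps.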
sub round2_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
$code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
not %r11d /* not z */
and $x, %r12d /* x & z */
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $y, %r11d /* y & (not z) */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
or %r11d, %r12d /* (y & (not z)) | (x & z) */
mov $y, %r11d /* (NEXT STEP) z' = $y */
add %r12d, $dst /* dst += ... */
mov $y, %r12d /* (NEXT STEP) z' = $y */
rol \$$s, $dst /* dst <<< s */
add $x, $dst /* dst += x */
EOF
}
# round3_step() does:
# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = y' (copy of y for the next step)
# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
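# H(x,y,z) = x ^ y ^ z needs no masking, hence only 8 instructions. The
# rol/mov pair below is swapped on alternating steps via $round3_alter,
# presumably to even out execution-port pressure between adjacent steps.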
{ my $round3_alter=0;
sub round3_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
$code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
$code .= <<EOF;
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
xor $z, %r11d /* z ^ ... */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
xor $x, %r11d /* x ^ ... */
add %r11d, $dst /* dst += ... */
EOF
$code .= <<EOF if ($round3_alter);
rol \$$s, $dst /* dst <<< s */
mov $x, %r11d /* (NEXT STEP) y' = $x */
EOF
$code .= <<EOF if (!$round3_alter);
mov $x, %r11d /* (NEXT STEP) y' = $x */
rol \$$s, $dst /* dst <<< s */
EOF
$code .= <<EOF;
add $x, $dst /* dst += x */
EOF
$round3_alter^=1;
}
}
# round4_step() does:
# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = not z' (copy of not z for the next step)
# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
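# Note: I(x,y,z) = y ^ (x | ~z); ~z is rematerialized in %r11d each step
# with mov $0xffffffff / xor.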
sub round4_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
$code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1);
$code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n"
if ($pos == -1);
$code .= <<EOF;
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
or $x, %r11d /* x | ... */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
xor $y, %r11d /* y ^ ... */
add %r11d, $dst /* dst += ... */
mov \$0xffffffff, %r11d
rol \$$s, $dst /* dst <<< s */
xor $y, %r11d /* (NEXT STEP) not z' = not $y */
add $x, $dst /* dst += x */
EOF
}
no warnings qw(uninitialized);
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$code .= <<EOF;
.text
.align 16
.globl ossl_md5_block_asm_data_order
.type ossl_md5_block_asm_data_order,\@function,3
ossl_md5_block_asm_data_order:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lprologue:
# rdi = arg #1 (ctx, MD5_CTX pointer)
# rsi = arg #2 (ptr, data pointer)
# rdx = arg #3 (nbr, number of 16-word blocks to process)
mov %rdi, %rbp # rbp = ctx
shl \$6, %rdx # rdx = nbr in bytes
lea (%rsi,%rdx), %rdi # rdi = end
mov 0*4(%rbp), %eax # eax = ctx->A
mov 1*4(%rbp), %ebx # ebx = ctx->B
mov 2*4(%rbp), %ecx # ecx = ctx->C
mov 3*4(%rbp), %edx # edx = ctx->D
# end is 'rdi'
# ptr is 'rsi'
# A is 'eax'
# B is 'ebx'
# C is 'ecx'
# D is 'edx'
cmp %rdi, %rsi # cmp end with ptr
je .Lend # jmp if ptr == end
# BEGIN of loop over 16-word blocks
.Lloop: # save old values of A, B, C, D
mov %eax, %r8d
mov %ebx, %r9d
mov %ecx, %r14d
mov %edx, %r15d
EOF
round1_step(-1,'%eax','%ebx','%ecx','%edx', '1','0xd76aa478', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xe8c7b756','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx', '3','0x242070db','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax', '4','0xc1bdceee','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx', '5','0xf57c0faf', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx', '6','0x4787c62a','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx', '7','0xa8304613','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax', '8','0xfd469501','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx', '9','0x698098d8', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8b44f7af','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx','11','0xffff5bb1','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax','12','0x895cd7be','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx','13','0x6b901122', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx','14','0xfd987193','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx','15','0xa679438e','17');
round1_step( 1,'%ebx','%ecx','%edx','%eax', '1','0x49b40821','22');
round2_step(-1,'%eax','%ebx','%ecx','%edx', '6','0xf61e2562', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx','11','0xc040b340', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '0','0x265e5a51','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax', '5','0xe9b6c7aa','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx','10','0xd62f105d', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx','15', '0x2441453', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '4','0xd8a1e681','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax', '9','0xe7d3fbc8','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx','14','0x21e1cde6', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xc33707d6', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '8','0xf4d50d87','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax','13','0x455a14ed','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx', '2','0xa9e3e905', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx', '7','0xfcefa3f8', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx','12','0x676f02d9','14');
round2_step( 1,'%ebx','%ecx','%edx','%eax', '5','0x8d2a4c8a','20');
round3_step(-1,'%eax','%ebx','%ecx','%edx', '8','0xfffa3942', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx','11','0x8771f681','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx','14','0x6d9d6122','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax', '1','0xfde5380c','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx', '4','0xa4beea44', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx', '7','0x4bdecfa9','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx','10','0xf6bb4b60','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax','13','0xbebfbc70','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx', '0','0x289b7ec6', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xeaa127fa','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx', '6','0xd4ef3085','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax', '9', '0x4881d05','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx','12','0xd9d4d039', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx','15','0xe6db99e5','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx', '2','0x1fa27cf8','16');
round3_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xc4ac5665','23');
round4_step(-1,'%eax','%ebx','%ecx','%edx', '7','0xf4292244', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx','14','0x432aff97','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '5','0xab9423a7','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax','12','0xfc93a039','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx', '3','0x655b59c3', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8f0ccc92','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '1','0xffeff47d','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax', '8','0x85845dd1','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx','15','0x6fa87e4f', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx', '6','0xfe2ce6e0','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx','13','0xa3014314','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax', '4','0x4e0811a1','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx','11','0xf7537e82', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xbd3af235','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '9','0x2ad7d2bb','15');
round4_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xeb86d391','21');
$code .= <<EOF;
# add old values of A, B, C, D
add %r8d, %eax
add %r9d, %ebx
add %r14d, %ecx
add %r15d, %edx
# loop control
add \$64, %rsi # ptr += 64
cmp %rdi, %rsi # cmp end with ptr
jb .Lloop # jmp if ptr < end
# END of loop over 16-word blocks
.Lend:
mov %eax, 0*4(%rbp) # ctx->A = A
mov %ebx, 1*4(%rbp) # ctx->B = B
mov %ecx, 2*4(%rbp) # ctx->C = C
mov %edx, 3*4(%rbp) # ctx->D = D
mov (%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r12
.cfi_restore %r12
mov 24(%rsp),%rbx
.cfi_restore %rbx
mov 32(%rsp),%rbp
.cfi_restore %rbp
add \$40,%rsp
.cfi_adjust_cfa_offset -40
.Lepilogue:
ret
.cfi_endproc
.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
EOF
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
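# se_handler lets the Win64 unwinder step over this function's frame: if
# context->Rip lies between .Lprologue and .Lepilogue, the five registers
# pushed on entry are read back off the stack into the captured CONTEXT
# before RtlVirtualUnwind continues the unwind.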
if ($win64) {
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
lea 40(%rax),%rax
mov -8(%rax),%rbp
mov -16(%rax),%rbx
mov -24(%rax),%r12
mov -32(%rax),%r14
mov -40(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva .LSEH_begin_ossl_md5_block_asm_data_order
.rva .LSEH_end_ossl_md5_block_asm_data_order
.rva .LSEH_info_ossl_md5_block_asm_data_order
.section .xdata
.align 8
.LSEH_info_ossl_md5_block_asm_data_order:
.byte 9,0,0,0
.rva se_handler
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";