; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py

; RUN: llc -mtriple=thumbv8m.base-arm-none-eabi < %s | FileCheck %s

; Converts %blockSize 16-bit samples read from %pSrc into 32-bit words written
; to %pDst, each word being the sample shifted left by 16. Some samples are
; sign-extended and some zero-extended first; the extension bits are shifted
; out again, so both forms produce the same 32-bit result.
;
; Control flow, as written in the IR below:
;   entry                    - goes straight to the tail when blockSize < 4.
;   while.body.prol{,.1,.2}  - up to three peeled iterations, selected by
;                              %xtraiter = (blockSize >> 2) & 3; each one
;                              converts a group of 4 samples.
;   while.body               - main loop, 4 groups (16 samples) per trip.
;   while.body12{,.1,.2}     - up to three trailing samples (blockSize & 3).
;
; The autogenerated assertions expect every sample to be lowered as
; ldrh followed by lsls #16.
define void @arm_q15_to_q31(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly %pDst, i32 noundef %blockSize) {
; CHECK-LABEL: arm_q15_to_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    mov r7, r2
; CHECK-NEXT:    lsrs r3, r2, #2
; CHECK-NEXT:    beq .LBB0_6
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    movs r5, #3
; CHECK-NEXT:    ands r5, r3
; CHECK-NEXT:    subs r2, r3, #1
; CHECK-NEXT:    cbz r5, .LBB0_4
; CHECK-NEXT:  @ %bb.2: @ %while.body.prol
; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    ldrh r7, [r0, #2]
; CHECK-NEXT:    ldrh r4, [r0, #4]
; CHECK-NEXT:    ldrh r6, [r0, #6]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r7, r7, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    stm r1!, {r2, r7}
; CHECK-NEXT:    str r4, [r1]
; CHECK-NEXT:    str r6, [r1, #4]
; CHECK-NEXT:    subs r1, #8
; CHECK-NEXT:    cmp r5, #1
; CHECK-NEXT:    bne .LBB0_11
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:    adds r1, #16
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:  .LBB0_4: @ %while.body.prol.loopexit
; CHECK-NEXT:    cmp r2, #3
; CHECK-NEXT:    blo .LBB0_6
; CHECK-NEXT:  .LBB0_5: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    ldrh r4, [r0, #2]
; CHECK-NEXT:    ldrh r5, [r0, #4]
; CHECK-NEXT:    ldrh r6, [r0, #6]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #12]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #8]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #4]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    ldrh r2, [r0, #8]
; CHECK-NEXT:    ldrh r4, [r0, #10]
; CHECK-NEXT:    ldrh r5, [r0, #12]
; CHECK-NEXT:    ldrh r6, [r0, #14]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #28]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #24]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #20]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #16]
; CHECK-NEXT:    ldrh r2, [r0, #16]
; CHECK-NEXT:    ldrh r4, [r0, #18]
; CHECK-NEXT:    ldrh r5, [r0, #20]
; CHECK-NEXT:    ldrh r6, [r0, #22]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #44]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #40]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #36]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #32]
; CHECK-NEXT:    ldrh r2, [r0, #24]
; CHECK-NEXT:    ldrh r4, [r0, #26]
; CHECK-NEXT:    ldrh r5, [r0, #28]
; CHECK-NEXT:    ldrh r6, [r0, #30]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #60]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #56]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #52]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #48]
; CHECK-NEXT:    adds r1, #64
; CHECK-NEXT:    adds r0, #32
; CHECK-NEXT:    subs r3, r3, #4
; CHECK-NEXT:    bne .LBB0_5
; CHECK-NEXT:  .LBB0_6: @ %while.end
; CHECK-NEXT:    movs r2, #3
; CHECK-NEXT:    ands r7, r2
; CHECK-NEXT:    beq .LBB0_10
; CHECK-NEXT:  @ %bb.7: @ %while.body12
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    cmp r7, #1
; CHECK-NEXT:    beq .LBB0_10
; CHECK-NEXT:  @ %bb.8: @ %while.body12.1
; CHECK-NEXT:    ldrh r2, [r0, #2]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #4]
; CHECK-NEXT:    cmp r7, #2
; CHECK-NEXT:    beq .LBB0_10
; CHECK-NEXT:  @ %bb.9: @ %while.body12.2
; CHECK-NEXT:    ldrh r0, [r0, #4]
; CHECK-NEXT:    lsls r0, r0, #16
; CHECK-NEXT:    str r0, [r1, #8]
; CHECK-NEXT:  .LBB0_10: @ %while.end17
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; CHECK-NEXT:  .LBB0_11: @ %while.body.prol.1
; CHECK-NEXT:    ldrh r2, [r0, #8]
; CHECK-NEXT:    ldrh r4, [r0, #10]
; CHECK-NEXT:    ldrh r6, [r0, #12]
; CHECK-NEXT:    ldrh r7, [r0, #14]
; CHECK-NEXT:    lsls r7, r7, #16
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #16]
; CHECK-NEXT:    str r4, [r1, #20]
; CHECK-NEXT:    str r6, [r1, #24]
; CHECK-NEXT:    str r7, [r1, #28]
; CHECK-NEXT:    cmp r5, #2
; CHECK-NEXT:    bne .LBB0_13
; CHECK-NEXT:  @ %bb.12:
; CHECK-NEXT:    subs r3, r3, #2
; CHECK-NEXT:    adds r1, #32
; CHECK-NEXT:    adds r0, #16
; CHECK-NEXT:    b .LBB0_14
; CHECK-NEXT:  .LBB0_13: @ %while.body.prol.2
; CHECK-NEXT:    ldrh r2, [r0, #16]
; CHECK-NEXT:    ldrh r4, [r0, #18]
; CHECK-NEXT:    ldrh r5, [r0, #20]
; CHECK-NEXT:    ldrh r6, [r0, #22]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    mov r7, r1
; CHECK-NEXT:    adds r7, #32
; CHECK-NEXT:    stm r7!, {r2, r4, r5, r6}
; CHECK-NEXT:    subs r3, r3, #3
; CHECK-NEXT:    adds r1, #48
; CHECK-NEXT:    adds r0, #24
; CHECK-NEXT:  .LBB0_14: @ %while.body.prol.loopexit
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:    cmp r2, #3
; CHECK-NEXT:    bhs .LBB0_5
; CHECK-NEXT:    b .LBB0_6
; Fewer than 4 samples means no complete group: only the scalar tail runs.
entry:
  %cmp.not19 = icmp ult i32 %blockSize, 4
  br i1 %cmp.not19, label %while.end, label %while.body.preheader

; %shr is the number of 4-sample groups, %xtraiter the number of groups peeled
; ahead of the 4x-unrolled loop, and %0 = %shr - 1 is tested later in
; while.body.prol.loopexit.
while.body.preheader:                             ; preds = %entry
  %shr = lshr i32 %blockSize, 2
  %0 = add nsw i32 %shr, -1
  %xtraiter = and i32 %shr, 3
  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod.not, label %while.body.prol.loopexit, label %while.body.prol

; Peeled group 1 (samples 0-3). Elements 0 and 2 are zero-extended, elements 1
; and 3 sign-extended; all four are shifted left by 16 and stored in order.
; Element 1 is addressed with a 2-byte i8 GEP rather than an i16 GEP.
while.body.prol:                                  ; preds = %while.body.preheader
  %pIn.0.val.prol = load i16, ptr %pSrc, align 2
  %1 = getelementptr i8, ptr %pSrc, i32 2
  %pIn.0.val13.prol = load i16, ptr %1, align 2
  %conv.i.prol = sext i16 %pIn.0.val13.prol to i32
  %shl.i.prol = shl nsw i32 %conv.i.prol, 16
  %conv22.i.prol = zext i16 %pIn.0.val.prol to i32
  %add.ptr2.prol = getelementptr inbounds i16, ptr %pSrc, i32 4
  %add.ptr3.prol = getelementptr inbounds i16, ptr %pSrc, i32 2
  %add.ptr3.val.prol = load i16, ptr %add.ptr3.prol, align 2
  %2 = getelementptr i16, ptr %pSrc, i32 3
  %add.ptr3.val14.prol = load i16, ptr %2, align 2
  %conv.i15.prol = sext i16 %add.ptr3.val14.prol to i32
  %shl.i16.prol = shl nsw i32 %conv.i15.prol, 16
  %conv22.i17.prol = zext i16 %add.ptr3.val.prol to i32
  %shl.prol = shl nuw i32 %conv22.i.prol, 16
  %shl5.prol = shl nuw i32 %conv22.i17.prol, 16
  %incdec.ptr.prol = getelementptr inbounds i32, ptr %pDst, i32 1
  store i32 %shl.prol, ptr %pDst, align 4
  %incdec.ptr7.prol = getelementptr inbounds i32, ptr %pDst, i32 2
  store i32 %shl.i.prol, ptr %incdec.ptr.prol, align 4
  %incdec.ptr8.prol = getelementptr inbounds i32, ptr %pDst, i32 3
  store i32 %shl5.prol, ptr %incdec.ptr7.prol, align 4
  %incdec.ptr9.prol = getelementptr inbounds i32, ptr %pDst, i32 4
  store i32 %shl.i16.prol, ptr %incdec.ptr8.prol, align 4
  %dec.prol = add nsw i32 %shr, -1
  %prol.iter.cmp.not = icmp eq i32 %xtraiter, 1
  br i1 %prol.iter.cmp.not, label %while.body.prol.loopexit, label %while.body.prol.1

; Peeled group 2 (samples 4-7), same pattern as the first group.
while.body.prol.1:                                ; preds = %while.body.prol
  %pIn.0.val.prol.1 = load i16, ptr %add.ptr2.prol, align 2
  %3 = getelementptr i16, ptr %pSrc, i32 5
  %pIn.0.val13.prol.1 = load i16, ptr %3, align 2
  %conv.i.prol.1 = sext i16 %pIn.0.val13.prol.1 to i32
  %shl.i.prol.1 = shl nsw i32 %conv.i.prol.1, 16
  %conv22.i.prol.1 = zext i16 %pIn.0.val.prol.1 to i32
  %add.ptr2.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 8
  %add.ptr3.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 6
  %add.ptr3.val.prol.1 = load i16, ptr %add.ptr3.prol.1, align 2
  %4 = getelementptr i16, ptr %pSrc, i32 7
  %add.ptr3.val14.prol.1 = load i16, ptr %4, align 2
  %conv.i15.prol.1 = sext i16 %add.ptr3.val14.prol.1 to i32
  %shl.i16.prol.1 = shl nsw i32 %conv.i15.prol.1, 16
  %conv22.i17.prol.1 = zext i16 %add.ptr3.val.prol.1 to i32
  %shl.prol.1 = shl nuw i32 %conv22.i.prol.1, 16
  %shl5.prol.1 = shl nuw i32 %conv22.i17.prol.1, 16
  %incdec.ptr.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 5
  store i32 %shl.prol.1, ptr %incdec.ptr9.prol, align 4
  %incdec.ptr7.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 6
  store i32 %shl.i.prol.1, ptr %incdec.ptr.prol.1, align 4
  %incdec.ptr8.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 7
  store i32 %shl5.prol.1, ptr %incdec.ptr7.prol.1, align 4
  %incdec.ptr9.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 8
  store i32 %shl.i16.prol.1, ptr %incdec.ptr8.prol.1, align 4
  %dec.prol.1 = add nsw i32 %shr, -2
  %prol.iter.cmp.1.not = icmp eq i32 %xtraiter, 2
  br i1 %prol.iter.cmp.1.not, label %while.body.prol.loopexit, label %while.body.prol.2

; Peeled group 3 (samples 8-11), same pattern as the first group.
while.body.prol.2:                                ; preds = %while.body.prol.1
  %pIn.0.val.prol.2 = load i16, ptr %add.ptr2.prol.1, align 2
  %5 = getelementptr i16, ptr %pSrc, i32 9
  %pIn.0.val13.prol.2 = load i16, ptr %5, align 2
  %conv.i.prol.2 = sext i16 %pIn.0.val13.prol.2 to i32
  %shl.i.prol.2 = shl nsw i32 %conv.i.prol.2, 16
  %conv22.i.prol.2 = zext i16 %pIn.0.val.prol.2 to i32
  %add.ptr2.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 12
  %add.ptr3.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 10
  %add.ptr3.val.prol.2 = load i16, ptr %add.ptr3.prol.2, align 2
  %6 = getelementptr i16, ptr %pSrc, i32 11
  %add.ptr3.val14.prol.2 = load i16, ptr %6, align 2
  %conv.i15.prol.2 = sext i16 %add.ptr3.val14.prol.2 to i32
  %shl.i16.prol.2 = shl nsw i32 %conv.i15.prol.2, 16
  %conv22.i17.prol.2 = zext i16 %add.ptr3.val.prol.2 to i32
  %shl.prol.2 = shl nuw i32 %conv22.i.prol.2, 16
  %shl5.prol.2 = shl nuw i32 %conv22.i17.prol.2, 16
  %incdec.ptr.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 9
  store i32 %shl.prol.2, ptr %incdec.ptr9.prol.1, align 4
  %incdec.ptr7.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 10
  store i32 %shl.i.prol.2, ptr %incdec.ptr.prol.2, align 4
  %incdec.ptr8.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 11
  store i32 %shl5.prol.2, ptr %incdec.ptr7.prol.2, align 4
  %incdec.ptr9.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 12
  store i32 %shl.i16.prol.2, ptr %incdec.ptr8.prol.2, align 4
  %dec.prol.2 = add nsw i32 %shr, -3
  br label %while.body.prol.loopexit

; Merge point of the peeled iterations. The undef incoming values belong to
; the path with %xtraiter == 0; on that path %shr is a non-zero multiple of 4,
; so %0 >= 3 and the branch below goes to while.body, never to while.end where
; the two *.lcssa.unr values are consumed.
while.body.prol.loopexit:                         ; preds = %while.body.prol, %while.body.prol.1, %while.body.prol.2, %while.body.preheader
  %add.ptr2.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
  %incdec.ptr9.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
  %pDst.addr.022.unr = phi ptr [ %pDst, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
  %blkCnt.021.unr = phi i32 [ %shr, %while.body.preheader ], [ %dec.prol, %while.body.prol ], [ %dec.prol.1, %while.body.prol.1 ], [ %dec.prol.2, %while.body.prol.2 ]
  %pIn.020.unr = phi ptr [ %pSrc, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
  %7 = icmp ult i32 %0, 3
  br i1 %7, label %while.end, label %while.body

; Main loop: four groups (16 samples) per trip. %blkCnt.021 counts remaining
; groups and drops by 4 each time around; the loop exits when it reaches 0.
while.body:                                       ; preds = %while.body.prol.loopexit, %while.body
  %pDst.addr.022 = phi ptr [ %incdec.ptr9.3, %while.body ], [ %pDst.addr.022.unr, %while.body.prol.loopexit ]
  %blkCnt.021 = phi i32 [ %dec.3, %while.body ], [ %blkCnt.021.unr, %while.body.prol.loopexit ]
  %pIn.020 = phi ptr [ %add.ptr2.3, %while.body ], [ %pIn.020.unr, %while.body.prol.loopexit ]
  %pIn.0.val = load i16, ptr %pIn.020, align 2
  %8 = getelementptr i8, ptr %pIn.020, i32 2
  %pIn.0.val13 = load i16, ptr %8, align 2
  %conv.i = sext i16 %pIn.0.val13 to i32
  %shl.i = shl nsw i32 %conv.i, 16
  %conv22.i = zext i16 %pIn.0.val to i32
  %add.ptr2 = getelementptr inbounds i16, ptr %pIn.020, i32 4
  %add.ptr3 = getelementptr inbounds i16, ptr %pIn.020, i32 2
  %add.ptr3.val = load i16, ptr %add.ptr3, align 2
  %9 = getelementptr i16, ptr %pIn.020, i32 3
  %add.ptr3.val14 = load i16, ptr %9, align 2
  %conv.i15 = sext i16 %add.ptr3.val14 to i32
  %shl.i16 = shl nsw i32 %conv.i15, 16
  %conv22.i17 = zext i16 %add.ptr3.val to i32
  %shl = shl nuw i32 %conv22.i, 16
  %shl5 = shl nuw i32 %conv22.i17, 16
  %incdec.ptr = getelementptr inbounds i32, ptr %pDst.addr.022, i32 1
  store i32 %shl, ptr %pDst.addr.022, align 4
  %incdec.ptr7 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 2
  store i32 %shl.i, ptr %incdec.ptr, align 4
  %incdec.ptr8 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 3
  store i32 %shl5, ptr %incdec.ptr7, align 4
  %incdec.ptr9 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 4
  store i32 %shl.i16, ptr %incdec.ptr8, align 4
  %pIn.0.val.1 = load i16, ptr %add.ptr2, align 2
  %10 = getelementptr i16, ptr %pIn.020, i32 5
  %pIn.0.val13.1 = load i16, ptr %10, align 2
  %conv.i.1 = sext i16 %pIn.0.val13.1 to i32
  %shl.i.1 = shl nsw i32 %conv.i.1, 16
  %conv22.i.1 = zext i16 %pIn.0.val.1 to i32
  %add.ptr2.1 = getelementptr inbounds i16, ptr %pIn.020, i32 8
  %add.ptr3.1 = getelementptr inbounds i16, ptr %pIn.020, i32 6
  %add.ptr3.val.1 = load i16, ptr %add.ptr3.1, align 2
  %11 = getelementptr i16, ptr %pIn.020, i32 7
  %add.ptr3.val14.1 = load i16, ptr %11, align 2
  %conv.i15.1 = sext i16 %add.ptr3.val14.1 to i32
  %shl.i16.1 = shl nsw i32 %conv.i15.1, 16
  %conv22.i17.1 = zext i16 %add.ptr3.val.1 to i32
  %shl.1 = shl nuw i32 %conv22.i.1, 16
  %shl5.1 = shl nuw i32 %conv22.i17.1, 16
  %incdec.ptr.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 5
  store i32 %shl.1, ptr %incdec.ptr9, align 4
  %incdec.ptr7.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 6
  store i32 %shl.i.1, ptr %incdec.ptr.1, align 4
  %incdec.ptr8.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 7
  store i32 %shl5.1, ptr %incdec.ptr7.1, align 4
  %incdec.ptr9.1 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 8
  store i32 %shl.i16.1, ptr %incdec.ptr8.1, align 4
  %pIn.0.val.2 = load i16, ptr %add.ptr2.1, align 2
  %12 = getelementptr i16, ptr %pIn.020, i32 9
  %pIn.0.val13.2 = load i16, ptr %12, align 2
  %conv.i.2 = sext i16 %pIn.0.val13.2 to i32
  %shl.i.2 = shl nsw i32 %conv.i.2, 16
  %conv22.i.2 = zext i16 %pIn.0.val.2 to i32
  %add.ptr2.2 = getelementptr inbounds i16, ptr %pIn.020, i32 12
  %add.ptr3.2 = getelementptr inbounds i16, ptr %pIn.020, i32 10
  %add.ptr3.val.2 = load i16, ptr %add.ptr3.2, align 2
  %13 = getelementptr i16, ptr %pIn.020, i32 11
  %add.ptr3.val14.2 = load i16, ptr %13, align 2
  %conv.i15.2 = sext i16 %add.ptr3.val14.2 to i32
  %shl.i16.2 = shl nsw i32 %conv.i15.2, 16
  %conv22.i17.2 = zext i16 %add.ptr3.val.2 to i32
  %shl.2 = shl nuw i32 %conv22.i.2, 16
  %shl5.2 = shl nuw i32 %conv22.i17.2, 16
  %incdec.ptr.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 9
  store i32 %shl.2, ptr %incdec.ptr9.1, align 4
  %incdec.ptr7.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 10
  store i32 %shl.i.2, ptr %incdec.ptr.2, align 4
  %incdec.ptr8.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 11
  store i32 %shl5.2, ptr %incdec.ptr7.2, align 4
  %incdec.ptr9.2 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 12
  store i32 %shl.i16.2, ptr %incdec.ptr8.2, align 4
  %pIn.0.val.3 = load i16, ptr %add.ptr2.2, align 2
  %14 = getelementptr i16, ptr %pIn.020, i32 13
  %pIn.0.val13.3 = load i16, ptr %14, align 2
  %conv.i.3 = sext i16 %pIn.0.val13.3 to i32
  %shl.i.3 = shl nsw i32 %conv.i.3, 16
  %conv22.i.3 = zext i16 %pIn.0.val.3 to i32
  %add.ptr2.3 = getelementptr inbounds i16, ptr %pIn.020, i32 16
  %add.ptr3.3 = getelementptr inbounds i16, ptr %pIn.020, i32 14
  %add.ptr3.val.3 = load i16, ptr %add.ptr3.3, align 2
  %15 = getelementptr i16, ptr %pIn.020, i32 15
  %add.ptr3.val14.3 = load i16, ptr %15, align 2
  %conv.i15.3 = sext i16 %add.ptr3.val14.3 to i32
  %shl.i16.3 = shl nsw i32 %conv.i15.3, 16
  %conv22.i17.3 = zext i16 %add.ptr3.val.3 to i32
  %shl.3 = shl nuw i32 %conv22.i.3, 16
  %shl5.3 = shl nuw i32 %conv22.i17.3, 16
  %incdec.ptr.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 13
  store i32 %shl.3, ptr %incdec.ptr9.2, align 4
  %incdec.ptr7.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 14
  store i32 %shl.i.3, ptr %incdec.ptr.3, align 4
  %incdec.ptr8.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 15
  store i32 %shl5.3, ptr %incdec.ptr7.3, align 4
  %incdec.ptr9.3 = getelementptr inbounds i32, ptr %pDst.addr.022, i32 16
  store i32 %shl.i16.3, ptr %incdec.ptr8.3, align 4
  %dec.3 = add nsw i32 %blkCnt.021, -4
  %cmp.not.3 = icmp eq i32 %dec.3, 0
  br i1 %cmp.not.3, label %while.end, label %while.body

; Tail: %rem = blockSize & 3 leftover samples, converted one at a time by the
; blocks below (each sign-extends its sample before the shift).
while.end:                                        ; preds = %while.body.prol.loopexit, %while.body, %entry
  %pIn.0.lcssa = phi ptr [ %pSrc, %entry ], [ %add.ptr2.lcssa.unr, %while.body.prol.loopexit ], [ %add.ptr2.3, %while.body ]
  %pDst.addr.0.lcssa = phi ptr [ %pDst, %entry ], [ %incdec.ptr9.lcssa.unr, %while.body.prol.loopexit ], [ %incdec.ptr9.3, %while.body ]
  %rem = and i32 %blockSize, 3
  %cmp11.not24 = icmp eq i32 %rem, 0
  br i1 %cmp11.not24, label %while.end17, label %while.body12

while.body12:                                     ; preds = %while.end
  %16 = load i16, ptr %pIn.0.lcssa, align 2
  %conv = sext i16 %16 to i32
  %shl14 = shl nsw i32 %conv, 16
  store i32 %shl14, ptr %pDst.addr.0.lcssa, align 4
  %cmp11.not = icmp eq i32 %rem, 1
  br i1 %cmp11.not, label %while.end17, label %while.body12.1

while.body12.1:                                   ; preds = %while.body12
  %incdec.ptr15 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 1
  %incdec.ptr13 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 1
  %17 = load i16, ptr %incdec.ptr13, align 2
  %conv.1 = sext i16 %17 to i32
  %shl14.1 = shl nsw i32 %conv.1, 16
  store i32 %shl14.1, ptr %incdec.ptr15, align 4
  %cmp11.not.1 = icmp eq i32 %rem, 2
  br i1 %cmp11.not.1, label %while.end17, label %while.body12.2

while.body12.2:                                   ; preds = %while.body12.1
  %incdec.ptr15.1 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 2
  %incdec.ptr13.1 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 2
  %18 = load i16, ptr %incdec.ptr13.1, align 2
  %conv.2 = sext i16 %18 to i32
  %shl14.2 = shl nsw i32 %conv.2, 16
  store i32 %shl14.2, ptr %incdec.ptr15.1, align 4
  br label %while.end17

while.end17:                                      ; preds = %while.body12, %while.body12.1, %while.body12.2, %while.end
  ret void
}

define void @arm_q15_to_q31_altorder(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly %pDst, i32 noundef %blockSize) {
; CHECK-LABEL: arm_q15_to_q31_altorder:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    mov r7, r2
; CHECK-NEXT:    lsrs r3, r2, #2
; CHECK-NEXT:    beq .LBB1_6
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    movs r5, #3
; CHECK-NEXT:    ands r5, r3
; CHECK-NEXT:    subs r2, r3, #1
; CHECK-NEXT:    cbz r5, .LBB1_4
; CHECK-NEXT:  @ %bb.2: @ %while.body.prol
; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    ldrh r7, [r0, #2]
; CHECK-NEXT:    ldrh r4, [r0, #4]
; CHECK-NEXT:    ldrh r6, [r0, #6]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r7, r7, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    stm r1!, {r2, r7}
; CHECK-NEXT:    str r4, [r1]
; CHECK-NEXT:    str r6, [r1, #4]
; CHECK-NEXT:    subs r1, #8
; CHECK-NEXT:    cmp r5, #1
; CHECK-NEXT:    bne .LBB1_11
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:    adds r1, #16
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:  .LBB1_4: @ %while.body.prol.loopexit
; CHECK-NEXT:    cmp r2, #3
; CHECK-NEXT:    blo .LBB1_6
; CHECK-NEXT:  .LBB1_5: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    ldrh r4, [r0, #2]
; CHECK-NEXT:    ldrh r5, [r0, #4]
; CHECK-NEXT:    ldrh r6, [r0, #6]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #12]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #8]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #4]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    ldrh r2, [r0, #8]
; CHECK-NEXT:    ldrh r4, [r0, #10]
; CHECK-NEXT:    ldrh r5, [r0, #12]
; CHECK-NEXT:    ldrh r6, [r0, #14]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #28]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #24]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #20]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #16]
; CHECK-NEXT:    ldrh r2, [r0, #16]
; CHECK-NEXT:    ldrh r4, [r0, #18]
; CHECK-NEXT:    ldrh r5, [r0, #20]
; CHECK-NEXT:    ldrh r6, [r0, #22]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #44]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #40]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #36]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #32]
; CHECK-NEXT:    ldrh r2, [r0, #24]
; CHECK-NEXT:    ldrh r4, [r0, #26]
; CHECK-NEXT:    ldrh r5, [r0, #28]
; CHECK-NEXT:    ldrh r6, [r0, #30]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    str r6, [r1, #60]
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    str r5, [r1, #56]
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    str r4, [r1, #52]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #48]
; CHECK-NEXT:    adds r1, #64
; CHECK-NEXT:    adds r0, #32
; CHECK-NEXT:    subs r3, r3, #4
; CHECK-NEXT:    bne .LBB1_5
; CHECK-NEXT:  .LBB1_6: @ %while.end
; CHECK-NEXT:    movs r2, #3
; CHECK-NEXT:    ands r7, r2
; CHECK-NEXT:    beq .LBB1_10
; CHECK-NEXT:  @ %bb.7: @ %while.body12
; CHECK-NEXT:    ldrh r2, [r0]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    cmp r7, #1
; CHECK-NEXT:    beq .LBB1_10
; CHECK-NEXT:  @ %bb.8: @ %while.body12.1
; CHECK-NEXT:    ldrh r2, [r0, #2]
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #4]
; CHECK-NEXT:    cmp r7, #2
; CHECK-NEXT:    beq .LBB1_10
; CHECK-NEXT:  @ %bb.9: @ %while.body12.2
; CHECK-NEXT:    ldrh r0, [r0, #4]
; CHECK-NEXT:    lsls r0, r0, #16
; CHECK-NEXT:    str r0, [r1, #8]
; CHECK-NEXT:  .LBB1_10: @ %while.end17
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; CHECK-NEXT:  .LBB1_11: @ %while.body.prol.1
; CHECK-NEXT:    ldrh r2, [r0, #8]
; CHECK-NEXT:    ldrh r4, [r0, #10]
; CHECK-NEXT:    ldrh r6, [r0, #12]
; CHECK-NEXT:    ldrh r7, [r0, #14]
; CHECK-NEXT:    lsls r7, r7, #16
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    str r2, [r1, #16]
; CHECK-NEXT:    str r4, [r1, #20]
; CHECK-NEXT:    str r6, [r1, #24]
; CHECK-NEXT:    str r7, [r1, #28]
; CHECK-NEXT:    cmp r5, #2
; CHECK-NEXT:    bne .LBB1_13
; CHECK-NEXT:  @ %bb.12:
; CHECK-NEXT:    subs r3, r3, #2
; CHECK-NEXT:    adds r1, #32
; CHECK-NEXT:    adds r0, #16
; CHECK-NEXT:    b .LBB1_14
; CHECK-NEXT:  .LBB1_13: @ %while.body.prol.2
; CHECK-NEXT:    ldrh r2, [r0, #16]
; CHECK-NEXT:    ldrh r4, [r0, #18]
; CHECK-NEXT:    ldrh r5, [r0, #20]
; CHECK-NEXT:    ldrh r6, [r0, #22]
; CHECK-NEXT:    lsls r6, r6, #16
; CHECK-NEXT:    lsls r5, r5, #16
; CHECK-NEXT:    lsls r4, r4, #16
; CHECK-NEXT:    lsls r2, r2, #16
; CHECK-NEXT:    mov r7, r1
; CHECK-NEXT:    adds r7, #32
; CHECK-NEXT:    stm r7!, {r2, r4, r5, r6}
; CHECK-NEXT:    subs r3, r3, #3
; CHECK-NEXT:    adds r1, #48
; CHECK-NEXT:    adds r0, #24
; CHECK-NEXT:  .LBB1_14: @ %while.body.prol.loopexit
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:    cmp r2, #3
; CHECK-NEXT:    bhs .LBB1_5
; CHECK-NEXT:    b .LBB1_6
entry:
  %cmp.not18 = icmp ult i32 %blockSize, 4
  br i1 %cmp.not18, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %shr = lshr i32 %blockSize, 2
  %0 = add nsw i32 %shr, -1
  %xtraiter = and i32 %shr, 3
  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod.not, label %while.body.prol.loopexit, label %while.body.prol

while.body.prol:                                  ; preds = %while.body.preheader
  %arrayidx.i.prol = getelementptr inbounds i16, ptr %pSrc, i32 1
  %1 = load i16, ptr %arrayidx.i.prol, align 2
  %conv.i.prol = sext i16 %1 to i32
  %shl.i.prol = shl nsw i32 %conv.i.prol, 16
  %2 = load i16, ptr %pSrc, align 2
  %conv22.i.prol = zext i16 %2 to i32
  %add.ptr2.prol = getelementptr inbounds i16, ptr %pSrc, i32 4
  %add.ptr3.prol = getelementptr inbounds i16, ptr %pSrc, i32 2
  %arrayidx.i13.prol = getelementptr inbounds i16, ptr %pSrc, i32 3
  %3 = load i16, ptr %arrayidx.i13.prol, align 2
  %conv.i14.prol = sext i16 %3 to i32
  %shl.i15.prol = shl nsw i32 %conv.i14.prol, 16
  %4 = load i16, ptr %add.ptr3.prol, align 2
  %conv22.i16.prol = zext i16 %4 to i32
  %shl.prol = shl nuw i32 %conv22.i.prol, 16
  %shl5.prol = shl nuw i32 %conv22.i16.prol, 16
  %incdec.ptr.prol = getelementptr inbounds i32, ptr %pDst, i32 1
  store i32 %shl.prol, ptr %pDst, align 4
  %incdec.ptr7.prol = getelementptr inbounds i32, ptr %pDst, i32 2
  store i32 %shl.i.prol, ptr %incdec.ptr.prol, align 4
  %incdec.ptr8.prol = getelementptr inbounds i32, ptr %pDst, i32 3
  store i32 %shl5.prol, ptr %incdec.ptr7.prol, align 4
  %incdec.ptr9.prol = getelementptr inbounds i32, ptr %pDst, i32 4
  store i32 %shl.i15.prol, ptr %incdec.ptr8.prol, align 4
  %dec.prol = add nsw i32 %shr, -1
  %prol.iter.cmp.not = icmp eq i32 %xtraiter, 1
  br i1 %prol.iter.cmp.not, label %while.body.prol.loopexit, label %while.body.prol.1

while.body.prol.1:                                ; preds = %while.body.prol
  %arrayidx.i.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 5
  %5 = load i16, ptr %arrayidx.i.prol.1, align 2
  %conv.i.prol.1 = sext i16 %5 to i32
  %shl.i.prol.1 = shl nsw i32 %conv.i.prol.1, 16
  %6 = load i16, ptr %add.ptr2.prol, align 2
  %conv22.i.prol.1 = zext i16 %6 to i32
  %add.ptr2.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 8
  %add.ptr3.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 6
  %arrayidx.i13.prol.1 = getelementptr inbounds i16, ptr %pSrc, i32 7
  %7 = load i16, ptr %arrayidx.i13.prol.1, align 2
  %conv.i14.prol.1 = sext i16 %7 to i32
  %shl.i15.prol.1 = shl nsw i32 %conv.i14.prol.1, 16
  %8 = load i16, ptr %add.ptr3.prol.1, align 2
  %conv22.i16.prol.1 = zext i16 %8 to i32
  %shl.prol.1 = shl nuw i32 %conv22.i.prol.1, 16
  %shl5.prol.1 = shl nuw i32 %conv22.i16.prol.1, 16
  %incdec.ptr.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 5
  store i32 %shl.prol.1, ptr %incdec.ptr9.prol, align 4
  %incdec.ptr7.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 6
  store i32 %shl.i.prol.1, ptr %incdec.ptr.prol.1, align 4
  %incdec.ptr8.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 7
  store i32 %shl5.prol.1, ptr %incdec.ptr7.prol.1, align 4
  %incdec.ptr9.prol.1 = getelementptr inbounds i32, ptr %pDst, i32 8
  store i32 %shl.i15.prol.1, ptr %incdec.ptr8.prol.1, align 4
  %dec.prol.1 = add nsw i32 %shr, -2
  %prol.iter.cmp.1.not = icmp eq i32 %xtraiter, 2
  br i1 %prol.iter.cmp.1.not, label %while.body.prol.loopexit, label %while.body.prol.2

while.body.prol.2:                                ; preds = %while.body.prol.1
  %arrayidx.i.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 9
  %9 = load i16, ptr %arrayidx.i.prol.2, align 2
  %conv.i.prol.2 = sext i16 %9 to i32
  %shl.i.prol.2 = shl nsw i32 %conv.i.prol.2, 16
  %10 = load i16, ptr %add.ptr2.prol.1, align 2
  %conv22.i.prol.2 = zext i16 %10 to i32
  %add.ptr2.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 12
  %add.ptr3.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 10
  %arrayidx.i13.prol.2 = getelementptr inbounds i16, ptr %pSrc, i32 11
  %11 = load i16, ptr %arrayidx.i13.prol.2, align 2
  %conv.i14.prol.2 = sext i16 %11 to i32
  %shl.i15.prol.2 = shl nsw i32 %conv.i14.prol.2, 16
  %12 = load i16, ptr %add.ptr3.prol.2, align 2
  %conv22.i16.prol.2 = zext i16 %12 to i32
  %shl.prol.2 = shl nuw i32 %conv22.i.prol.2, 16
  %shl5.prol.2 = shl nuw i32 %conv22.i16.prol.2, 16
  %incdec.ptr.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 9
  store i32 %shl.prol.2, ptr %incdec.ptr9.prol.1, align 4
  %incdec.ptr7.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 10
  store i32 %shl.i.prol.2, ptr %incdec.ptr.prol.2, align 4
  %incdec.ptr8.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 11
  store i32 %shl5.prol.2, ptr %incdec.ptr7.prol.2, align 4
  %incdec.ptr9.prol.2 = getelementptr inbounds i32, ptr %pDst, i32 12
  store i32 %shl.i15.prol.2, ptr %incdec.ptr8.prol.2, align 4
  %dec.prol.2 = add nsw i32 %shr, -3
  br label %while.body.prol.loopexit

while.body.prol.loopexit:                         ; preds = %while.body.prol, %while.body.prol.1, %while.body.prol.2, %while.body.preheader
  %add.ptr2.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
  %incdec.ptr9.lcssa.unr = phi ptr [ undef, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
  %pDst.addr.021.unr = phi ptr [ %pDst, %while.body.preheader ], [ %incdec.ptr9.prol, %while.body.prol ], [ %incdec.ptr9.prol.1, %while.body.prol.1 ], [ %incdec.ptr9.prol.2, %while.body.prol.2 ]
  %blkCnt.020.unr = phi i32 [ %shr, %while.body.preheader ], [ %dec.prol, %while.body.prol ], [ %dec.prol.1, %while.body.prol.1 ], [ %dec.prol.2, %while.body.prol.2 ]
  %pIn.019.unr = phi ptr [ %pSrc, %while.body.preheader ], [ %add.ptr2.prol, %while.body.prol ], [ %add.ptr2.prol.1, %while.body.prol.1 ], [ %add.ptr2.prol.2, %while.body.prol.2 ]
  %13 = icmp ult i32 %0, 3
  br i1 %13, label %while.end, label %while.body

while.body:                                       ; preds = %while.body.prol.loopexit, %while.body
  %pDst.addr.021 = phi ptr [ %incdec.ptr9.3, %while.body ], [ %pDst.addr.021.unr, %while.body.prol.loopexit ]
  %blkCnt.020 = phi i32 [ %dec.3, %while.body ], [ %blkCnt.020.unr, %while.body.prol.loopexit ]
  %pIn.019 = phi ptr [ %add.ptr2.3, %while.body ], [ %pIn.019.unr, %while.body.prol.loopexit ]
  %arrayidx.i = getelementptr inbounds i16, ptr %pIn.019, i32 1
  %14 = load i16, ptr %arrayidx.i, align 2
  %conv.i = sext i16 %14 to i32
  %shl.i = shl nsw i32 %conv.i, 16
  %15 = load i16, ptr %pIn.019, align 2
  %conv22.i = zext i16 %15 to i32
  %add.ptr2 = getelementptr inbounds i16, ptr %pIn.019, i32 4
  %add.ptr3 = getelementptr inbounds i16, ptr %pIn.019, i32 2
  %arrayidx.i13 = getelementptr inbounds i16, ptr %pIn.019, i32 3
  %16 = load i16, ptr %arrayidx.i13, align 2
  %conv.i14 = sext i16 %16 to i32
  %shl.i15 = shl nsw i32 %conv.i14, 16
  %17 = load i16, ptr %add.ptr3, align 2
  %conv22.i16 = zext i16 %17 to i32
  %shl = shl nuw i32 %conv22.i, 16
  %shl5 = shl nuw i32 %conv22.i16, 16
  %incdec.ptr = getelementptr inbounds i32, ptr %pDst.addr.021, i32 1
  store i32 %shl, ptr %pDst.addr.021, align 4
  %incdec.ptr7 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 2
  store i32 %shl.i, ptr %incdec.ptr, align 4
  %incdec.ptr8 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 3
  store i32 %shl5, ptr %incdec.ptr7, align 4
  %incdec.ptr9 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 4
  store i32 %shl.i15, ptr %incdec.ptr8, align 4
  %arrayidx.i.1 = getelementptr inbounds i16, ptr %pIn.019, i32 5
  %18 = load i16, ptr %arrayidx.i.1, align 2
  %conv.i.1 = sext i16 %18 to i32
  %shl.i.1 = shl nsw i32 %conv.i.1, 16
  %19 = load i16, ptr %add.ptr2, align 2
  %conv22.i.1 = zext i16 %19 to i32
  %add.ptr2.1 = getelementptr inbounds i16, ptr %pIn.019, i32 8
  %add.ptr3.1 = getelementptr inbounds i16, ptr %pIn.019, i32 6
  %arrayidx.i13.1 = getelementptr inbounds i16, ptr %pIn.019, i32 7
  %20 = load i16, ptr %arrayidx.i13.1, align 2
  %conv.i14.1 = sext i16 %20 to i32
  %shl.i15.1 = shl nsw i32 %conv.i14.1, 16
  %21 = load i16, ptr %add.ptr3.1, align 2
  %conv22.i16.1 = zext i16 %21 to i32
  %shl.1 = shl nuw i32 %conv22.i.1, 16
  %shl5.1 = shl nuw i32 %conv22.i16.1, 16
  %incdec.ptr.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 5
  store i32 %shl.1, ptr %incdec.ptr9, align 4
  %incdec.ptr7.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 6
  store i32 %shl.i.1, ptr %incdec.ptr.1, align 4
  %incdec.ptr8.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 7
  store i32 %shl5.1, ptr %incdec.ptr7.1, align 4
  %incdec.ptr9.1 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 8
  store i32 %shl.i15.1, ptr %incdec.ptr8.1, align 4
  %arrayidx.i.2 = getelementptr inbounds i16, ptr %pIn.019, i32 9
  %22 = load i16, ptr %arrayidx.i.2, align 2
  %conv.i.2 = sext i16 %22 to i32
  %shl.i.2 = shl nsw i32 %conv.i.2, 16
  %23 = load i16, ptr %add.ptr2.1, align 2
  %conv22.i.2 = zext i16 %23 to i32
  %add.ptr2.2 = getelementptr inbounds i16, ptr %pIn.019, i32 12
  %add.ptr3.2 = getelementptr inbounds i16, ptr %pIn.019, i32 10
  %arrayidx.i13.2 = getelementptr inbounds i16, ptr %pIn.019, i32 11
  %24 = load i16, ptr %arrayidx.i13.2, align 2
  %conv.i14.2 = sext i16 %24 to i32
  %shl.i15.2 = shl nsw i32 %conv.i14.2, 16
  %25 = load i16, ptr %add.ptr3.2, align 2
  %conv22.i16.2 = zext i16 %25 to i32
  %shl.2 = shl nuw i32 %conv22.i.2, 16
  %shl5.2 = shl nuw i32 %conv22.i16.2, 16
  %incdec.ptr.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 9
  store i32 %shl.2, ptr %incdec.ptr9.1, align 4
  %incdec.ptr7.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 10
  store i32 %shl.i.2, ptr %incdec.ptr.2, align 4
  %incdec.ptr8.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 11
  store i32 %shl5.2, ptr %incdec.ptr7.2, align 4
  %incdec.ptr9.2 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 12
  store i32 %shl.i15.2, ptr %incdec.ptr8.2, align 4
  %arrayidx.i.3 = getelementptr inbounds i16, ptr %pIn.019, i32 13
  %26 = load i16, ptr %arrayidx.i.3, align 2
  %conv.i.3 = sext i16 %26 to i32
  %shl.i.3 = shl nsw i32 %conv.i.3, 16
  %27 = load i16, ptr %add.ptr2.2, align 2
  %conv22.i.3 = zext i16 %27 to i32
  %add.ptr2.3 = getelementptr inbounds i16, ptr %pIn.019, i32 16
  %add.ptr3.3 = getelementptr inbounds i16, ptr %pIn.019, i32 14
  %arrayidx.i13.3 = getelementptr inbounds i16, ptr %pIn.019, i32 15
  %28 = load i16, ptr %arrayidx.i13.3, align 2
  %conv.i14.3 = sext i16 %28 to i32
  %shl.i15.3 = shl nsw i32 %conv.i14.3, 16
  %29 = load i16, ptr %add.ptr3.3, align 2
  %conv22.i16.3 = zext i16 %29 to i32
  %shl.3 = shl nuw i32 %conv22.i.3, 16
  %shl5.3 = shl nuw i32 %conv22.i16.3, 16
  %incdec.ptr.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 13
  store i32 %shl.3, ptr %incdec.ptr9.2, align 4
  %incdec.ptr7.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 14
  store i32 %shl.i.3, ptr %incdec.ptr.3, align 4
  %incdec.ptr8.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 15
  store i32 %shl5.3, ptr %incdec.ptr7.3, align 4
  %incdec.ptr9.3 = getelementptr inbounds i32, ptr %pDst.addr.021, i32 16
  store i32 %shl.i15.3, ptr %incdec.ptr8.3, align 4
  %dec.3 = add nsw i32 %blkCnt.020, -4
  %cmp.not.3 = icmp eq i32 %dec.3, 0
  br i1 %cmp.not.3, label %while.end, label %while.body

while.end:                                        ; preds = %while.body.prol.loopexit, %while.body, %entry
  %pIn.0.lcssa = phi ptr [ %pSrc, %entry ], [ %add.ptr2.lcssa.unr, %while.body.prol.loopexit ], [ %add.ptr2.3, %while.body ]
  %pDst.addr.0.lcssa = phi ptr [ %pDst, %entry ], [ %incdec.ptr9.lcssa.unr, %while.body.prol.loopexit ], [ %incdec.ptr9.3, %while.body ]
  %rem = and i32 %blockSize, 3
  %cmp11.not23 = icmp eq i32 %rem, 0
  br i1 %cmp11.not23, label %while.end17, label %while.body12

while.body12:                                     ; preds = %while.end
  %30 = load i16, ptr %pIn.0.lcssa, align 2
  %conv = sext i16 %30 to i32
  %shl14 = shl nsw i32 %conv, 16
  store i32 %shl14, ptr %pDst.addr.0.lcssa, align 4
  %cmp11.not = icmp eq i32 %rem, 1
  br i1 %cmp11.not, label %while.end17, label %while.body12.1

while.body12.1:                                   ; preds = %while.body12
  %incdec.ptr15 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 1
  %incdec.ptr13 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 1
  %31 = load i16, ptr %incdec.ptr13, align 2
  %conv.1 = sext i16 %31 to i32
  %shl14.1 = shl nsw i32 %conv.1, 16
  store i32 %shl14.1, ptr %incdec.ptr15, align 4
  %cmp11.not.1 = icmp eq i32 %rem, 2
  br i1 %cmp11.not.1, label %while.end17, label %while.body12.2

while.body12.2:                                   ; preds = %while.body12.1
  %incdec.ptr15.1 = getelementptr inbounds i32, ptr %pDst.addr.0.lcssa, i32 2
  %incdec.ptr13.1 = getelementptr inbounds i16, ptr %pIn.0.lcssa, i32 2
  %32 = load i16, ptr %incdec.ptr13.1, align 2
  %conv.2 = sext i16 %32 to i32
  %shl14.2 = shl nsw i32 %conv.2, 16
  store i32 %shl14.2, ptr %incdec.ptr15.1, align 4
  br label %while.end17

while.end17:                                      ; preds = %while.body12, %while.body12.1, %while.body12.2, %while.end
  ret void
}
